Text Generation
Transformers
PyTorch
English
custom-architecture
rope
rmsnorm
swiglu
flash-attention
16k-context
Eval Results (legacy)
Instructions to use Austin207/Map-NEO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Austin207/Map-NEO with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="Austin207/Map-NEO")

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("Austin207/Map-NEO", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Austin207/Map-NEO with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Austin207/Map-NEO"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/Austin207/Map-NEO
- SGLang
How to use Austin207/Map-NEO with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Austin207/Map-NEO" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Austin207/Map-NEO" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use Austin207/Map-NEO with Docker Model Runner:
docker model run hf.co/Austin207/Map-NEO
# MAP-NEO Mini: Data Preprocessing Pipeline
# Downloads a RefinedWeb sample (falling back to the Matrix dataset), keeps
# high-quality English documents, then tokenizes and packs fixed-length sequences
| import json | |
| import os | |
| import itertools | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| import langdetect | |
| from tqdm import tqdm | |
| import argparse | |
class DataPreprocessor:
    """Download, quality-filter, tokenize and pack text for MAP-NEO training.

    All artifacts are written below ``output_dir``:

    * ``shards/``    -- raw JSONL samples, one ``{"text": ...}`` object per line
    * ``processed/`` -- quality-filtered JSONL
    * ``tokens/``    -- packed sequences, one line of space-separated token ids
    * ``tokenizer/`` -- copy of the tokenizer used for packing
    """

    # Sub-directories created under ``output_dir`` at construction time.
    _SUBDIRS = ("shards", "processed", "tokens")
    # Lower-cased substrings counted as spam signals by the quality filter.
    _SPAM_INDICATORS = ("click here", "buy now", "free download", "###", "***")

    def __init__(self, output_dir="data", seq_length=1024):
        """Store the configuration and create the output directory tree.

        Args:
            output_dir: Root directory for all generated files.
            seq_length: Number of tokens per packed sequence; must be >= 1.

        Raises:
            ValueError: If ``seq_length`` is smaller than 1.
        """
        # A non-positive length would make the packing loop never terminate.
        if seq_length < 1:
            raise ValueError(f"seq_length must be a positive integer, got {seq_length}")
        self.output_dir = Path(output_dir)
        self.seq_length = seq_length
        self.setup_directories()

    def setup_directories(self):
        """Create the output sub-directories if they do not exist yet."""
        for name in self._SUBDIRS:
            (self.output_dir / name).mkdir(parents=True, exist_ok=True)

    def download_refinedweb_sample(self, num_docs=100000):
        """Stream RefinedWeb and save up to ``num_docs`` usable documents.

        A document is kept when its stripped ``content`` field is longer than
        100 characters. Streaming stops once ``num_docs`` documents have been
        kept or the dataset is exhausted, so short documents no longer reduce
        the size of the sample.

        If anything goes wrong, the Matrix dataset is used instead via
        ``download_matrix_sample_fallback``.

        Args:
            num_docs: Number of documents to save.

        Returns:
            Path of the JSONL file that was written.
        """
        print(f"Downloading {num_docs} documents from RefinedWeb...")
        raw_path = self.output_dir / "shards" / "refinedweb_sample_raw.jsonl"
        target = max(num_docs, 0)

        try:
            ds = load_dataset("tiiuae/falcon-refinedweb", split="train", streaming=True)
            downloaded = 0
            with open(raw_path, "w", encoding="utf-8") as f, \
                    tqdm(total=target) as progress:
                if target > 0:
                    for row in ds:
                        # RefinedWeb has 'content' field instead of 'text';
                        # `or ""` also covers an explicit None value.
                        text = (row.get("content") or "").strip()
                        if len(text) <= 100:  # Quality filter
                            continue
                        f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
                        downloaded += 1
                        progress.update(1)
                        if downloaded >= target:
                            break
            print(f"Raw RefinedWeb data saved to: {raw_path}")
            print(f"Downloaded {downloaded} high-quality documents")
            return raw_path
        except Exception as e:
            # Deliberate best-effort: any failure switches to the fallback
            # dataset instead of aborting the pipeline.
            print(f"Error downloading RefinedWeb: {e}")
            print("Falling back to Matrix dataset...")
            return self.download_matrix_sample_fallback(num_docs)

    def download_matrix_sample_fallback(self, num_docs=10000):
        """Save the non-empty documents among the first ``num_docs`` Matrix rows.

        The text is read from the ``text`` field, or ``content`` when ``text``
        is missing or empty. Rows whose text is blank are skipped, so the
        output may contain fewer than ``num_docs`` documents.

        Args:
            num_docs: Number of dataset rows to scan.

        Returns:
            Path of the JSONL file that was written.
        """
        print(f"Downloading {num_docs} documents from Matrix dataset...")
        raw_path = self.output_dir / "shards" / "matrix_sample_raw.jsonl"
        ds = load_dataset("m-a-p/Matrix", split="train", streaming=True)
        with open(raw_path, "w", encoding="utf-8") as f:
            for row in tqdm(itertools.islice(ds, num_docs), total=num_docs):
                text = row.get("text") or row.get("content") or ""
                if text.strip():
                    f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")
        print(f"Raw data saved to: {raw_path}")
        return raw_path

    def filter_refinedweb_quality(self, input_path):
        """Copy the documents that pass ``is_high_quality_web_text``.

        Lines that are not valid JSON objects with a ``text`` key are skipped.
        Errors while writing the output are not swallowed.

        Args:
            input_path: JSONL file with one ``{"text": ...}`` object per line.

        Returns:
            Path of the filtered JSONL file.
        """
        print("Applying enhanced quality filtering for RefinedWeb...")
        input_path = Path(input_path)
        output_path = self.output_dir / "processed" / "refinedweb_filtered.jsonl"
        filtered_count = 0
        total_count = 0
        with open(input_path, "r", encoding="utf-8") as infile, \
                open(output_path, "w", encoding="utf-8") as outfile:
            for line in tqdm(infile, desc="Quality filtering"):
                total_count += 1
                # Only parsing is best-effort; json.JSONDecodeError is a
                # ValueError subclass.
                try:
                    obj = json.loads(line)
                    text = obj["text"]
                except (ValueError, KeyError, TypeError):
                    continue
                # Enhanced quality filters for web data
                if self.is_high_quality_web_text(text):
                    outfile.write(json.dumps(obj, ensure_ascii=False) + "\n")
                    filtered_count += 1
        print(f"Filtered {filtered_count}/{total_count} documents for quality")
        print(f"Filtered data saved to: {output_path}")
        return output_path

    def is_high_quality_web_text(self, text):
        """Return True when ``text`` passes every quality heuristic.

        A text is accepted when it is a string of 200 to 10000 characters,
        has at least 50 whitespace-separated words, matches at most two spam
        indicators, contains at least two periods, and is detected as English.

        Args:
            text: Candidate document text.

        Returns:
            bool: True if the text should be kept.
        """
        if not isinstance(text, str):
            return False
        # Length checks
        if len(text) < 200 or len(text) > 10000:
            return False
        # Content quality checks
        if len(text.split()) < 50:  # Too short
            return False
        # Check for spam/low-quality indicators
        text_lower = text.lower()
        spam_count = sum(1 for indicator in self._SPAM_INDICATORS if indicator in text_lower)
        if spam_count > 2:
            return False
        # Check for reasonable sentence structure
        if len(text.split(".")) < 3:  # Too few sentences
            return False
        # Language detection runs last because it is the most expensive check;
        # the verdict is the same as running it first. A detection failure
        # rejects the text, but KeyboardInterrupt is no longer swallowed.
        try:
            return langdetect.detect(text) == "en"
        except Exception:
            return False

    def tokenize_and_pack(self, input_path, tokenizer_name="gpt2"):
        """Tokenize documents and pack them into fixed-length sequences.

        Every document is followed by the tokenizer's EOS token, and the
        resulting token stream is cut into chunks of ``self.seq_length``
        tokens. Each chunk is written as one line of space-separated integers.
        Tokens left over at the end that do not fill a whole sequence are
        discarded. The tokenizer is saved next to the data for later use.

        Args:
            input_path: JSONL file with one ``{"text": ...}`` object per line.
            tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.

        Returns:
            Tuple ``(output_path, tokenizer_path)``.

        Raises:
            ValueError: If ``self.seq_length`` is smaller than 1 or the
                tokenizer defines no EOS token.
        """
        seq_length = self.seq_length
        if seq_length < 1:
            raise ValueError(f"seq_length must be a positive integer, got {seq_length}")
        print(f"Tokenizing with {tokenizer_name} and packing to {seq_length} tokens...")
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        eos_id = tokenizer.eos_token_id
        if eos_id is None:
            # Without an EOS id the packed file would contain the text "None".
            raise ValueError(
                f"Tokenizer {tokenizer_name!r} has no EOS token; "
                "cannot separate documents"
            )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        input_path = Path(input_path)
        output_path = self.output_dir / "tokens" / f"packed_{seq_length}.txt"
        buffer = []
        sequences_written = 0
        total_tokens = 0
        skipped = 0
        with open(input_path, "r", encoding="utf-8") as infile, \
                open(output_path, "w", encoding="utf-8") as outfile:
            for line in tqdm(infile, desc="Processing documents"):
                # Only parsing/tokenizing is best-effort; write errors must
                # surface instead of silently dropping data.
                try:
                    text = json.loads(line)["text"]
                    tokens = tokenizer.encode(text, add_special_tokens=False)
                except (ValueError, KeyError, TypeError):
                    skipped += 1
                    continue
                # Add to buffer with EOS token
                buffer.extend(tokens)
                buffer.append(eos_id)
                total_tokens += len(tokens) + 1
                # Pack complete sequences, then trim the buffer once instead
                # of re-slicing it after every sequence.
                packed = len(buffer) - len(buffer) % seq_length
                for start in range(0, packed, seq_length):
                    sequence = buffer[start:start + seq_length]
                    # Write sequence as space-separated integers
                    outfile.write(" ".join(map(str, sequence)) + "\n")
                    sequences_written += 1
                del buffer[:packed]
        print(f"Created {sequences_written} sequences of {seq_length} tokens each")
        print(f"Total tokens processed: {total_tokens:,}")
        if skipped:
            print(f"Skipped {skipped} malformed documents")
        print(f"Packed data saved to: {output_path}")
        # Save tokenizer for later use
        tokenizer_path = self.output_dir / "tokenizer"
        tokenizer.save_pretrained(tokenizer_path)
        print(f"Tokenizer saved to: {tokenizer_path}")
        return output_path, tokenizer_path
def _positive_int(value):
    """argparse ``type=`` helper: parse ``value`` as an integer >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main(argv=None):
    """Run the download -> filter -> tokenize/pack pipeline from the command line.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Preprocess MAP-NEO training data")
    # Non-positive values are rejected up front: a sequence length of zero
    # cannot be packed and a zero-document download yields nothing to process.
    parser.add_argument("--num_docs", type=_positive_int, default=10000,
                        help="Number of documents to download")
    parser.add_argument("--seq_length", type=_positive_int, default=1024,
                        help="Sequence length for packing")
    parser.add_argument("--tokenizer", type=str, default="gpt2",
                        help="Tokenizer to use")
    parser.add_argument("--output_dir", type=str, default="data",
                        help="Output directory")
    args = parser.parse_args(argv)

    # Initialize preprocessor
    preprocessor = DataPreprocessor(args.output_dir, args.seq_length)

    # Run pipeline
    print("Starting MAP-NEO data preprocessing pipeline...")

    # Step 1: Download sample
    raw_path = preprocessor.download_refinedweb_sample(args.num_docs)

    # Step 2: Quality filtering (includes the English-language check)
    filtered_path = preprocessor.filter_refinedweb_quality(raw_path)

    # Step 3: Tokenize and pack
    packed_path, tokenizer_path = preprocessor.tokenize_and_pack(
        filtered_path, args.tokenizer
    )

    print("\n" + "="*50)
    print("Data preprocessing complete!")
    print(f"Packed sequences: {packed_path}")
    print(f"Tokenizer: {tokenizer_path}")
    print("="*50)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()