merge two files base_loss and base_eval into a single file, it's nicer this way, and unify the huggingface code associated with both

This commit is contained in:
Andrej Karpathy
2026-02-01 02:36:43 +00:00
parent 1ddaad1c1c
commit 0307997f9b
5 changed files with 219 additions and 261 deletions
+1 -2
View File
@@ -142,8 +142,7 @@ I've published a number of guides that might contain helpful information:
│ ├── scaling_laws.sh # Scaling laws experiments │ ├── scaling_laws.sh # Scaling laws experiments
│ └── speedrun.sh # Train the ~$100 nanochat d20 │ └── speedrun.sh # Train the ~$100 nanochat d20
├── scripts ├── scripts
│ ├── base_eval.py # Base model: calculate CORE score │ ├── base_eval.py # Base model: CORE score, bits per byte, samples
│ ├── base_loss.py # Base model: calculate bits per byte, sample
│ ├── base_train.py # Base model: train │ ├── base_train.py # Base model: train
│ ├── chat_cli.py # Chat model: talk to over CLI │ ├── chat_cli.py # Chat model: talk to over CLI
│ ├── chat_eval.py # Chat model: eval tasks │ ├── chat_eval.py # Chat model: eval tasks
+1 -2
View File
@@ -42,8 +42,7 @@ python -m scripts.base_train \
--sample-every=100 \ --sample-every=100 \
--num-iterations=5000 \ --num-iterations=5000 \
--run=$WANDB_RUN --run=$WANDB_RUN
python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384 python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
python -m scripts.base_eval --max-per-task=16
# SFT (~10 minutes on my MacBook Pro M3 Max) # SFT (~10 minutes on my MacBook Pro M3 Max)
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+1 -3
View File
@@ -74,9 +74,7 @@ NPROC_PER_NODE=8
# d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN
# evaluate the model on a larger chunk of train/val data and draw some samples # evaluate the model: CORE metric, BPB on train/val, and draw samples
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
# evaluate the model on CORE tasks
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
+216 -99
View File
@@ -1,13 +1,23 @@
""" """
Evaluate the CORE metric for a given model. Unified evaluation script for base models.
Run on a single GPU: Supports three evaluation modes (comma-separated):
python -m scripts.base_eval --eval core : CORE metric (accuracy on ICL tasks)
--eval bpb : Bits per byte on train/val splits
--eval sample : Generate samples from the model
Run with torchrun on e.g. 8 GPUs: Default is all three: --eval core,bpb,sample
torchrun --nproc_per_node=8 -m scripts.base_eval
The script will print the CORE metric to the console. Examples:
# Evaluate a HuggingFace model (e.g. GPT-2 124M) using 8 GPUs
torchrun --nproc_per_node=8 -m scripts.base_eval --hf-path openai-community/gpt2
# Evaluate a nanochat model (e.g. d24) using 8 GPUs
torchrun --nproc_per_node=8 -m scripts.base_eval --model-tag d24 --device-batch-size=16
# Quick/approximate evaluation using a single GPU
python -m scripts.base_eval --model-tag d24 --device-batch-size=16 --max-per-task=100 --split-tokens=524288
""" """
import os import os
import csv import csv
@@ -18,24 +28,74 @@ import shutil
import random import random
import zipfile import zipfile
import tempfile import tempfile
import argparse
from contextlib import nullcontext from contextlib import nullcontext
import torch import torch
from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
from nanochat.tokenizer import HuggingFaceTokenizer from nanochat.tokenizer import HuggingFaceTokenizer, get_token_bytes
from nanochat.checkpoint_manager import load_model from nanochat.checkpoint_manager import load_model
from nanochat.core_eval import evaluate_task from nanochat.core_eval import evaluate_task
from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# nanochat specific function dealing with I/O etc. # HuggingFace loading utilities
class ModelWrapper:
"""Lightweight wrapper to give HuggingFace models a nanochat-compatible interface."""
def __init__(self, model, max_seq_len=None):
self.model = model
self.max_seq_len = max_seq_len
def __call__(self, input_ids, targets=None, loss_reduction='mean'):
logits = self.model(input_ids).logits
if targets is None:
return logits
loss = torch.nn.functional.cross_entropy(
logits.view(-1, logits.size(-1)),
targets.view(-1),
ignore_index=-1,
reduction=loss_reduction
)
return loss
def get_device(self):
return next(self.model.parameters()).device
def load_hf_model(hf_path: str, device):
"""Load a HuggingFace model and tokenizer."""
print0(f"Loading HuggingFace model from: {hf_path}")
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(hf_path)
model.to(device)
model.eval()
max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
model = ModelWrapper(model, max_seq_len=max_seq_len)
tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
return model, tokenizer
def get_hf_token_bytes(tokenizer, device="cpu"):
"""Compute token_bytes tensor for a HuggingFace tokenizer."""
vocab_size = tokenizer.tokenizer.get_vocab_size()
token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device)
for token_id in range(vocab_size):
token_str = tokenizer.tokenizer.decode([token_id])
token_bytes[token_id] = len(token_str.encode('utf-8'))
return token_bytes
# -----------------------------------------------------------------------------
# CORE evaluation
# ~162MB of data needed to evaluate the CORE metric
EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
def place_eval_bundle(file_path): def place_eval_bundle(file_path):
# here file_path is the path to the eval_bundle.zip file """Unzip eval_bundle.zip and place it in the base directory."""
# we need to unzip it and place it in the base directory
base_dir = get_base_dir() base_dir = get_base_dir()
eval_bundle_dir = os.path.join(base_dir, "eval_bundle") eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
@@ -45,25 +105,27 @@ def place_eval_bundle(file_path):
shutil.move(extracted_bundle_dir, eval_bundle_dir) shutil.move(extracted_bundle_dir, eval_bundle_dir)
print0(f"Placed eval_bundle directory at {eval_bundle_dir}") print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
def evaluate_model(model, tokenizer, device, max_per_task=-1):
def evaluate_core(model, tokenizer, device, max_per_task=-1):
""" """
Evaluate a base model on the CORE benchmark. Evaluate a base model on the CORE benchmark.
- max_per_task: crop the data to this many examples per task for testing (-1 = disable) Returns dict with results, centered_results, and core_metric.
""" """
# Load config and task metadata
base_dir = get_base_dir() base_dir = get_base_dir()
eval_bundle_dir = os.path.join(base_dir, "eval_bundle") eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
# Download the eval bundle to disk (and unzip if needed) # Download the eval bundle if needed
if not os.path.exists(eval_bundle_dir): if not os.path.exists(eval_bundle_dir):
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle) download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
config_path = os.path.join(eval_bundle_dir, "core.yaml") config_path = os.path.join(eval_bundle_dir, "core.yaml")
data_base_path = os.path.join(eval_bundle_dir, "eval_data") data_base_path = os.path.join(eval_bundle_dir, "eval_data")
eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
with open(config_path, 'r', encoding='utf-8') as f: with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
tasks = config['icl_tasks'] tasks = config['icl_tasks']
# Load random baseline values from eval metadata # Load random baseline values
random_baselines = {} random_baselines = {}
with open(eval_meta_data, 'r', encoding='utf-8') as f: with open(eval_meta_data, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f) reader = csv.DictReader(f)
@@ -86,27 +148,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
} }
print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='') print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
# Load data for this task
data_path = os.path.join(data_base_path, task_meta['dataset_uri']) data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
with open(data_path, 'r', encoding='utf-8') as f: with open(data_path, 'r', encoding='utf-8') as f:
data = [json.loads(line.strip()) for line in f] data = [json.loads(line.strip()) for line in f]
# shuffle the data because in many cases it appears ordered but we want # Shuffle for consistent subsampling when using max_per_task
# the ability to only run a subset of the data for debugging purposes etc.
shuffle_rng = random.Random(1337) shuffle_rng = random.Random(1337)
shuffle_rng.shuffle(data) shuffle_rng.shuffle(data)
if max_per_task > 0: if max_per_task > 0:
data = data[:max_per_task] data = data[:max_per_task]
# run the evaluation for this task
accuracy = evaluate_task(model, tokenizer, data, device, task_meta) accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
results[label] = accuracy results[label] = accuracy
random_baseline = random_baselines[label] random_baseline = random_baselines[label]
centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline) centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
centered_results[label] = centered_result centered_results[label] = centered_result
end_time = time.time() elapsed = time.time() - start_time
print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s") print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {elapsed:.2f}s")
core_metric = sum(centered_results.values()) / len(centered_results) core_metric = sum(centered_results.values()) / len(centered_results)
out = { out = {
@@ -117,98 +175,157 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
return out return out
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# HuggingFace loading utilities and light wrappers for a model # Main
class ModelWrapper:
"""Lightweight wrapper for a HuggingFace model"""
def __init__(self, model, max_seq_len=None):
self.model = model
self.max_seq_len = max_seq_len
def __call__(self, input_ids):
outputs = self.model(input_ids)
logits = outputs.logits
return logits
def load_hf_model(hf_path: str, device):
print0(f"Loading model from: {hf_path}")
# Load the model
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(hf_path)
model.to(device)
model.eval()
max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
model = ModelWrapper(model, max_seq_len=max_seq_len)
# Load the tokenizer
tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
return model, tokenizer
# -----------------------------------------------------------------------------
def main(): def main():
import argparse parser = argparse.ArgumentParser(description="Base model evaluation")
parser = argparse.ArgumentParser() parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)')
parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate') parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2)')
parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)') parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory')
parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name') parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)')
parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name') parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)')
parser.add_argument('--device-batch-size', type=int, default=32, help='Per-device batch size for BPB evaluation')
parser.add_argument('--split-tokens', type=int, default=40*524288, help='Number of tokens to evaluate per split for BPB')
parser.add_argument('--device-type', type=str, default='', help='cuda|cpu|mps (empty = autodetect)')
args = parser.parse_args() args = parser.parse_args()
# distributed / precision setup # Parse evaluation modes
device_type = autodetect_device_type() eval_modes = set(mode.strip() for mode in args.eval.split(','))
valid_modes = {'core', 'bpb', 'sample'}
invalid = eval_modes - valid_modes
if invalid:
parser.error(f"Invalid eval modes: {invalid}. Valid: {valid_modes}")
# Distributed / precision setup
device_type = autodetect_device_type() if args.device_type == '' else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
# Load model and tokenizer from command line or from file system # Load model and tokenizer
if args.hf_path is not None: is_hf_model = args.hf_path is not None
# atm assume that if a path is given, it's a huggingface model path if is_hf_model:
hf_path = args.hf_path model, tokenizer = load_hf_model(args.hf_path, device)
print0(f"Loading huggingface model from: {hf_path}") sequence_len = model.max_seq_len or 1024
model, tokenizer = load_hf_model(hf_path, device) token_bytes = get_hf_token_bytes(tokenizer, device=device)
model_name = hf_path # just for logging model_name = args.hf_path
model_slug = hf_path.replace("/", "-") # for the output csv file model_slug = args.hf_path.replace("/", "-")
else: else:
# load a local model from the file system
model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step) model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step)
model_name = f"base_model (step {meta['step']})" # just for logging sequence_len = meta["model_config"]["sequence_len"]
model_slug = f"base_model_{meta['step']:06d}" # for the output csv file token_bytes = get_token_bytes(device=device)
model_name = f"base_model (step {meta['step']})"
model_slug = f"base_model_{meta['step']:06d}"
# Evaluate the model print0(f"Evaluating model: {model_name}")
with autocast_ctx: print0(f"Eval modes: {', '.join(sorted(eval_modes))}")
out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
# Write out the results to a csv file # Results to log
core_metric = None core_results = None
centered_results = {} bpb_results = {}
if ddp_rank == 0: samples = []
base_dir = get_base_dir() unconditioned_samples = []
output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) # --- CORE evaluation ---
results = out["results"] if 'core' in eval_modes:
centered_results = out["centered_results"] print0("\n" + "="*80)
core_metric = out["core_metric"] print0("CORE Evaluation")
with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
for label in results:
f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
# Print the content of the csv file to console too
print0("="*80) print0("="*80)
print0(f"Model: {model_name}") with autocast_ctx:
print0("="*80) core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task)
with open(output_csv_path, 'r', encoding='utf-8') as f:
print0(f.read())
# Log to report # Write CSV output
if ddp_rank == 0:
base_dir = get_base_dir()
output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
for label in core_results["results"]:
acc = core_results["results"][label]
centered = core_results["centered_results"][label]
f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n")
f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n")
print0(f"\nResults written to: {output_csv_path}")
print0(f"CORE metric: {core_results['core_metric']:.4f}")
# --- BPB evaluation ---
if 'bpb' in eval_modes:
print0("\n" + "="*80)
print0("BPB Evaluation")
print0("="*80)
tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
if args.split_tokens % tokens_per_step != 0:
# Adjust to nearest multiple
args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step
print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})")
steps = args.split_tokens // tokens_per_step
for split_name in ["train", "val"]:
loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
with autocast_ctx:
bpb = evaluate_bpb(model, loader, steps, token_bytes)
bpb_results[split_name] = bpb
print0(f"{split_name} bpb: {bpb:.6f}")
# --- Sampling ---
if 'sample' in eval_modes and not is_hf_model:
print0("\n" + "="*80)
print0("Model Samples")
print0("="*80)
if ddp_rank == 0:
prompts = [
"The capital of France is",
"The chemical symbol of gold is",
"If yesterday was Friday, then tomorrow will be",
"The opposite of hot is",
"The planets of the solar system are:",
"My favorite color is",
"If 5*x + 3 = 13, then x is",
]
engine = Engine(model, tokenizer)
print0("\nConditioned samples:")
for prompt in prompts:
tokens = tokenizer(prompt, prepend="<|bos|>")
with autocast_ctx:
sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
sample_str = tokenizer.decode(sample[0])
print0("-" * 80)
print0(sample_str)
samples.append(sample_str)
print0("\nUnconditioned samples:")
tokens = tokenizer("", prepend="<|bos|>")
with autocast_ctx:
uncond, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0)
for sample in uncond:
sample_str = tokenizer.decode(sample)
print0("-" * 80)
print0(sample_str)
unconditioned_samples.append(sample_str)
elif 'sample' in eval_modes and is_hf_model:
print0("\nSkipping sampling for HuggingFace models (not supported)")
# --- Log to report ---
from nanochat.report import get_report from nanochat.report import get_report
get_report().log(section="Base model evaluation", data=[ report_data = [{"model": model_name}]
{
"Model": model_name, if core_results:
"CORE metric": core_metric, report_data[0]["CORE metric"] = core_results["core_metric"]
}, report_data.append(core_results["centered_results"])
centered_results, # the full table
]) if bpb_results:
report_data[0]["train bpb"] = bpb_results.get("train")
report_data[0]["val bpb"] = bpb_results.get("val")
if samples:
report_data.append({f"sample {i}": s for i, s in enumerate(samples)})
if unconditioned_samples:
report_data.append({f"unconditioned {i}": s for i, s in enumerate(unconditioned_samples)})
get_report().log(section="Base model evaluation", data=report_data)
compute_cleanup() compute_cleanup()
if __name__ == "__main__": if __name__ == "__main__":
main() main()
-155
View File
@@ -1,155 +0,0 @@
"""
Loads a checkpoint, and:
- Evaluates the loss on a larger chunk of train/val splits
- Samples from the model
Example run as:
torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
To evaluate a HuggingFace model:
python -m scripts.base_loss --hf-path openai-community/gpt2
"""
import argparse
from contextlib import nullcontext
import torch
from nanochat.checkpoint_manager import load_model
from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit
from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine
# -----------------------------------------------------------------------------
# HuggingFace loading utilities, making the APIs match up to those of nanochat
class ModelWrapper:
"""Lightweight wrapper for a HuggingFace model"""
def __init__(self, model, max_seq_len=None):
self.model = model
self.max_seq_len = max_seq_len
def __call__(self, input_ids, targets=None, loss_reduction='mean'):
logits = self.model(input_ids).logits
if targets is None:
return logits
else:
loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
return loss
def get_device(self):
return next(self.model.parameters()).device
def load_hf_model(hf_path: str, device):
print0(f"Loading model from: {hf_path}")
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(hf_path)
model.to(device)
model.eval()
max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
model = ModelWrapper(model, max_seq_len=max_seq_len)
tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
return model, tokenizer
def get_hf_token_bytes(tokenizer, device="cpu"):
"""Compute token_bytes tensor for a HuggingFace tokenizer."""
vocab_size = tokenizer.tokenizer.get_vocab_size()
token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device)
for token_id in range(vocab_size):
token_str = tokenizer.tokenizer.decode([token_id])
token_bytes[token_id] = len(token_str.encode('utf-8')) # Count UTF-8 bytes
return token_bytes
# CLI arguments
parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model")
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
parser.add_argument("--split-tokens", type=int, default=40*524288, help="number of tokens to evaluate per split")
parser.add_argument("--model-tag", type=str, default=None, help="model tag for checkpoint directory")
parser.add_argument("--model-step", type=int, default=None, help="model step to load")
parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)")
parser.add_argument("--hf-path", type=str, default=None, help="HuggingFace model path (e.g. openai-community/gpt2)")
args = parser.parse_args()
# Load the base model and the tokenizer
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
print0(f"Device: {device} | DDP rank: {ddp_rank} | DDP local rank: {ddp_local_rank} | DDP world size: {ddp_world_size}")
if args.hf_path is not None:
# Load HuggingFace model
model, tokenizer = load_hf_model(args.hf_path, device)
sequence_len = model.max_seq_len if model.max_seq_len else 1024
token_bytes = get_hf_token_bytes(tokenizer, device=device)
model_name = args.hf_path
else:
# Load local nanochat model
model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step)
sequence_len = meta["model_config"]["sequence_len"]
token_bytes = get_token_bytes(device=device)
model_name = f"base_model (step {meta['step']})"
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
print0(f"Evaluating model: {model_name}")
# Evaluate the loss on each split
tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step"
steps = args.split_tokens // tokens_per_step
bpb_results = {}
for split_name in ["train", "val"]:
loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
with autocast_ctx:
bpb = evaluate_bpb(model, loader, steps, token_bytes)
print0(f"{split_name} bpb: {bpb:.4f}")
bpb_results[split_name] = bpb
print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}")
# Master process also samples from the model for some basic knowledge-eliciting prompts (only for nanochat models)
samples = []
if ddp_rank == 0 and args.hf_path is None:
prompts = [
"The capital of France is",
"The chemical symbol of gold is",
"If yesterday was Friday, then tomorrow will be",
"The opposite of hot is",
"The planets of the solar system are:",
"My favorite color is",
"If 5*x + 3 = 13, then x is",
]
engine = Engine(model, tokenizer)
for prompt in prompts:
tokens = tokenizer(prompt, prepend="<|bos|>")
with autocast_ctx:
sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
sample_str = tokenizer.decode(sample[0])
print0("-" * 80)
print0(sample_str)
samples.append(sample_str)
# Draw some unconditioned samples from the model (only for nanochat models)
unconditioned_samples = []
if ddp_rank == 0 and args.hf_path is None:
engine = Engine(model, tokenizer)
tokens = tokenizer("", prepend="<|bos|>")
with autocast_ctx:
samples, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0)
for sample in samples:
sample_str = tokenizer.decode(sample)
print0("-" * 80)
print0(sample_str)
unconditioned_samples.append(sample_str)
# Log to report
from nanochat.report import get_report
get_report().log(section="Base model loss", data=[
{
"model": model_name,
"train bpb": bpb_results["train"],
"val bpb": bpb_results["val"],
},
{f"sample {i}": sample for i, sample in enumerate(samples)},
{f"unconditioned sample {i}": sample for i, sample in enumerate(unconditioned_samples)},
])
# Cleanup
compute_cleanup()