merge two files base_loss and base_eval into a single file, it's nicer this way, and unify the huggingface code associated with both

This commit is contained in:
Andrej Karpathy
2026-02-01 02:36:43 +00:00
parent 1ddaad1c1c
commit 0307997f9b
5 changed files with 219 additions and 261 deletions
+1 -2
View File
@@ -42,8 +42,7 @@ python -m scripts.base_train \
--sample-every=100 \
--num-iterations=5000 \
--run=$WANDB_RUN
python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384
python -m scripts.base_eval --max-per-task=16
python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
# SFT (~10 minutes on my MacBook Pro M3 Max)
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+1 -3
View File
@@ -74,9 +74,7 @@ NPROC_PER_NODE=8
# d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN
# evaluate the model on a larger chunk of train/val data and draw some samples
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
# evaluate the model on CORE tasks
# evaluate the model: CORE metric, BPB on train/val, and draw samples
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
# -----------------------------------------------------------------------------