Fix the buggy midtrain script and update all kwargs to be idiomatic: argparse flags use dashes, while variables use underscores. The underscores in the flag names were just a remnant of the previous Configurator object. This is the right way.

This commit is contained in:
Andrej Karpathy
2026-01-13 22:45:27 +00:00
parent 3b50b77ed3
commit 7312ec9898
11 changed files with 144 additions and 139 deletions
+8 -8
View File
@@ -20,7 +20,7 @@ if [ -z "$SKIP_SETUP" ]; then
# Tokenizer, download 1000 shards for pretraining
# (probably this can be reduced but it's tricky to determine the exact right number, TODO).
python -m nanochat.dataset -n 1000
python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768
python -m scripts.tok_train --max-chars=2000000000 --vocab-size=32768
else
source .venv/bin/activate
fi
@@ -58,16 +58,16 @@ for d in "${DEPTHS[@]}"; do
START_TIME=$(date +%s)
# Train the model with natural horizon (target_param_data_ratio default)
# No --target_flops, let it use the default ratio from base_train
# No --target-flops, let it use the default ratio from base_train
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \
--depth=$d \
--target_param_data_ratio=8 \
--target-param-data-ratio=8 \
--run="${WANDB_RUN}_d${d}" \
--model_tag="${TAG}" \
--core_metric_every=999999 \
--core_metric_max_per_task=-1 \
--sample_every=-1 \
--save_every=-1 \
--model-tag="${TAG}" \
--core-metric-every=999999 \
--core-metric-max-per-task=-1 \
--sample-every=-1 \
--save-every=-1 \
2>&1 | tee "$RESULTS_DIR/${TAG}_train.log"
END_TIME=$(date +%s)