Quick fix so the main speedrun script does not run out of memory (OOM)

2026-01-26 22:31:42 +00:00
parent 85b3e95e09
commit 8630d32be4
2 changed files with 3 additions and 3 deletions
@@ -58,8 +58,8 @@ python -m nanochat.dataset -n 8
 # See comment below for why 370 is the right number here
 python -m nanochat.dataset -n 370 &
 DATASET_DOWNLOAD_PID=$!
-# train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data
-python -m scripts.tok_train --max-chars=2000000000 --vocab-size=65536
+# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data (no flags are passed, so these values rely on the scripts.tok_train defaults)
+python -m scripts.tok_train
 # evaluate the tokenizer (report compression ratio etc.)
 python -m scripts.tok_eval