Nudge the hyperparameters of the base script using the results of the sweeps and the miniseries: vocab size goes down to 32K, and the D:N ratio goes from 20 to 8. Also add the miniseries script.

2026-01-07 22:11:52 +00:00
parent 1b5de29e71
commit ccf4b7f9bf
9 changed files with 333 additions and 21 deletions
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# See speedrun.sh for more comments
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+mkdir -p $NANOCHAT_BASE_DIR
+
+# uv
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+[ -d ".venv" ] || uv venv
+uv sync --extra gpu
+source .venv/bin/activate
+
+# Tokenizer
+python -m nanochat.dataset -n 240
+python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768
+
+# Depths to train (the "miniseries")
+DEPTHS=(10 11 12 13 14 15 16 17 18 19 20)
+# Hardware
+NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
+# Logging
+WANDB_RUN="${WANDB_RUN:-jan7_miniseries}"
+
+RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results"
+mkdir -p "$RESULTS_DIR"
+RESULTS_FILE="$RESULTS_DIR/results.csv"
+
+# Write CSV header
+echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE"
+# Print a message on stdout, prefixed with a [YYYY-MM-DD HH:MM:SS] timestamp.
+# Arguments: $1 - the message text
+log() {
+    local stamp
+    stamp=$(date '+%Y-%m-%d %H:%M:%S')
+    printf '%s\n' "[${stamp}] $1"
+}
+
+log "=============================================="
+log "Jan 7 Miniseries Training"
+log "=============================================="
+
+for d in "${DEPTHS[@]}"; do
+    log "Training d=$d..."
+
+    TAG="jan7_miniseries_d${d}"
+    START_TIME=$(date +%s)
+
+    # Train the model with natural horizon (target_param_data_ratio default)
+    # No --target_flops, let it use the default ratio from base_train
+    torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \
+        --depth=$d \
+        --target_param_data_ratio=8 \
+        --run="${WANDB_RUN}_d${d}" \
+        --model_tag="${TAG}" \
+        --core_metric_every=999999 \
+        --core_metric_max_per_task=-1 \
+        --sample_every=-1 \
+        --save_every=-1 \
+        2>&1 | tee "$RESULTS_DIR/${TAG}_train.log"
+
+    END_TIME=$(date +%s)
+    TRAIN_TIME=$((END_TIME - START_TIME))
+
+    # Extract stats from log
+    LOG_FILE="$RESULTS_DIR/${TAG}_train.log"
+    NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',')
+    NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',')
+    NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',')
+    TOKENS_TRAINED=$((NUM_ITERS * 524288))
+    PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')")
+    MODEL_DIM=$((d * 64))
+    VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$')
+    CORE_SCORE=$(grep "CORE metric:" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+
+    if [ -z "$CORE_SCORE" ]; then
+        CORE_SCORE="0.0"
+    fi
+
+    log "  d=$d: params=$NUM_PARAMS, scaling=$NUM_SCALING_PARAMS, ratio=$PARAM_DATA_RATIO, bpb=$VAL_BPB, CORE=$CORE_SCORE, time=${TRAIN_TIME}s"
+
+    # Append to CSV
+    echo "$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE"
+done
+
+log "=============================================="
+log "Jan 7 Miniseries Complete!"
+log "=============================================="
+log "Results saved to: $RESULTS_FILE"
+echo ""
+echo "Results:"
+column -t -s',' "$RESULTS_FILE"