Something I've wanted to do for a while: move all .sh run scripts into their own directory so they don't pollute the root directory.
This commit is contained in:
@@ -0,0 +1,102 @@
|
||||
#!/bin/bash

# Trains a "miniseries" of models at several depths and collects their stats
# into a CSV file.
# See speedrun.sh for more comments
# Usage: ./miniseries.sh [series_name]
# Example: ./miniseries.sh jan11
# Default series name is today's date (e.g., jan11)

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
# Quoted so a $HOME containing whitespace does not split into several paths.
mkdir -p "$NANOCHAT_BASE_DIR"

# Setup (skip with SKIP_SETUP=1)
# Installs uv if needed, creates/syncs the virtualenv, downloads the dataset
# shards and trains the tokenizer. With SKIP_SETUP set, only the existing
# virtualenv is activated. Paths are relative, so run from the repo root.

# Print a message to stderr and abort; a half-finished setup would make every
# training run below fail (or silently use the wrong Python).
setup_fail() {
  printf 'miniseries setup: %s\n' "$*" >&2
  exit 1
}

if [[ -z "${SKIP_SETUP:-}" ]]; then
  # uv
  if ! command -v uv &> /dev/null; then
    curl -LsSf https://astral.sh/uv/install.sh | sh
    # The installer only edits shell profiles, which this running shell never
    # re-reads. NOTE(review): ~/.local/bin is the documented default install
    # dir; adjust if UV_INSTALL_DIR / XDG_BIN_HOME is customised.
    export PATH="$HOME/.local/bin:$PATH"
  fi
  command -v uv &> /dev/null || setup_fail "uv is not available on PATH after install"
  [[ -d ".venv" ]] || uv venv || setup_fail "could not create .venv"
  uv sync --extra gpu || setup_fail "uv sync failed"
  # shellcheck disable=SC1091
  source .venv/bin/activate || setup_fail "could not activate .venv"

  # Tokenizer, download 1000 shards for pretraining
  # (probably this can be reduced but it's tricky to determine the exact right number, TODO).
  python -m nanochat.dataset -n 1000 || setup_fail "dataset download failed"
  python -m scripts.tok_train --max-chars=2000000000 --vocab-size=32768 \
    || setup_fail "tokenizer training failed"
else
  # shellcheck disable=SC1091
  source .venv/bin/activate || setup_fail "could not activate .venv (run once without SKIP_SETUP)"
fi

# Series name: from arg, env var, or default to today's date (e.g., jan11)
# LC_ALL=C pins the month abbreviation to English; without it the default name
# (and every path/tag derived from it) changes with the user's locale.
SERIES_NAME="${1:-${SERIES_NAME:-$(LC_ALL=C date +%b%d | tr '[:upper:]' '[:lower:]')}}"
# Depths to train (the "miniseries")
DEPTHS=(10 11 12 13 14 15 16 17 18 19 20)
# Hardware: processes per node handed to torchrun (override via env)
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
# Logging: wandb run-name prefix (override via env); the depth is appended per run
WANDB_RUN="${WANDB_RUN:-${SERIES_NAME}_miniseries}"

# Per-series output directory: holds one training log per depth plus results.csv
RESULTS_DIR="$NANOCHAT_BASE_DIR/${SERIES_NAME}_miniseries_results"
# Abort early if the directory cannot be created: every later tee/append
# targets it, so continuing would train models and record nothing.
mkdir -p "$RESULTS_DIR" || {
  printf 'cannot create results directory: %s\n' "$RESULTS_DIR" >&2
  exit 1
}
RESULTS_FILE="$RESULTS_DIR/results.csv"

# Write CSV header only if the file is missing or empty, so re-running a series
# appends rows to the existing results instead of clobbering them.
if [[ ! -s "$RESULTS_FILE" ]]; then
  printf '%s\n' "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE"
fi

#######################################
# Print a timestamped message.
# Arguments: the message; multiple arguments are joined with spaces
#            (previously everything after $1 was silently dropped).
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout
#######################################
log() {
  # printf instead of echo: the message is emitted verbatim, never
  # reinterpreted as echo options or escape sequences.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Opening banner: the series title framed by two rule lines.
banner_rule="=============================================="
log "$banner_rule"
log "${SERIES_NAME} Miniseries Training"
log "$banner_rule"

# Tokens consumed per training iteration, used to derive tokens_trained from
# the iteration count. 524288 is the value this script has always assumed
# (presumably base_train's default total batch size -- TODO confirm).
# Override via the environment if training uses a different batch size.
TOKENS_PER_ITER="${TOKENS_PER_ITER:-524288}"

# Print the last line of file $2 that contains the literal string $1
# (prints nothing if there is no such line).
last_log_line() {
  grep -F -- "$1" "$2" | tail -n 1
}

# Print the first run of digits that follows marker $1 inside text $2, with
# thousands separators removed. Prints nothing if the marker is absent.
# Searching only after the marker keeps digit-bearing line prefixes
# (timestamps, rank tags) from being mistaken for the value.
number_after() {
  local marker=$1 text=$2
  [[ "$text" == *"$marker"* ]] || return 0
  text=${text#*"$marker"}
  grep -oE '[0-9,]+' <<<"$text" | head -n 1 | tr -d ','
}

# Print tokens/params ($1 / $2) with two decimals, or "nan" when either value
# is missing, non-numeric, or the divisor is zero.
param_data_ratio() {
  local tokens=$1 params=$2
  if [[ "$tokens" =~ ^[0-9]+$ && "$params" =~ ^[0-9]+$ ]] && (( 10#$params > 0 )); then
    awk -v t="$tokens" -v p="$params" 'BEGIN { printf "%.2f\n", t / p }'
  else
    printf 'nan\n'
  fi
}

# Train one model per depth, scrape its stats out of the training log and
# append one CSV row per successful run.
for d in "${DEPTHS[@]}"; do
  log "Training d=$d..."

  TAG="${SERIES_NAME}_miniseries_d${d}"
  LOG_FILE="$RESULTS_DIR/${TAG}_train.log"
  START_TIME=$(date +%s)

  # Train the model with natural horizon (target_param_data_ratio default)
  # No --target-flops, let it use the default ratio from base_train
  torchrun --standalone --nproc_per_node="$NPROC_PER_NODE" -m scripts.base_train -- \
    --depth="$d" \
    --run="${WANDB_RUN}_d${d}" \
    --model-tag="${TAG}" \
    --core-metric-every=999999 \
    --core-metric-max-per-task=-1 \
    --sample-every=-1 \
    --save-every=-1 \
    2>&1 | tee "$LOG_FILE"
  # The pipeline's own status is tee's; torchrun's must be read from
  # PIPESTATUS immediately, before any other command overwrites it.
  TRAIN_STATUS=${PIPESTATUS[0]}

  END_TIME=$(date +%s)
  TRAIN_TIME=$((END_TIME - START_TIME))

  if (( TRAIN_STATUS != 0 )); then
    # Do not record a row built from a truncated log; move on to the next depth.
    log "  d=$d: training FAILED (exit $TRAIN_STATUS) after ${TRAIN_TIME}s, see $LOG_FILE -- skipping" >&2
    continue
  fi

  # Extract stats from log (always from the last matching line)
  PARAMS_LINE=$(last_log_line "Number of parameters:" "$LOG_FILE")
  NUM_PARAMS=$(number_after "Number of parameters:" "$PARAMS_LINE")
  NUM_SCALING_PARAMS=$(number_after "scaling:" "$PARAMS_LINE")
  NUM_ITERS=$(last_log_line "Calculated number of iterations" "$LOG_FILE" | sed 's/.*: //' | tr -d ',')
  if [[ "$NUM_ITERS" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    TOKENS_TRAINED=$(( 10#$NUM_ITERS * TOKENS_PER_ITER ))
  else
    log "  d=$d: WARNING could not parse iteration count from $LOG_FILE" >&2
    TOKENS_TRAINED="nan"
  fi
  PARAM_DATA_RATIO=$(param_data_ratio "$TOKENS_TRAINED" "$NUM_SCALING_PARAMS")
  # assumes base_train derives model_dim as depth * 64 -- TODO confirm
  MODEL_DIM=$((d * 64))
  VAL_BPB=$(last_log_line "Validation bpb:" "$LOG_FILE" | grep -oE '[0-9.]+$')
  CORE_SCORE=$(last_log_line "CORE metric:" "$LOG_FILE" | awk '{print $NF}')

  # No CORE line in the log: record 0.0 rather than an empty field.
  if [[ -z "$CORE_SCORE" ]]; then
    CORE_SCORE="0.0"
  fi

  log "  d=$d: params=$NUM_PARAMS, scaling=$NUM_SCALING_PARAMS, ratio=$PARAM_DATA_RATIO, bpb=$VAL_BPB, CORE=$CORE_SCORE, time=${TRAIN_TIME}s"

  # Append to CSV
  printf '%s\n' "$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE"
done

# Closing banner, then the accumulated CSV rendered as an aligned table.
closing_rule="=============================================="
log "$closing_rule"
log "${SERIES_NAME} Miniseries Complete!"
log "$closing_rule"
log "Results saved to: $RESULTS_FILE"
printf '\n%s\n' "Results:"
column -t -s ',' "$RESULTS_FILE"
Reference in New Issue
Block a user