nanochat-omni/scripts/smoke.sh

#!/bin/bash
# End-to-end smoke for nanochat-omni: dataset → tokenizer → tiny base_train.
# Runs in-place at the repo root (post-monorepo-fork).
# Idempotent: reuses ./.venv in the working directory and caches uv-cache + dataset
# shards under $CACHE_ROOT (default /data/nanochat-smoke/).
# Targets ailab (CN, RTX 5090). CN mirrors (sjtu pytorch, aliyun PyPI, hf-mirror)
# are committed directly into pyproject.toml/uv.lock/nanochat/dataset.py.
# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Search the per-user bin directory before everything else on PATH
# (presumably where uv is installed on the target host — confirm).
PATH="${HOME}/.local/bin:${PATH}"
export PATH

# Root directory for everything this script caches. A non-empty CACHE_ROOT
# from the caller's environment takes precedence over the default.
: "${CACHE_ROOT:=/data/nanochat-smoke}"
NANOCHAT_BASE_DIR="${CACHE_ROOT}/cache"
UV_CACHE_DIR="${CACHE_ROOT}/uv-cache"
export NANOCHAT_BASE_DIR UV_CACHE_DIR

# Mirror endpoints (aliyun PyPI, hf-mirror): a non-empty value supplied by
# the caller is kept, otherwise the default below is used.
: "${UV_DEFAULT_INDEX:=https://mirrors.aliyun.com/pypi/simple/}"
: "${HF_ENDPOINT:=https://hf-mirror.com}"
export UV_DEFAULT_INDEX HF_ENDPOINT

# Fixed settings, always overwritten regardless of the caller's environment.
UV_INDEX_STRATEGY=unsafe-best-match
OMP_NUM_THREADS=1
export UV_INDEX_STRATEGY OMP_NUM_THREADS

# Create the cache root and the nanochat base directory up front; -p makes
# this a no-op when they already exist.
mkdir -p "${CACHE_ROOT}" "${NANOCHAT_BASE_DIR}"

# wandb run tag, later passed to base_train as --run.
# Default is "dummy" (the DummyWandb fallback). When WANDB_API_KEY is set and
# non-empty, use CI_RUN_TAG if provided, else a timestamped "smoke-…" tag.
RUN_TAG=dummy
if [[ -n "${WANDB_API_KEY:-}" ]]; then
    RUN_TAG="${CI_RUN_TAG:-smoke-$(date +%Y%m%d-%H%M%S)}"
fi

# Create the virtual environment only when ./.venv is missing, so repeat
# runs reuse the existing one.
if [[ ! -d .venv ]]; then
    uv venv
fi

# Sync project dependencies including the "gpu" extra. The explicit
# --index-strategy flag repeats the UV_INDEX_STRATEGY value exported earlier.
uv sync --extra gpu --index-strategy unsafe-best-match

# Activate the venv so the bare `python` invocations that follow use it.
. .venv/bin/activate

# The four smoke stages run in sequence; each is wrapped in `time` so its
# wall-clock cost is reported. Under the script's errexit setting a failing
# stage stops the run before the next banner is printed.

# Stage 1: run the dataset module with -n 1 (one shard, per the banner).
printf '%s\n' "=== [1/4] download 1 climbmix shard ==="
time python -m nanochat.dataset -n 1

# Stage 2: train the tokenizer, capped at 50,000,000 characters.
printf '%s\n' "=== [2/4] train tokenizer (50M chars) ==="
time python -m scripts.tok_train --max-chars=50000000

# Stage 3: evaluate the tokenizer.
printf '%s\n' "=== [3/4] tok_eval ==="
time python -m scripts.tok_eval

# Stage 4: a tiny base-model training run (depth 6, 50 iterations).
# Arguments are collected in an array so each flag sits on its own line
# without backslash continuations; the resulting argv is unchanged.
printf '%s\n' "=== [4/4] base_train d=6 50 iters ==="
base_train_args=(
    --depth=6
    --head-dim=64
    --window-pattern=L
    --max-seq-len=512
    --device-batch-size=8
    --total-batch-size=4096
    --eval-every=25
    --eval-tokens=131072
    --core-metric-every=-1   # NOTE(review): -1 presumably disables this metric — confirm in base_train
    --sample-every=25
    --num-iterations=50
    --run="$RUN_TAG"         # wandb run tag chosen earlier in the script
)
time python -m scripts.base_train "${base_train_args[@]}"

printf '%s\n' "=== smoke done ==="