#!/bin/bash
# End-to-end smoke for upstream nanochat: dataset → tokenizer → tiny base_train.
# Idempotent: caches venv, uv-cache, and dataset shards under /data/nanochat-smoke/.
# Targets ailab (CN, RTX 5090). Uses CN mirrors for PyPI / pytorch wheels / HuggingFace.
#
# Environment overrides (all optional):
#   CACHE_ROOT        root of the checkout and caches (default: /data/nanochat-smoke)
#   UV_DEFAULT_INDEX  package index used by uv (default: aliyun PyPI mirror)
#   HF_ENDPOINT       HuggingFace endpoint (default: https://hf-mirror.com); also
#                     used as the host when patching the dataset URL below
#   WANDB_API_KEY     when non-empty, the run gets a real name instead of "dummy"
#   CI_RUN_TAG        run name to use when WANDB_API_KEY is set
#                     (default: smoke-<timestamp>)

set -euo pipefail

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

export PATH="$HOME/.local/bin:$PATH"

# Fail early with a clear message instead of after the clone/patch steps.
command -v uv >/dev/null || die "uv not found on PATH (including \$HOME/.local/bin)"

CACHE_ROOT=${CACHE_ROOT:-/data/nanochat-smoke}
NANOCHAT_DIR=$CACHE_ROOT/nanochat
export NANOCHAT_BASE_DIR=$CACHE_ROOT/cache
export UV_CACHE_DIR=$CACHE_ROOT/uv-cache
export UV_DEFAULT_INDEX=${UV_DEFAULT_INDEX:-https://mirrors.aliyun.com/pypi/simple/}
export UV_INDEX_STRATEGY=unsafe-best-match
export HF_ENDPOINT=${HF_ENDPOINT:-https://hf-mirror.com}
export OMP_NUM_THREADS=1

mkdir -p "$CACHE_ROOT" "$NANOCHAT_BASE_DIR"

# wandb: real run if WANDB_API_KEY is set, otherwise fall back to dummy (DummyWandb).
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  RUN_TAG=${CI_RUN_TAG:-smoke-$(date +%Y%m%d-%H%M%S)}
else
  RUN_TAG=dummy
fi

# Clone only when the checkout directory is absent; try GitHub directly first,
# then the ghfast.top proxy URL if the direct clone fails.
if [[ ! -d "$NANOCHAT_DIR" ]]; then
  command -v git >/dev/null || die "git is required to clone nanochat"
  echo "Cloning nanochat into $NANOCHAT_DIR"
  git clone https://github.com/karpathy/nanochat.git "$NANOCHAT_DIR" \
    || git clone https://ghfast.top/https://github.com/karpathy/nanochat.git "$NANOCHAT_DIR"
fi

cd "$NANOCHAT_DIR"

# CN mirror patches — idempotent: sed only matches the upstream URLs.
sed -i \
  -e "s|https://download.pytorch.org/whl/cu128|https://mirror.sjtu.edu.cn/pytorch-wheels/cu128|g" \
  -e "s|https://download.pytorch.org/whl/cpu|https://mirror.sjtu.edu.cn/pytorch-wheels/cpu|g" \
  pyproject.toml uv.lock

# Point the dataset URL at the same endpoint exported as HF_ENDPOINT, so an
# override applies to both. The trailing slash is dropped to avoid "//datasets".
# Because only the upstream URL is matched, an already-patched checkout is not
# re-pointed when HF_ENDPOINT changes between runs.
hf_base=${HF_ENDPOINT%/}
case "$hf_base" in
  # These characters are special in the sed expression below ('|' is its delimiter).
  *[\|\&\\]*) die "HF_ENDPOINT must not contain '|', '&' or backslash: $hf_base" ;;
esac
sed -i \
  -e "s|https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle|${hf_base}/datasets/karpathy/climbmix-400b-shuffle|" \
  nanochat/dataset.py

# Reuse the cached virtualenv when present; uv sync brings it up to date.
[[ -d .venv ]] || uv venv
uv sync --extra gpu --index-strategy unsafe-best-match
source .venv/bin/activate

echo "=== [1/4] download 1 climbmix shard ==="
time python -m nanochat.dataset -n 1

echo "=== [2/4] train tokenizer (50M chars) ==="
time python -m scripts.tok_train --max-chars=50000000

echo "=== [3/4] tok_eval ==="
time python -m scripts.tok_eval

echo "=== [4/4] base_train d=6 50 iters ==="
time python -m scripts.base_train \
  --depth=6 \
  --head-dim=64 \
  --window-pattern=L \
  --max-seq-len=512 \
  --device-batch-size=8 \
  --total-batch-size=4096 \
  --eval-every=25 \
  --eval-tokens=131072 \
  --core-metric-every=-1 \
  --sample-every=25 \
  --num-iterations=50 \
  --run="$RUN_TAG"

echo "=== smoke done ==="