ci: nanochat smoke on ailab gpu runner
smoke / nanochat-smoke (push) Failing after 7s

- scripts/smoke.sh: end-to-end nanochat pipeline (dataset → tokenizer
  → tiny d=6 base_train, 50 iters); idempotent, caches under
  /data/nanochat-smoke (venv, uv-cache, shards).
- CN mirrors baked in: aliyun PyPI, sjtu pytorch-wheels/cu128, hf-mirror.
- The workflow now targets the `gpu` runner label (ailab) and replaces the earlier dummy workflow.
This commit is contained in:
Fam Zheng
2026-05-05 21:57:22 +01:00
parent b33e6af06c
commit c1108ae01f
3 changed files with 80 additions and 14 deletions
-14
View File
@@ -1,14 +0,0 @@
# Placeholder workflow: prints basic host information from the runner.
name: dummy

# Runs on every push and on manual dispatch.
on: [push, workflow_dispatch]

jobs:
  hello:
    runs-on: ailab
    steps:
      # The trailing `|| true` keeps the step green when nvidia-smi fails.
      - name: Greet
        run: |
          echo "hello from $(hostname) as $(whoami)"
          uname -a
          nvidia-smi --query-gpu=name,memory.free --format=csv,noheader || true
+15
View File
@@ -0,0 +1,15 @@
# Smoke workflow: checks out the repo, reports GPU state, then runs
# scripts/smoke.sh on a runner carrying the `gpu` label.
name: smoke

# Runs on every push and on manual dispatch.
on: [push, workflow_dispatch]

jobs:
  nanochat-smoke:
    runs-on: gpu
    # Hard cap on the whole job.
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4

      # Unlike a best-effort probe, this step fails the job if nvidia-smi fails.
      - name: nvidia-smi
        run: nvidia-smi --query-gpu=name,memory.free,memory.used --format=csv

      - name: smoke
        run: bash scripts/smoke.sh
+65
View File
@@ -0,0 +1,65 @@
#!/bin/bash
# Smoke test for upstream nanochat, run end to end:
#   dataset download -> tokenizer training -> tiny base_train.
# Re-runnable: the venv, uv cache and dataset shards are kept under
# /data/nanochat-smoke/ (override the location with CACHE_ROOT).
# Written for ailab (CN, RTX 5090); PyPI, pytorch wheels and HuggingFace
# are reached through CN mirrors.
#
# Environment overrides honoured here:
#   CACHE_ROOT        root directory for everything cached by this script
#   UV_DEFAULT_INDEX  package index handed to uv
#   HF_ENDPOINT       HuggingFace endpoint
set -euo pipefail

# Put per-user binaries (~/.local/bin) first on PATH.
PATH="$HOME/.local/bin:$PATH"
export PATH

# Cache layout; only the values other tools read are exported.
: "${CACHE_ROOT:=/data/nanochat-smoke}"
NANOCHAT_DIR="${CACHE_ROOT}/nanochat"
NANOCHAT_BASE_DIR="${CACHE_ROOT}/cache"
UV_CACHE_DIR="${CACHE_ROOT}/uv-cache"
export NANOCHAT_BASE_DIR UV_CACHE_DIR

# Mirror endpoints: a value already present in the environment wins.
: "${UV_DEFAULT_INDEX:=https://mirrors.aliyun.com/pypi/simple/}"
: "${HF_ENDPOINT:=https://hf-mirror.com}"
export UV_DEFAULT_INDEX HF_ENDPOINT

# Fixed settings.
UV_INDEX_STRATEGY=unsafe-best-match
OMP_NUM_THREADS=1
export UV_INDEX_STRATEGY OMP_NUM_THREADS
# Make sure the cache root and nanochat's base directory exist.
mkdir -p "$CACHE_ROOT" "$NANOCHAT_BASE_DIR"

# Populate the cached checkout on first use.
# The clone goes into a staging directory and is renamed into place only
# once git reports success. If the job is killed mid-clone, $NANOCHAT_DIR
# never exists in a half-written state, so the "directory exists" check
# below stays a trustworthy cache test on the next run.
if [[ ! -d "$NANOCHAT_DIR" ]]; then
  echo "Cloning nanochat into $NANOCHAT_DIR"
  clone_tmp="${NANOCHAT_DIR}.partial"
  clone_ok=0
  # Try GitHub directly first, then the ghfast.top proxy URL.
  for clone_url in \
    https://github.com/karpathy/nanochat.git \
    https://ghfast.top/https://github.com/karpathy/nanochat.git; do
    # Drop leftovers from an earlier interrupted or failed attempt.
    rm -rf -- "${clone_tmp:?}"
    if git clone "$clone_url" "$clone_tmp"; then
      clone_ok=1
      break
    fi
    echo "git clone from $clone_url failed" >&2
  done
  if [[ "$clone_ok" -ne 1 ]]; then
    echo "error: could not clone nanochat from any source" >&2
    rm -rf -- "${clone_tmp:?}"
    exit 1
  fi
  mv -- "$clone_tmp" "$NANOCHAT_DIR"
fi

# Every later step runs relative to the checkout.
cd "$NANOCHAT_DIR"
#######################################
# Rewrite upstream download URLs in the nanochat checkout to mirrors.
# Idempotent: each sed expression matches only the upstream URL, so a file
# that is already patched is left unchanged.
# Globals:
#   HF_ENDPOINT (read) - HuggingFace endpoint substituted into
#     nanochat/dataset.py; defaults to https://hf-mirror.com. A trailing
#     slash is dropped. It must be a plain URL, because the value is pasted
#     into a sed replacement ('|' and '&' are not escaped).
# Arguments: none. Operates on pyproject.toml, uv.lock and
#   nanochat/dataset.py relative to the current directory.
# Returns: the status of the last sed invocation; under `set -e` the
#   script stops at the first failing sed.
#######################################
patch_cn_mirrors() {
  local whl_upstream=https://download.pytorch.org/whl
  local whl_mirror=https://mirror.sjtu.edu.cn/pytorch-wheels
  local hf_upstream=https://huggingface.co
  local hf_dataset=datasets/karpathy/climbmix-400b-shuffle
  # Honour the HF_ENDPOINT override instead of hard-coding the mirror, so
  # the dataset URL agrees with the endpoint the rest of the run uses.
  local hf_mirror=${HF_ENDPOINT:-https://hf-mirror.com}
  hf_mirror=${hf_mirror%/}

  sed -i \
    -e "s|${whl_upstream}/cu128|${whl_mirror}/cu128|g" \
    -e "s|${whl_upstream}/cpu|${whl_mirror}/cpu|g" \
    pyproject.toml uv.lock
  sed -i \
    -e "s|${hf_upstream}/${hf_dataset}|${hf_mirror}/${hf_dataset}|" \
    nanochat/dataset.py
}

patch_cn_mirrors
# Create the virtualenv only when the checkout does not already have one.
if [[ ! -d .venv ]]; then
  uv venv
fi

# Sync dependencies including the "gpu" extra, with the same index strategy
# that is exported via UV_INDEX_STRATEGY.
uv sync --extra gpu --index-strategy unsafe-best-match

# Activate the venv so the `python` calls below resolve inside it.
. .venv/bin/activate
# Four timed stages; under `set -e` the first failing stage aborts the run.

echo "=== [1/4] download 1 climbmix shard ==="
time python -m nanochat.dataset -n 1

echo "=== [2/4] train tokenizer (50M chars) ==="
time python -m scripts.tok_train --max-chars=50000000

echo "=== [3/4] tok_eval ==="
time python -m scripts.tok_eval

echo "=== [4/4] base_train d=6 50 iters ==="
# Flags for the tiny training run, kept in an array so each option sits on
# its own line without backslash continuations.
base_train_args=(
  --depth=6
  --head-dim=64
  --window-pattern=L
  --max-seq-len=512
  --device-batch-size=8
  --total-batch-size=4096
  --eval-every=25
  --eval-tokens=131072
  --core-metric-every=-1
  --sample-every=25
  --num-iterations=50
  --run=dummy
)
time python -m scripts.base_train "${base_train_args[@]}"

echo "=== smoke done ==="