Fix bug in speedrun script: the device batch size that doesn't OOM on 8xH100 for d24 is 16
This commit is contained in:
+4
-7
@@ -69,13 +69,10 @@ python -m scripts.tok_eval
|
|||||||
echo "Waiting for dataset download to complete..."
|
echo "Waiting for dataset download to complete..."
|
||||||
wait $DATASET_DOWNLOAD_PID
|
wait $DATASET_DOWNLOAD_PID
|
||||||
|
|
||||||
# Number of processes/GPUs to use
|
|
||||||
NPROC_PER_NODE=8
|
|
||||||
|
|
||||||
# d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
|
# d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
|
||||||
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN
|
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --device-batch-size=16 --run=$WANDB_RUN
|
||||||
# evaluate the model: CORE metric, BPB on train/val, and draw samples
|
# evaluate the model: CORE metric, BPB on train/val, and draw samples
|
||||||
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
|
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# SFT (teach the model conversation special tokens, tool use, multiple choice)
|
# SFT (teach the model conversation special tokens, tool use, multiple choice)
|
||||||
@@ -85,8 +82,8 @@ torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
|
|||||||
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
|
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
|
||||||
|
|
||||||
# run SFT and eval the model
|
# run SFT and eval the model
|
||||||
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
|
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN
|
||||||
torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
|
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
|
||||||
|
|
||||||
# chat with the model over CLI! Leave out the -p to chat interactively
|
# chat with the model over CLI! Leave out the -p to chat interactively
|
||||||
# python -m scripts.chat_cli -p "Why is the sky blue?"
|
# python -m scripts.chat_cli -p "Why is the sky blue?"
|
||||||
|
|||||||
Reference in New Issue
Block a user