quick fix to not OOM main speedrun script
This commit is contained in:
+2
-2
@@ -58,8 +58,8 @@ python -m nanochat.dataset -n 8
|
|||||||
# See comment below for why 370 is the right number here
|
# See comment below for why 370 is the right number here
|
||||||
python -m nanochat.dataset -n 370 &
|
python -m nanochat.dataset -n 370 &
|
||||||
DATASET_DOWNLOAD_PID=$!
|
DATASET_DOWNLOAD_PID=$!
|
||||||
# train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data
|
# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data
|
||||||
python -m scripts.tok_train --max-chars=2000000000 --vocab-size=65536
|
python -m scripts.tok_train
|
||||||
# evaluate the tokenizer (report compression ratio etc.)
|
# evaluate the tokenizer (report compression ratio etc.)
|
||||||
python -m scripts.tok_eval
|
python -m scripts.tok_eval
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from nanochat.dataset import parquets_iter_batched
|
|||||||
# Parse command line arguments
|
# Parse command line arguments
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Train a BPE tokenizer')
|
parser = argparse.ArgumentParser(description='Train a BPE tokenizer')
|
||||||
parser.add_argument('--max-chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)')
|
# The help text must state the actual default (2_000_000_000 = 2B characters).
parser.add_argument('--max-chars', type=int, default=2_000_000_000, help='Maximum characters to train on (default: 2B)')
|
||||||
parser.add_argument('--doc-cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)')
|
parser.add_argument('--doc-cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)')
|
||||||
parser.add_argument('--vocab-size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)')
|
parser.add_argument('--vocab-size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|||||||
Reference in New Issue
Block a user