document the legacy fineweb100b dataset and the new climbmix400b dataset

This commit is contained in:
Andrej Karpathy
2026-03-03 17:24:31 +00:00
parent aba30cb037
commit b07604ebaa
+46 -12
View File
@@ -1,5 +1,5 @@
""" """
Repackage the FinewebEdu-100B dataset into shards: Repackage a given dataset into simple parquet shards:
- each shard is ~100MB in size (after zstd compression) - each shard is ~100MB in size (after zstd compression)
- parquets are written with row group size of 1000 - parquets are written with row group size of 1000
@@ -10,6 +10,16 @@ The big deal is that our DataLoader will be able to stream
the data and cache it along the way on disk, decreasing the the data and cache it along the way on disk, decreasing the
training latency. training latency.
Historical context:
Originally, nanochat used the FinewebEdu-100B dataset.
Then we switched to the ClimbMix-400B dataset due to superior performance.
This script documents how both were prepared.
The outputs are here:
https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle
https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle
NOTE: This file is meant only as reference/documentation of the NOTE: This file is meant only as reference/documentation of the
dataset preparation and it is not used during the project runtime. dataset preparation and it is not used during the project runtime.
""" """
@@ -20,12 +30,37 @@ from datasets import load_dataset
import pyarrow.parquet as pq import pyarrow.parquet as pq
import pyarrow as pa import pyarrow as pa
# User-adjustable knobs:
dataset_tag = "climbmix"
upload_to_hf = True

# Per-dataset settings. Each supported tag defines the same five names:
#   dataset_kwargs   - keyword arguments forwarded to load_dataset(**dataset_kwargs)
#   output_dirname   - suffix appended to the local base_data_ output directory
#   data_column_name - which field of every document holds the payload
#   tokenizer        - decoder for pre-tokenized payloads, or None for raw text
#   upload_tag       - repository name used under karpathy/ when uploading
# Unknown tags are rejected up front, before any configuration is assigned.
if dataset_tag not in ("fineweb_edu", "climbmix"):
    raise ValueError(f"Unknown dataset tag: {dataset_tag}")

if dataset_tag == "climbmix":
    # the ClimbMix data is stored tokenized with GPT-2 tokenizer, so tiktoken
    # is imported only on this branch and used to decode tokens back to text
    import tiktoken
    dataset_kwargs = dict(
        path="nvidia/Nemotron-ClimbMix",
        split="train",
    )
    output_dirname = "climbmix"
    data_column_name = "tokens"
    tokenizer = tiktoken.encoding_for_model("gpt-2")
    upload_tag = "climbmix-400b-shuffle"
else:
    # dataset_tag == "fineweb_edu": documents are already plain text
    dataset_kwargs = dict(
        path="HuggingFaceFW/fineweb-edu",
        split="train",
        name="sample-100BT",  # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
    )
    output_dirname = "fineweb_edu"
    data_column_name = "text"
    tokenizer = None
    upload_tag = "fineweb-edu-100b-shuffle"
# Source dataset # Source dataset
dataset_kwargs = {
"path": "HuggingFaceFW/fineweb-edu",
"split": "train",
"name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
}
ds = load_dataset(**dataset_kwargs) ds = load_dataset(**dataset_kwargs)
# Shuffle to scramble the order # Shuffle to scramble the order
@@ -34,7 +69,7 @@ ndocs = len(ds) # total number of documents to process
print(f"Total number of documents: {ndocs}") print(f"Total number of documents: {ndocs}")
# Repackage into parquet files # Repackage into parquet files
output_dir = "/home/ubuntu/.cache/nanochat/base_data" output_dir = f"/home/ubuntu/.cache/nanochat/base_data_{output_dirname}"
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
# Write to parquet files # Write to parquet files
@@ -47,7 +82,8 @@ total_docs_processed = 0
total_time_spent = 0 total_time_spent = 0
t0 = time.time() t0 = time.time()
for doc in ds: for doc in ds:
text = doc['text'] data = doc[data_column_name]
text = tokenizer.decode(data) if tokenizer is not None else data
shard_docs.append(text) shard_docs.append(text)
shard_characters += len(text) shard_characters += len(text)
collected_enough_chars = shard_characters >= chars_per_shard collected_enough_chars = shard_characters >= chars_per_shard
@@ -79,14 +115,12 @@ for doc in ds:
shard_index += 1 shard_index += 1
# Demonstration of how the data was later uploaded to HuggingFace # Demonstration of how the data was later uploaded to HuggingFace
def upload(): if upload_to_hf:
import os
from huggingface_hub import HfApi from huggingface_hub import HfApi
token = os.getenv("HF_TOKEN") token = os.getenv("HF_TOKEN")
api = HfApi(token=token) api = HfApi(token=token)
api.upload_large_folder( api.upload_large_folder(
folder_path=output_dir, folder_path=output_dir,
repo_id="karpathy/fineweb-edu-100b-shuffle", repo_id=f"karpathy/{upload_tag}",
repo_type="dataset", repo_type="dataset",
) )
# upload()