document the legacy fineweb100b dataset and the new climbmix400b dataset

2026-03-03 17:24:31 +00:00
parent aba30cb037
commit b07604ebaa
1 changed files with 46 additions and 12 deletions
@@ -1,5 +1,5 @@
 """
-Repackage the FinewebEdu-100B dataset into shards:
+Repackage a given dataset into simple parquet shards:
 - each shard is ~100MB in size (after zstd compression)
 - parquets are written with row group size of 1000
@@ -10,6 +10,16 @@ The big deal is that our DataLoader will be able to stream
 the data and cache it along the way on disk, decreasing the
 training latency.
 Historical context:
 Originally, nanochat used the FinewebEdu-100B dataset.
 Then we switched to the ClimbMix-400B dataset due to superior performance.
 This script documents how both were prepared.
 The outputs are here:
 https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle
 https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle
 NOTE: This file is meant only as reference/documentation of the
 dataset preparation and it is not used during the project runtime.
 """
@@ -20,12 +30,37 @@ from datasets import load_dataset
 import pyarrow.parquet as pq
 import pyarrow as pa
 # You can change these:
 dataset_tag = "climbmix"
 upload_to_hf = True
 # Dataset configurations:
 if dataset_tag == "fineweb_edu":
    dataset_kwargs = {
        "path": "HuggingFaceFW/fineweb-edu",
        "split": "train",
        "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
    }
    output_dirname = "fineweb_edu"
    data_column_name = "text"
    tokenizer = None
    upload_tag = "fineweb-edu-100b-shuffle"
 elif dataset_tag == "climbmix":
    import tiktoken # the ClimbMix data is stored tokenized with GPT-2 tokenizer
    dataset_kwargs = {
        "path": "nvidia/Nemotron-ClimbMix",
        "split": "train",
    }
    output_dirname = "climbmix"
    data_column_name = "tokens"
    tokenizer = tiktoken.encoding_for_model("gpt-2")
    upload_tag = "climbmix-400b-shuffle"
 else:
    raise ValueError(f"Unknown dataset tag: {dataset_tag}")
 # Source dataset
 dataset_kwargs = {
    "path": "HuggingFaceFW/fineweb-edu",
    "split": "train",
    "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
 }
 ds = load_dataset(**dataset_kwargs)
 # Shuffle to scramble the order
@@ -34,7 +69,7 @@ ndocs = len(ds) # total number of documents to process
 print(f"Total number of documents: {ndocs}")
 # Repackage into parquet files
-output_dir = "/home/ubuntu/.cache/nanochat/base_data"
+output_dir = f"/home/ubuntu/.cache/nanochat/base_data_{output_dirname}"
 os.makedirs(output_dir, exist_ok=True)
 # Write to parquet files
@@ -47,7 +82,8 @@ total_docs_processed = 0
 total_time_spent = 0
 t0 = time.time()
 for doc in ds:
-    text = doc['text']
+    data = doc[data_column_name]
    text = tokenizer.decode(data) if tokenizer is not None else data
    shard_docs.append(text)
    shard_characters += len(text)
    collected_enough_chars = shard_characters >= chars_per_shard
@@ -79,14 +115,12 @@ for doc in ds:
        shard_index += 1
 # Demonstration of how the data was later uploaded to HuggingFace
-def upload():
+if upload_to_hf:
    import os
    from huggingface_hub import HfApi
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.upload_large_folder(
        folder_path=output_dir,
-        repo_id="karpathy/fineweb-edu-100b-shuffle",
+        repo_id=f"karpathy/{upload_tag}",
        repo_type="dataset",
    )
 # upload()