feat: pad vocab size to 64 for DDP optimizers and efficiency

2025-12-09 12:38:18 +01:00
parent d5759400f9
commit f1bf69d562
3 changed files with 14 additions and 6 deletions
@@ -26,6 +26,7 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
    def document_batches():
        parquet_paths = list_parquet_files()
+        assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
        parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
        resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
        resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None