Tune the data mixture a bit, and load the optimizer state by default when running SFT. These were confirmed to be the best settings from SFT sweeps.

This commit is contained in:
Andrej Karpathy
2026-02-16 20:23:04 +00:00
parent 77f8fb8303
commit 1415fb7617
2 changed files with 26 additions and 10 deletions
+3
View File
@@ -186,6 +186,9 @@ def load_optimizer_state(source, device, rank, model_tag=None, step=None):
if step is None:
step = find_last_step(checkpoint_dir)
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
if not os.path.exists(optimizer_path):
log0(f"Optimizer checkpoint not found: {optimizer_path}")
return None
log0(f"Loading optimizer state from {optimizer_path}")
optimizer_data = torch.load(optimizer_path, map_location=device)
return optimizer_data