Tune the data mixture a bit, and load the optimizer state by default when running SFT. These were confirmed to be the best settings in SFT sweeps.
This commit is contained in:
@@ -186,6 +186,9 @@ def load_optimizer_state(source, device, rank, model_tag=None, step=None):
|
||||
if step is None:
|
||||
step = find_last_step(checkpoint_dir)
|
||||
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
|
||||
if not os.path.exists(optimizer_path):
|
||||
log0(f"Optimizer checkpoint not found: {optimizer_path}")
|
||||
return None
|
||||
log0(f"Loading optimizer state from {optimizer_path}")
|
||||
optimizer_data = torch.load(optimizer_path, map_location=device)
|
||||
return optimizer_data
|
||||
|
||||
Reference in New Issue
Block a user