Fix bug: every rank must be able to create checkpoint_dir when saving optimizer state
This commit is contained in:
@@ -34,6 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data,
|
|||||||
logger.info(f"Saved metadata to: {meta_path}")
|
logger.info(f"Saved metadata to: {meta_path}")
|
||||||
# Note that optimizer state is sharded across ranks, so each rank must save its own.
|
# Note that optimizer state is sharded across ranks, so each rank must save its own.
|
||||||
if optimizer_data is not None:
|
if optimizer_data is not None:
|
||||||
|
os.makedirs(checkpoint_dir, exist_ok=True)
|
||||||
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
|
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
|
||||||
torch.save(optimizer_data, optimizer_path)
|
torch.save(optimizer_data, optimizer_path)
|
||||||
logger.info(f"Saved optimizer state to: {optimizer_path}")
|
logger.info(f"Saved optimizer state to: {optimizer_path}")
|
||||||
|
|||||||
Reference in New Issue
Block a user