From 92c6654b9573362777daa44236b2269a75acf9b4 Mon Sep 17 00:00:00 2001
From: duwenjie
Date: Sun, 21 Dec 2025 15:07:04 +0800
Subject: [PATCH 1/3] bugfix save and load ckpt from model_tag dir

---
 scripts/base_eval.py | 21 +++++++++++----------
 scripts/chat_sft.py  |  4 ++--
 scripts/mid_train.py |  2 +-
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 3663538..f6070c4 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -27,6 +27,14 @@ from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
 
+# Configuration
+hf_path = None # optional HuggingFace model path to evaluate
+max_per_task = -1 # max examples per task to evaluate (-1 = disable)
+model_tag = None # optional model tag for the output directory name
+model_step = None # optional model step for the output directory name
+device_type = "" # cuda|cpu|mps (empty => autodetect)
+exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
+
 # -----------------------------------------------------------------------------
 # nanochat specific function dealing with I/O etc.
 
@@ -145,34 +153,27 @@ def load_hf_model(hf_path: str, device):
 
 # -----------------------------------------------------------------------------
 def main():
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
-    args = parser.parse_args()
-
     # distributed / precision setup
     device_type = autodetect_device_type()
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
     autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 
     # Load model and tokenizer from command line or from file system
-    if args.hf_path is not None:
+    if hf_path is not None:
         # atm assume that if a path is given, it's a huggingface model path
-        hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
         model, tokenizer = load_hf_model(hf_path, device)
         model_name = hf_path # just for logging
         model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
         # load a local model from the file system
-        model, tokenizer, meta = load_model("base", device, phase="eval")
+        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
         model_name = f"base_model (step {meta['step']})" # just for logging
         model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
 
     # Evaluate the model
     with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
+        out = evaluate_model(model, tokenizer, device, max_per_task=max_per_task)
 
     # Write out the results to a csv file
     core_metric = None
diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py
index f93a6e6..bb455a8 100644
--- a/scripts/chat_sft.py
+++ b/scripts/chat_sft.py
@@ -250,8 +250,8 @@ for step in range(num_iterations):
 if master_process:
     base_dir = get_base_dir()
     depth = model.config.n_layer
-    model_tag = f"d{depth}" # base the model tag on the depth of the base model
-    checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", model_tag)
+    output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
+    checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", output_dirname)
     model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
     save_checkpoint(
         checkpoint_dir,
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index dd0768c..d817a40 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -207,7 +207,7 @@ while True:
 
     # save checkpoint at the end of the run (only on master process)
     if master_process and last_step and not dry_run:
-        output_dirname = f"d{depth}" # e.g. d12
+        output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
         checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname)
         save_checkpoint(
             checkpoint_dir,

From 78400491899569dfaa82dc8bd6a1c9299abc2bae Mon Sep 17 00:00:00 2001
From: DU Wenjie
Date: Fri, 26 Dec 2025 17:29:08 +0800
Subject: [PATCH 2/3] bugfix keep same args style in scripts/base_eval.py

---
 scripts/base_eval.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index f6070c4..1d680a0 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -27,14 +27,6 @@ from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
 
-# Configuration
-hf_path = None # optional HuggingFace model path to evaluate
-max_per_task = -1 # max examples per task to evaluate (-1 = disable)
-model_tag = None # optional model tag for the output directory name
-model_step = None # optional model step for the output directory name
-device_type = "" # cuda|cpu|mps (empty => autodetect)
-exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
-
 # -----------------------------------------------------------------------------
 # nanochat specific function dealing with I/O etc.
 
@@ -153,27 +145,36 @@ def load_hf_model(hf_path: str, device):
 
 # -----------------------------------------------------------------------------
 def main():
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
+    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
+    parser.add_argument('--model_tag', type=str, default=None, help='optional model tag for the output directory name')
+    parser.add_argument('--model_step', type=str, default=None, help='optional model step for the output directory name')
+    args = parser.parse_args()
+
     # distributed / precision setup
     device_type = autodetect_device_type()
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
     autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 
     # Load model and tokenizer from command line or from file system
-    if hf_path is not None:
+    if args.hf_path is not None:
         # atm assume that if a path is given, it's a huggingface model path
+        hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
         model, tokenizer = load_hf_model(hf_path, device)
         model_name = hf_path # just for logging
         model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
         # load a local model from the file system
-        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
+        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step)
         model_name = f"base_model (step {meta['step']})" # just for logging
         model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
 
     # Evaluate the model
     with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device, max_per_task=max_per_task)
+        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
 
     # Write out the results to a csv file
     core_metric = None

From ea4229851b6109b5b47aa7cfd467d20815947453 Mon Sep 17 00:00:00 2001
From: DU Wenjie
Date: Fri, 26 Dec 2025 17:41:57 +0800
Subject: [PATCH 3/3] bugfix

---
Notes (ignored by git-am):
  --model-tag / --step are only forwarded to load_model() to choose which
  checkpoint is read; the output csv slug still comes from meta['step'].
  --step is parsed as an int, matching the integer meta['step'] that this
  script formats with ':06d'.

 scripts/base_eval.py | 6 +++---
 scripts/chat_rl.py   | 8 +++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 1d680a0..bd83ff3 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -149,8 +149,8 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
-    parser.add_argument('--model_tag', type=str, default=None, help='optional model tag for the output directory name')
-    parser.add_argument('--model_step', type=str, default=None, help='optional model step for the output directory name')
+    parser.add_argument('--model-tag', type=str, default=None, help='optional model tag of the checkpoint directory to load the model from')
+    parser.add_argument('--step', type=int, default=None, help='optional checkpoint step to load the model from')
     args = parser.parse_args()
 
     # distributed / precision setup
@@ -168,7 +168,7 @@ def main():
         model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
         # load a local model from the file system
-        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step)
+        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step)
         model_name = f"base_model (step {meta['step']})" # just for logging
         model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
 
diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py
index bc78e79..e5c8d3f 100644
--- a/scripts/chat_rl.py
+++ b/scripts/chat_rl.py
@@ -31,6 +31,8 @@ from tasks.gsm8k import GSM8K
 # RL hyperparameters
 run = "dummy" # wandb run name
 source = "sft" # mid|sft
+model_tag = None # model tag to load the model from (base model or midtrained model)
+step = None # step to load the model from (base model or midtrained model)
 dtype = "bfloat16"
 device_batch_size = 8 # no forward pass will go above this to not OOM
 examples_per_step = 16 # in total and across all ranks (note: examples, not samples/completions!)
@@ -64,7 +66,7 @@ use_dummy_wandb = run == "dummy" or not master_process
 wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-rl", name=run, config=user_config)
 
 # Init model and tokenizer
-model, tokenizer, meta = load_model(source, device, phase="eval")
+model, tokenizer, meta = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
 engine = Engine(model, tokenizer) # for sampling rollouts
 
 # -----------------------------------------------------------------------------
@@ -307,8 +309,8 @@ for step in range(num_steps):
     if master_process and ((step > 0 and step % save_every == 0) or step == num_steps - 1):
         base_dir = get_base_dir()
         depth = model.config.n_layer
-        model_tag = f"d{depth}" # base the model tag on the depth of the base model
-        checkpoint_dir = os.path.join(base_dir, "chatrl_checkpoints", model_tag)
+        output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12 (depth-based fallback when no model_tag is given)
+        checkpoint_dir = os.path.join(base_dir, "chatrl_checkpoints", output_dirname)
         model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
         save_checkpoint(
             checkpoint_dir,