Delete autocast (an unnecessary thorn in my side); manage dtypes directly instead

2026-03-04 23:55:24 +00:00
parent 752abc836e
commit 1076f97059
15 changed files with 258 additions and 167 deletions
@@ -7,7 +7,6 @@ python -m scripts.chat_cli
 import argparse
 import torch
 from nanochat.common import compute_init, autodetect_device_type
-from contextlib import nullcontext
 from nanochat.engine import Engine
 from nanochat.checkpoint_manager import load_model

@@ -19,15 +18,12 @@ parser.add_argument('-p', '--prompt', type=str, default='', help='Prompt the mod
 parser.add_argument('-t', '--temperature', type=float, default=0.6, help='Temperature for generation')
 parser.add_argument('-k', '--top-k', type=int, default=50, help='Top-k sampling parameter')
 parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
-parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
 args = parser.parse_args()

 # Init the model and tokenizer

 device_type = autodetect_device_type() if args.device_type == "" else args.device_type
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
 model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)

 # Special tokens for the chat state machine
@@ -87,12 +83,11 @@ while True:
    }
    response_tokens = []
    print("\nAssistant: ", end="", flush=True)
-    with autocast_ctx:
-        for token_column, token_masks in engine.generate(conversation_tokens, **generate_kwargs):
-            token = token_column[0] # pop the batch dimension (num_samples=1)
-            response_tokens.append(token)
-            token_text = tokenizer.decode([token])
-            print(token_text, end="", flush=True)
+    for token_column, token_masks in engine.generate(conversation_tokens, **generate_kwargs):
+        token = token_column[0] # pop the batch dimension (num_samples=1)
+        response_tokens.append(token)
+        token_text = tokenizer.decode([token])
+        print(token_text, end="", flush=True)
    print()
    # we have to ensure that the assistant end token is the last token
    # so even if generation ends due to max tokens, we have to append it to the end