From bc1fca39f33074fec4319ef46d96e09b8024c824 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 15 Nov 2025 15:43:37 +0000 Subject: [PATCH] mqa -> gqa to reduce confusion --- nanochat/gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index b640f1e..8b220c3 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -8,7 +8,7 @@ Notable features: - norm after token embedding - no learnable params in rmsnorm - no bias in linear layers -- Multi-Query Attention (MQA) support for more efficient inference +- Group-Query Attention (GQA) support for more efficient inference """ import math @@ -29,7 +29,7 @@ class GPTConfig: vocab_size: int = 50304 n_layer: int = 12 n_head: int = 6 # number of query heads - n_kv_head: int = 6 # number of key/value heads (MQA) + n_kv_head: int = 6 # number of key/value heads (GQA) n_embd: int = 768