mqa -> gqa to reduce confusion
This commit is contained in:
+2
-2
@@ -8,7 +8,7 @@ Notable features:
|
|||||||
- norm after token embedding
|
- norm after token embedding
|
||||||
- no learnable params in rmsnorm
|
- no learnable params in rmsnorm
|
||||||
- no bias in linear layers
|
- no bias in linear layers
|
||||||
- Multi-Query Attention (MQA) support for more efficient inference
|
- Grouped-Query Attention (GQA) support for more efficient inference
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
@@ -29,7 +29,7 @@ class GPTConfig:
|
|||||||
vocab_size: int = 50304
|
vocab_size: int = 50304
|
||||||
n_layer: int = 12
|
n_layer: int = 12
|
||||||
n_head: int = 6 # number of query heads
|
n_head: int = 6 # number of query heads
|
||||||
n_kv_head: int = 6 # number of key/value heads (MQA)
|
n_kv_head: int = 6 # number of key/value heads (GQA)
|
||||||
n_embd: int = 768
|
n_embd: int = 768
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user