Note that our original parameter counts were wrong due to an error (in our previous blog posts and paper). Thus you may have seen small referred to as 117M and medium referred to as 345M.
Taking GPT-2 124M (hereafter simply GPT-2) as an example, the hyperparameters are configured as follows:
| Components | Hyperparameters | Value | Notation | Description |
| --- | --- | --- | --- | --- |
| Input Embedding | `vocab_size` | 50257 | $V$ | number of tokens |
| Positional Embedding | `n_positions` | 1024 | $T$ | maximum sequence length |
| Nearly every sublayer | `n_embd` | 768 | $C$ | embedding dimension |
| - | `n_layer` | 12 | $N$ | number of block layers |
| Self Attention | `n_head` | 12 | $H$ | number of attention heads |
| Residual Connections | `resid_pdrop` | 0.1 | $p_{\text{resid}}$ | dropout probability for residual connections |
| Embedding Layer | `embd_pdrop` | 0.1 | $p_{\text{embd}}$ | dropout probability for embedding layer |
| Self Attention | `attn_pdrop` | 0.1 | $p_{\text{attn}}$ | dropout probability for attention weights |
| Layer Norm | `layer_norm_epsilon` | 1e-5 | $\epsilon$ | layer norm epsilon |
| - | `initializer_range` | 0.02 | $\sigma$ | standard deviation of weight initializer |
| Layer Norm and Linears | `bias` | True | - | whether to include bias in the Linears and LayerNorms |
The implementation is as follows:
```python
from dataclasses import dataclass


@dataclass
class GPTConfig:
    # number of block layers
    n_layer: int = 12
    # number of attention heads
    n_head: int = 12
    # embedding dimension
    n_embd: int = 768
    # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    vocab_size: int = 50257
    # maximum sequence length
    n_positions: int = 1024
    # dropout probability for embedding layer
    embd_pdrop: float = 0.1
    # dropout probability for residual connections
    resid_pdrop: float = 0.1
    # dropout probability for attention weights
    attn_pdrop: float = 0.1
    # layer norm epsilon
    layer_norm_epsilon: float = 1e-5
    # std of weight initializer
    initializer_range: float = 0.02
    # whether to include bias in the Linears and LayerNorms
    bias: bool = True
```
```python
import math

import torch
import torch.nn as nn


class CausalSelfAttention(nn.Module):
    def __init__(
        self,
        n_embd: int,
        n_positions: int,
        n_head: int,
        attn_pdrop: float = 0.1,
        resid_pdrop: float = 0.1,
        bias: bool = True,
    ) -> None:
        """Initialize the module.

        Args:
            n_embd (int): Embedding dimension.
            n_positions (int): Maximum sequence length.
            n_head (int): Number of attention heads.
            attn_pdrop (float, optional): Dropout probability for attention weights.
                Defaults to 0.1.
            resid_pdrop (float, optional): Dropout probability for residual connections.
                Defaults to 0.1.
            bias (bool, optional): Whether to include bias terms when calculating
                k, q, v projections. Defaults to True.
        """
        super().__init__()
        assert n_embd % n_head == 0  # n_embd must be divisible by n_head
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # special scaled initialization
        # regularization
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)
        self.n_head = n_head
        # precompute and cache the causal mask
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(n_positions, n_positions)).view(
                1, 1, n_positions, n_positions
            ),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, n_embd).

        Returns:
            torch.Tensor: Output tensor of the same shape as input.
        """
        # B: batch size, T: sequence length, C: embedding dimension (=n_embd)
        B, T, C = x.size()

        # calculate q, k, v for all heads in batch
        # (B, T, C) -> (B, T, 3C) -> (B, T, C) x 3
        q, k, v = self.c_attn(x).split(C, dim=-1)
        # move head dim forward to be the batch dim
        # (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs)
        # C = nh * hs, where nh: number of heads, hs: head size
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # (B, nh, T, hs) x (B, nh, hs, T) = (B, nh, T, T)
        attn_weights = (q @ k.transpose(-2, -1)) * (
            1.0 / math.sqrt(k.size(-1))  # scaling factor
        )
        attn_weights.masked_fill_(self.mask[:, :, :T, :T] == 0, float("-inf"))
        attn_weights = torch.softmax(attn_weights, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)
        # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = attn_weights @ v
        # re-assemble all head outputs side by side
        y = (
            y.transpose(1, 2)  # (B, T, nh, hs)
            .contiguous()  # equivalent to `.reshape(B, T, C)`
            .view(B, T, C)  # (B, T, C)
        )

        # output projection
        return self.resid_dropout(self.c_proj(y))  # (B, T, C)
```
This implementation differs slightly from the mathematical formulation given earlier; the main difference lies in how Q, K, V are computed:
- `q, k, v = self.c_attn(x).split(C, dim=-1)`: the linear layer `self.c_attn` maps the input dimension from C to 3C in a single projection, which is equivalent to computing the concatenated Q, K, V directly; `split` then separates the result into the individual Q, K, V (see the sketch below).
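As a quick check of this equivalence, the fused projection can be compared against three per-matrix projections obtained by slicing its weight. This is a minimal sketch; the names `w_q`, `w_k`, `w_v` are only for illustration:

```python
import torch
import torch.nn as nn

C = 8  # small embedding dimension for illustration
x = torch.randn(2, 4, C)  # (B, T, C)

# fused projection: C -> 3C, then split the output into Q, K, V
c_attn = nn.Linear(C, 3 * C, bias=False)
q, k, v = c_attn(x).split(C, dim=-1)

# equivalent view: three separate C -> C projections sharing the fused weights
w_q, w_k, w_v = c_attn.weight.split(C, dim=0)  # each of shape (C, C)
assert torch.allclose(q, x @ w_q.T)
assert torch.allclose(k, x @ w_k.T)
assert torch.allclose(v, x @ w_v.T)
```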
PyTorch's `F.scaled_dot_product_attention` covers both variants through its `is_causal` argument:

- Encoder Self-Attention: set `is_causal=False` in the forward pass so that no causal mask is applied; query, key, and value can all be taken from the same input.
- Decoder Causal Self-Attention: set `is_causal=True` in the forward pass; again, query, key, and value can be taken from the same input (a minimal usage sketch follows this list).
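A minimal usage sketch of the two modes (assuming PyTorch >= 2.0; the shapes are arbitrary and only for illustration):

```python
import torch
import torch.nn.functional as F

B, nh, T, hs = 2, 4, 8, 16  # batch, heads, sequence length, head size
q = torch.randn(B, nh, T, hs)
k = torch.randn(B, nh, T, hs)
v = torch.randn(B, nh, T, hs)

# encoder-style self-attention: every position attends to every position
y_enc = F.scaled_dot_product_attention(q, k, v, is_causal=False)

# decoder-style causal self-attention: position t attends only to positions <= t
y_dec = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(y_enc.shape, y_dec.shape)  # both (B, nh, T, hs)
```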
To support Flash Attention, the code is modified as follows:
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class CausalSelfAttention(nn.Module):
    def __init__(
        self,
        n_embd: int,
        n_positions: int,
        n_head: int,
        attn_pdrop: float = 0.1,
        resid_pdrop: float = 0.1,
        bias: bool = True,
    ) -> None:
        """Initialize the module.

        Args:
            n_embd (int): Embedding dimension.
            n_positions (int): Maximum sequence length.
            n_head (int): Number of attention heads.
            attn_pdrop (float, optional): Dropout probability for attention weights.
                Defaults to 0.1.
            resid_pdrop (float, optional): Dropout probability for residual connections.
                Defaults to 0.1.
            bias (bool, optional): Whether to include bias terms when calculating
                k, q, v projections. Defaults to True.
        """
        super().__init__()
        assert n_embd % n_head == 0  # n_embd must be divisible by n_head
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # special scaled initialization
        # regularization
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.attn_pdrop = attn_pdrop  # save for Flash Attention
        self.resid_dropout = nn.Dropout(resid_pdrop)
        self.n_head = n_head
        # flash attention, supported only in PyTorch >= 2.0
        self.flash = hasattr(F, "scaled_dot_product_attention")
        if not self.flash:
            print(
                "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
            )
        # precompute and cache the causal mask
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(n_positions, n_positions)).view(
                1, 1, n_positions, n_positions
            ),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, n_embd).

        Returns:
            torch.Tensor: Output tensor of the same shape as input.
        """
        # B: batch size, T: sequence length, C: embedding dimension (=n_embd)
        B, T, C = x.size()

        # calculate q, k, v for all heads in batch
        # (B, T, C) -> (B, T, 3C) -> (B, T, C) x 3
        q, k, v = self.c_attn(x).split(C, dim=-1)
        # move head dim forward to be the batch dim
        # (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs)
        # C = nh * hs, where nh: number of heads, hs: head size
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = F.scaled_dot_product_attention(
                q,
                k,
                v,
                dropout_p=self.attn_pdrop if self.training else 0,
                is_causal=True,
            )
        else:
            # (B, nh, T, hs) x (B, nh, hs, T) = (B, nh, T, T)
            attn_weights = (q @ k.transpose(-2, -1)) * (
                1.0 / math.sqrt(k.size(-1))  # scaling factor
            )
            attn_weights.masked_fill_(
                self.mask[:, :, :T, :T] == 0, float("-inf")
            )
            attn_weights = torch.softmax(attn_weights, dim=-1)
            attn_weights = self.attn_dropout(attn_weights)
            # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
            y = attn_weights @ v
        # re-assemble all head outputs side by side
        y = (
            y.transpose(1, 2)  # (B, T, nh, hs)
            .contiguous()  # equivalent to `.reshape(B, T, C)`
            .view(B, T, C)  # (B, T, C)
        )

        # output projection
        return self.resid_dropout(self.c_proj(y))  # (B, T, C)
```
```python
class MLP(nn.Module):
    def __init__(
        self, n_embd: int, resid_pdrop: float = 0.1, bias: bool = True
    ) -> None:
        """Initialize the module.

        Args:
            n_embd (int): Embedding dimension.
            resid_pdrop (float, optional): Dropout probability for residual connections.
                Defaults to 0.1.
            bias (bool, optional): Whether to include bias terms in the linear layers.
                Defaults to True.
        """
        super().__init__()
        self.c_fc = nn.Linear(n_embd, 4 * n_embd, bias=bias)
        self.act = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4 * n_embd, n_embd, bias=bias)
        self.c_proj.NANOGPT_SCALE_INIT = 1  # special scaled initialization
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, n_embd).

        Returns:
            torch.Tensor: Output tensor of the same shape as input.
        """
        x = self.c_fc(x)
        x = self.act(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x
```
```python
class Block(nn.Module):
    def __init__(
        self,
        n_embd: int,
        n_positions: int,
        n_head: int,
        attn_pdrop: float = 0.1,
        resid_pdrop: float = 0.1,
        layer_norm_epsilon: float = 1e-5,
        bias: bool = True,
    ) -> None:
        """Initialize the module.

        Args:
            n_embd (int): Embedding dimension.
            n_positions (int): Maximum sequence length.
            n_head (int): Number of attention heads.
            attn_pdrop (float, optional): Dropout probability for attention weights.
                Defaults to 0.1.
            resid_pdrop (float, optional): Dropout probability for residual connections.
                Defaults to 0.1.
            layer_norm_epsilon (float, optional): Layer norm epsilon. Defaults to 1e-5.
            bias (bool, optional): Whether to include bias terms in the layers.
                Defaults to True.
        """
        super().__init__()
        self.ln_1 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon, bias=bias)
        self.attn = CausalSelfAttention(
            n_embd, n_positions, n_head, attn_pdrop, resid_pdrop, bias
        )
        self.ln_2 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon, bias=bias)
        self.mlp = MLP(n_embd, resid_pdrop, bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, n_embd).

        Returns:
            torch.Tensor: Output tensor of the same shape as input.
        """
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
```
```python
def _init_params(self, module: nn.Module) -> None:
    """Initialize the parameters of modules.

    1. Linear:
        - Weights: Normal(mean=0.0, std=self.initializer_range)
          If module has `NANOGPT_SCALE_INIT` (e.g., `c_proj` layers in
          `CausalSelfAttention` and `MLP`), `std` will be scaled by
          `(2 * self.n_layer) ** -0.5`
        - Biases: Zeros if exists
    2. Embedding:
        - Weights: Normal(mean=0.0, std=self.initializer_range)
    3. LayerNorm:
        - Weights: Ones
        - Biases: Zeros if exists

    Args:
        module (nn.Module): Modules to initialize.
    """
    if isinstance(module, nn.Linear):
        std = self.initializer_range
        # special scaled initialization
        if hasattr(module, "NANOGPT_SCALE_INIT"):
            std *= (2 * self.n_layer) ** -0.5
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=self.initializer_range)
    elif isinstance(module, nn.LayerNorm):
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        nn.init.ones_(module.weight)
```
```python
def forward(
    self, idx: torch.Tensor, targets: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Forward pass.

    Args:
        idx (torch.Tensor): Token indices of shape (B, T), where:
            - B: batch size
            - T: sequence length.
        targets (torch.Tensor, optional): Ground truth token indices of shape (B, T).
            If provided, the loss is calculated using cross entropy. Defaults to None.

    Returns:
        tuple[torch.Tensor, torch.Tensor | None]: A tuple containing:
            - logits (torch.Tensor): Output tensor of shape (B, T, vocab_size)
              containing the unnormalized log probabilities for each token in the
              vocabulary.
            - loss (torch.Tensor | None): The computed cross entropy loss if targets
              is provided, otherwise None.
    """
    T = idx.size(1)  # (B, T)
    assert (
        T <= self.n_positions
    ), f"Cannot forward sequence of length {T}, block size is only {self.n_positions}"
    # forward the position embeddings
    pos = torch.arange(T, device=idx.device)  # (T,)
    pos_emb = self.transformer.wpe(pos)  # (T, C)
    # forward the token embeddings
    tok_emb = self.transformer.wte(idx)  # (B, T, C)
    x = tok_emb + pos_emb  # (B, T, C)
    # forward the blocks of the transformer
    for block in self.transformer.h:
        x = block(x)
    # forward the final layernorm and the classifier
    x = self.transformer.ln_f(x)
    logits = self.lm_head(x)  # (B, T, vocab_size)
    # calculate loss if targets are provided
    loss: torch.Tensor | None = None
    if targets is not None:
        loss = F.cross_entropy(
            input=logits.view(-1, logits.size(-1)),  # (B*T, vocab_size)
            target=targets.view(-1),  # (B*T,)
        )
    return logits, loss
```
```python
@classmethod
def from_pretrained(cls, model_type: str) -> "GPT":
    """Load pretrained GPT-2 model weights from huggingface.

    Args:
        model_type (str): Model type to load. Must be one of
            {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}

    Returns:
        GPT: Pretrained GPT-2 model.
    """
    assert model_type in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}
    from transformers import GPT2LMHeadModel
    from dataclasses import asdict

    print("loading weights from pretrained gpt: %s" % model_type)

    # n_layer, n_head and n_embd are determined from model_type
    cfg_args: dict[str, Any] = {
        "gpt2": dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
        "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
    }[model_type]
    # create a from-scratch initialized minGPT model
    model = GPT(**asdict(GPTConfig(**cfg_args)))
    sd = model.state_dict()
    sd_keys = sd.keys()
    sd_keys = [
        k for k in sd_keys if not k.endswith(".attn.mask")
    ]  # discard this mask / buffer, not a param

    # init a huggingface/transformers model
    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    sd_hf = model_hf.state_dict()

    # copy while ensuring all of the parameters are aligned and match in names and shapes
    sd_keys_hf = sd_hf.keys()
    sd_keys_hf = [
        k for k in sd_keys_hf if not k.endswith(".attn.masked_bias")
    ]  # ignore these, just a buffer
    sd_keys_hf = [
        k for k in sd_keys_hf if not k.endswith(".attn.bias")
    ]  # same, just the mask (buffer)

    # basically the openai checkpoints use a `Conv1D` module, but we only want to use a vanilla Linear
    # this means that we have to transpose these weights when we import them
    transposed = [
        "attn.c_attn.weight",
        "attn.c_proj.weight",
        "mlp.c_fc.weight",
        "mlp.c_proj.weight",
    ]
    assert len(sd_keys_hf) == len(
        sd_keys
    ), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
    for k in sd_keys_hf:
        if any(k.endswith(w) for w in transposed):
            # special treatment for the Conv1D weights we need to transpose
            assert sd_hf[k].shape[::-1] == sd[k].shape
            with torch.no_grad():
                sd[k].copy_(sd_hf[k].t())
        else:
            # vanilla copy over the other parameters
            assert sd_hf[k].shape == sd[k].shape
            with torch.no_grad():
                sd[k].copy_(sd_hf[k])
    return model
```
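A quick usage sketch (the first call downloads the corresponding checkpoint from the Hugging Face hub):

```python
model = GPT.from_pretrained("gpt2")  # 124M parameters
model.eval()
```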
```python
self.transformer = nn.ModuleDict(
    dict(
        # token and position embeddings
        wte=nn.Embedding(vocab_size, n_embd),
        wpe=nn.Embedding(n_positions, n_embd),
        # ...
    )
)
```
The input embedding matrix is $E = [e_1, e_2, \ldots, e_V]^\top \in \mathbb{R}^{V \times C}$: an input token index $x_i$ is mapped to its word vector $e_i = E[x_i] \in \mathbb{R}^{C}$. The positional embedding matrix is $P = [p_1, p_2, \ldots, p_T]^\top \in \mathbb{R}^{T \times C}$, where $p_i \in \mathbb{R}^{C}$ is the embedding vector of the $i$-th position and $C$ is the embedding dimension. For simplicity, $T$ here denotes the maximum sequence length, whereas in the code $T$ is the actual sequence length. The positional embeddings are independent of the token indices and depend only on the position $i$, so the implementation does not need to take $X$ as input:
```python
def forward(
    self, idx: torch.Tensor, targets: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor | None]:
    T = idx.size(1)  # (B, T)
    # forward the position embeddings
    pos = torch.arange(T, device=idx.device)  # (T,)
    pos_emb = self.transformer.wpe(pos)  # (T, C)
    # forward the token embeddings
    tok_emb = self.transformer.wte(idx)  # (B, T, C)
    x = tok_emb + pos_emb  # (B, T, C)
```
```python
self.transformer = nn.ModuleDict(
    dict(
        # ...
        ln_f=nn.LayerNorm(n_embd, eps=layer_norm_epsilon, bias=bias),
    )
)
# language model head, bias is set to False to support the weight sharing scheme
self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
# weight sharing scheme
self.transformer.wte.weight = self.lm_head.weight
```
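As a quick sanity check of this weight sharing (a minimal sketch, assuming the `GPT` constructor accepts the `GPTConfig` fields as keyword arguments, as `from_pretrained` above does):

```python
from dataclasses import asdict

model = GPT(**asdict(GPTConfig()))

# wte and lm_head refer to the very same tensor, not just equal values
assert model.lm_head.weight is model.transformer.wte.weight
assert model.lm_head.weight.shape == (50257, 768)  # (vocab_size, n_embd)

# because the tensor is shared, `parameters()` yields it only once,
# saving 50257 * 768 ≈ 38.6M parameters compared to separate matrices
n_params = sum(p.numel() for p in model.parameters())
print(f"total parameters: {n_params:,}")
```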
First, the hidden states pass through a fully connected layer with no bias $b$: $O = HW$, where $W \in \mathbb{R}^{C \times V}$ maps the hidden states to the size of the target vocabulary. In the actual implementation this weight matrix is shared with the input embedding matrix; the two are transposes of each other, i.e. $W = E^\top$. Weight sharing offers the following advantages:
```python
# basically the openai checkpoints use a `Conv1D` module, but we only want to use a vanilla Linear
# this means that we have to transpose these weights when we import them
transposed = [
    "attn.c_attn.weight",
    "attn.c_proj.weight",
    "mlp.c_fc.weight",
    "mlp.c_proj.weight",
]
assert len(sd_keys_hf) == len(
    sd_keys
), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
for k in sd_keys_hf:
    if any(k.endswith(w) for w in transposed):
        # special treatment for the Conv1D weights we need to transpose
        assert sd_hf[k].shape[::-1] == sd[k].shape
        with torch.no_grad():
            sd[k].copy_(sd_hf[k].t())
    else:
        # vanilla copy over the other parameters
        assert sd_hf[k].shape == sd[k].shape
        with torch.no_grad():
            sd[k].copy_(sd_hf[k])
return model
```
The list `transposed` enumerates the weight names that need special handling: in the Hugging Face implementation these layers use the `Conv1D` module, so their weights must be transposed.
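The transpose is needed because of the weight layout: in recent versions of transformers, `Conv1D` (from `transformers.pytorch_utils`) stores its weight as `(in_features, out_features)`, whereas `nn.Linear` stores it as `(out_features, in_features)`. A minimal sketch of the difference, using the `c_attn` dimensions of GPT-2 124M:

```python
import torch.nn as nn
from transformers.pytorch_utils import Conv1D

# GPT-2 124M c_attn: 768 -> 3 * 768 = 2304
hf_layer = Conv1D(nf=2304, nx=768)  # Hugging Face layout
our_layer = nn.Linear(768, 2304)    # vanilla Linear layout

print(hf_layer.weight.shape)   # torch.Size([768, 2304])  -> (in, out)
print(our_layer.weight.shape)  # torch.Size([2304, 768])  -> (out, in)
# hence the check `sd_hf[k].shape[::-1] == sd[k].shape` and the `.t()` on copy
```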