# nanogpt-experiments/model.py

"""
Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""

import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

@torch.jit.script
def fused_gelu(x):
    """
    Tanh approximation of the GELU activation, compiled with TorchScript.
    Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) elementwise.
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    # cubic correction term that sits inside the tanh
    cubic = x + 0.044715 * torch.pow(x, 3.0)
    squashed = torch.tanh(math.sqrt(2.0 / math.pi) * cubic)
    return 0.5 * x * (1.0 + squashed)

class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention in which every position can only attend to itself
    and to earlier positions of the sequence.
    """

    def __init__(self, config):
        super().__init__()
        # the embedding has to split evenly across the heads
        assert config.n_embd % config.n_head == 0
        width = config.n_embd
        max_len = config.block_size
        # one linear layer produces queries, keys and values for all heads at once
        self.c_attn = nn.Linear(width, 3 * width)
        # projection applied to the re-assembled head outputs
        self.c_proj = nn.Linear(width, width)
        # dropout on the attention weights and on the block output
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        # lower-triangular matrix of ones: entry (i, j) is 1 when position i may look at position j
        lower_triangle = torch.tril(torch.ones(max_len, max_len))
        self.register_buffer("bias", lower_triangle.view(1, 1, max_len, max_len))
        self.n_head = config.n_head
        self.n_embd = width

    def forward(self, x):
        """Apply causal multi-head attention to x of shape (B, T, C) and return a tensor of the same shape."""
        B, T, C = x.size()
        head_size = C // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs): heads become a batch-like dimension
            return t.view(B, T, self.n_head, head_size).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scores = torch.matmul(q, k.transpose(-2, -1)) * (1.0 / math.sqrt(head_size))
        # positions to the right of the query get -inf so softmax assigns them zero weight
        hidden = self.bias[:, :, :T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of values: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = torch.matmul(weights, v)
        # put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        return self.resid_dropout(self.c_proj(y))

class MLP(nn.Module):
    """
    Feed-forward sub-layer: expand to four times the embedding width, apply GELU,
    project back to the embedding width, then apply dropout.
    """

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Transform x (last dimension n_embd) and return a tensor of the same shape."""
        hidden = fused_gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))

class Block(nn.Module):
    """
    One transformer layer: LayerNorm -> causal self-attention and LayerNorm -> MLP,
    each wrapped in a residual connection (normalization is applied before the sub-layer).
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run both residual sub-layers on x and return a tensor of the same shape."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

@dataclass
class GPTConfig:
    """
    Hyperparameters consumed by the GPT model and its sub-modules.
    The defaults match the 'gpt2' entry of GPT.from_pretrained (12 layers, 12 heads, 768 wide).
    """
    # maximum sequence length: size of the position embedding table and of the causal mask
    block_size: int = 1024
    # number of rows in the token embedding table and number of output logits
    vocab_size: int = 50257
    # number of transformer Blocks stacked in the model
    n_layer: int = 12
    # number of attention heads; CausalSelfAttention asserts that it divides n_embd
    n_head: int = 12
    # embedding / hidden width shared by every layer
    n_embd: int = 768
    # dropout probability used on the embeddings, in attention and in the MLP
    dropout: float = 0.1

class GPT(nn.Module):
    """
    GPT language model: token + position embeddings, a stack of transformer Blocks,
    a final LayerNorm and a bias-free linear head that produces vocabulary logits.
    """

    def __init__(self, config):
        """
        Build the model from a GPTConfig-like object providing vocab_size, block_size,
        n_layer, n_head, n_embd and dropout. Prints the parameter count as a side effect.
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),  # position embeddings
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def forward(self, idx, targets=None):
        """
        Compute logits, and optionally the loss, for a batch of token indices.

        idx is a LongTensor of shape (b, t) with t <= block_size. When targets (same
        shape as idx) is given, the cross-entropy loss is also computed; target entries
        equal to -1 are ignored. Returns (logits, loss) where logits has shape
        (b, t, vocab_size) and loss is None when no targets were passed.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    def crop_block_size(self, block_size):
        """
        Shrink the maximum sequence length of the model in place.

        Model surgery for the case where a checkpoint was trained with a large block
        size (e.g. GPT-2 with 1024) but a smaller one is wanted: keeps the first
        block_size position embeddings and crops every attention mask accordingly.
        """
        assert block_size <= self.block_size
        self.block_size = block_size
        wpe = self.transformer.wpe
        wpe.weight = nn.Parameter(wpe.weight[:block_size])
        # keep the module's bookkeeping in sync with the cropped weight
        wpe.num_embeddings = block_size
        for block in self.transformer.h:
            block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Create a GPT initialized with the weights of a Hugging Face GPT-2 checkpoint.

        model_type is one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'. Requires
        the `transformers` package (imported lazily). Raises AssertionError when the
        model type is unknown or the two state dicts do not line up.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        layer_config = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]

        # create a from-scratch initialized minGPT model
        config = GPTConfig(block_size=1024, **layer_config)
        model = GPT(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # The causal-mask buffers are not learned weights and, depending on the
        # transformers version, may or may not be present in the checkpoint's state
        # dict. Leave them out on BOTH sides so the key counts stay comparable; our
        # own mask is already built in CausalSelfAttention.__init__.
        # (the leading dot matters: 'c_attn.bias' must not be caught by the filter)
        mask_suffixes = ('.attn.masked_bias', '.attn.bias')
        keys = [k for k in sd_hf if not k.endswith(mask_suffixes)]
        own_keys = [k for k in sd if not k.endswith(mask_suffixes)]
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
        assert len(keys) == len(own_keys), f"mismatched keys: {len(keys)} != {len(own_keys)}"
        with torch.no_grad():
            for k in keys:
                if k.endswith(transposed):
                    # special treatment for the Conv1D weights we need to transpose
                    assert sd_hf[k].shape[::-1] == sd[k].shape
                    sd[k].copy_(sd_hf[k].t())
                else:
                    # vanilla copy over the other parameters
                    assert sd_hf[k].shape == sd[k].shape
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas):
        """
        Build an AdamW optimizer with two parameter groups.

        Weights of Linear layers are decayed with `weight_decay`; all biases and the
        weights of LayerNorm/Embedding layers are not decayed. Raises AssertionError
        if some parameter ends up in both groups or in neither. Returns the optimizer.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            # recurse=False: look only at the parameters owned directly by m, so each
            # tensor is visited once, together with the module type that decides its group
            for pn, p in m.named_parameters(recurse=False):
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object (sorted names give a deterministic parameter order)
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        temperature divides the logits before sampling. top_k, when given, restricts
        sampling to the k most likely tokens; values larger than the vocabulary size
        are clamped to it. Returns a LongTensor of shape (b, t + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                # torch.topk raises when k exceeds the size of the dimension, so clamp it
                k = min(top_k, logits.size(-1))
                v, _ = torch.topk(logits, k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
first very bad commit 2022-12-28 00:58:19 +00:00			`"""`
			`Full definition of a GPT Language Model, all of it in this single file.`
			`References:`
			`1) the official GPT-2 TensorFlow implementation released by OpenAI:`
			`https://github.com/openai/gpt-2/blob/master/src/model.py`
			`2) huggingface/transformers PyTorch implementation:`
			`https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py`
			`"""`

			`import math`
			`from dataclasses import dataclass`

			`import torch`
			`import torch.nn as nn`
			`from torch.nn import functional as F`

			`@torch.jit.script`
			`def fused_gelu(x):`
			`"""`
			`Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).`
			`Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415`
			`"""`
			`return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))`

			`class CausalSelfAttention(nn.Module):`

			`def __init__(self, config):`
			`super().__init__()`
			`assert config.n_embd % config.n_head == 0`
			`# key, query, value projections for all heads, but in a batch`
			`self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)`
			`# output projection`
			`self.c_proj = nn.Linear(config.n_embd, config.n_embd)`
			`# regularization`
			`self.attn_dropout = nn.Dropout(config.dropout)`
			`self.resid_dropout = nn.Dropout(config.dropout)`
			`# causal mask to ensure that attention is only applied to the left in the input sequence`
			`self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))`
			`.view(1, 1, config.block_size, config.block_size))`
			`self.n_head = config.n_head`
			`self.n_embd = config.n_embd`

			`def forward(self, x):`
			`B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)`

			`# calculate query, key, values for all heads in batch and move head forward to be the batch dim`
			`q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)`
			`k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)`
			`q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)`
			`v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)`

			`# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)`
			`att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))`
			`att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))`
			`att = F.softmax(att, dim=-1)`
			`att = self.attn_dropout(att)`
			`y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)`
			`y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side`

			`# output projection`
			`y = self.resid_dropout(self.c_proj(y))`
			`return y`

			`class MLP(nn.Module):`

			`def __init__(self, config):`
			`super().__init__()`
			`self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)`
			`self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)`
			`self.dropout = nn.Dropout(config.dropout)`

			`def forward(self, x):`
			`x = self.c_fc(x)`
			`x = fused_gelu(x)`
			`x = self.c_proj(x)`
			`x = self.dropout(x)`
			`return x`

			`class Block(nn.Module):`

			`def __init__(self, config):`
			`super().__init__()`
			`self.ln_1 = nn.LayerNorm(config.n_embd)`
			`self.attn = CausalSelfAttention(config)`
			`self.ln_2 = nn.LayerNorm(config.n_embd)`
			`self.mlp = MLP(config)`

			`def forward(self, x):`
			`x = x + self.attn(self.ln_1(x))`
			`x = x + self.mlp(self.ln_2(x))`
			`return x`

			`@dataclass`
			`class GPTConfig:`
			`block_size: int = 1024`
			`vocab_size: int = 50257`
			`n_layer: int = 12`
			`n_head: int = 12`
			`n_embd: int = 768`
			`dropout: float = 0.1`

			`class GPT(nn.Module):`

			`def __init__(self, config):`
			`super().__init__()`
			`assert config.vocab_size is not None`
			`assert config.block_size is not None`
			`self.block_size = config.block_size`

			`self.transformer = nn.ModuleDict(dict(`
			`wte = nn.Embedding(config.vocab_size, config.n_embd),`
			`wpe = nn.Embedding(config.block_size, config.n_embd),`
			`drop = nn.Dropout(config.dropout),`
			`h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),`
			`ln_f = nn.LayerNorm(config.n_embd),`
			`))`
			`self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)`

			`# report number of parameters (note we don't count the decoder parameters in lm_head)`
			`n_params = sum(p.numel() for p in self.transformer.parameters())`
			`print("number of parameters: %.2fM" % (n_params/1e6,))`

			`def forward(self, idx, targets=None):`
			`device = idx.device`
			`b, t = idx.size()`
			`assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"`
			`pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)`

			`# forward the GPT model itself`
			`tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)`
			`pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)`
			`x = self.transformer.drop(tok_emb + pos_emb)`
			`for block in self.transformer.h:`
			`x = block(x)`
			`x = self.transformer.ln_f(x)`
			`logits = self.lm_head(x)`

			`# if we are given some desired targets also calculate the loss`
			`loss = None`
			`if targets is not None:`
			`loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)`

			`return logits, loss`

			`def crop_block_size(self, block_size):`
			`# model surgery to decrease the block size if necessary`
			`# e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)`
			`# but want to use a smaller block size for some smaller, simpler model`
			`assert block_size <= self.block_size`
			`self.block_size = block_size`
			`self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])`
			`for block in self.transformer.h:`
			`block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]`

			`@classmethod`
			`def from_pretrained(cls, model_type):`
			`assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}`
			`from transformers import GPT2LMHeadModel`
			`print("loading weights from pretrained gpt: %s" % model_type)`

			`layer_config = {`
			`'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params`
			`'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params`
			`'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params`
			`'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params`
			`}[model_type]`

			`# create a from-scratch initialized minGPT model`
			`config = GPTConfig(block_size=1024, **layer_config)`
			`model = GPT(config)`
			`sd = model.state_dict()`

			`# init a huggingface/transformers model`
			`model_hf = GPT2LMHeadModel.from_pretrained(model_type)`
			`sd_hf = model_hf.state_dict()`

			`# copy while ensuring all of the parameters are aligned and match in names and shapes`
			`keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these`
			`transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']`
			`# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear`
			`# this means that we have to transpose these weights when we import them`
			`assert len(keys) == len(sd)`
			`for k in keys:`
			`if any(k.endswith(w) for w in transposed):`
			`# special treatment for the Conv1D weights we need to transpose`
			`assert sd_hf[k].shape[::-1] == sd[k].shape`
			`with torch.no_grad():`
			`sd[k].copy_(sd_hf[k].t())`
			`else:`
			`# vanilla copy over the other parameters`
			`assert sd_hf[k].shape == sd[k].shape`
			`with torch.no_grad():`
			`sd[k].copy_(sd_hf[k])`

			`return model`

			`def configure_optimizers(self, weight_decay, learning_rate, betas):`
			`"""`
			`This long function is unfortunately doing something very simple and is being very defensive:`
			`We are separating out all parameters of the model into two buckets: those that will experience`
			`weight decay for regularization and those that won't (biases, and layernorm/embedding weights).`
			`We are then returning the PyTorch optimizer object.`
			`"""`

			`# separate out all parameters to those that will and won't experience regularizing weight decay`
			`decay = set()`
			`no_decay = set()`
			`whitelist_weight_modules = (torch.nn.Linear, )`
			`blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)`
			`for mn, m in self.named_modules():`
			`for pn, p in m.named_parameters():`
			`fpn = '%s.%s' % (mn, pn) if mn else pn # full param name`
			`# random note: because named_modules and named_parameters are recursive`
			`# we will see the same tensors p many many times. but doing it this way`
			`# allows us to know which parent module any tensor p belongs to...`
			`if pn.endswith('bias'):`
			`# all biases will not be decayed`
			`no_decay.add(fpn)`
			`elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):`
			`# weights of whitelist modules will be weight decayed`
			`decay.add(fpn)`
			`elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):`
			`# weights of blacklist modules will NOT be weight decayed`
			`no_decay.add(fpn)`

			`# validate that we considered every parameter`
			`param_dict = {pn: p for pn, p in self.named_parameters()}`
			`inter_params = decay & no_decay`
			`union_params = decay \| no_decay`
			`assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )`
			`assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \`
			`% (str(param_dict.keys() - union_params), )`

			`# create the pytorch optimizer object`
			`optim_groups = [`
			`{"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},`
			`{"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},`
			`]`
			`optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)`
			`return optimizer`

			`@torch.no_grad()`
			`def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):`
			`"""`
			`Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete`
			`the sequence max_new_tokens times, feeding the predictions back into the model each time.`
			`Most likely you'll want to make sure to be in model.eval() mode of operation for this.`
			`"""`
			`for _ in range(max_new_tokens):`
			`# if the sequence context is growing too long we must crop it at block_size`
			`idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]`
			`# forward the model to get the logits for the index in the sequence`
			`logits, _ = self(idx_cond)`
			`# pluck the logits at the final step and scale by desired temperature`
			`logits = logits[:, -1, :] / temperature`
			`# optionally crop the logits to only the top k options`
			`if top_k is not None:`
			`v, _ = torch.topk(logits, top_k)`
			`logits[logits < v[:, [-1]]] = -float('Inf')`
			`# apply softmax to convert logits to (normalized) probabilities`
			`probs = F.softmax(logits, dim=-1)`
			`# sample from the distribution`
			`idx_next = torch.multinomial(probs, num_samples=1)`
			`# append sampled index to the running sequence and continue`
			`idx = torch.cat((idx, idx_next), dim=1)`

			`return idx`