mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-12-30 12:00:29 +00:00
first very bad commit
This commit is contained in:
commit
fe8042867c
34
README.md
Normal file
34
README.md
Normal file
@ -0,0 +1,34 @@
|
||||
|
||||
# nanoGPT
|
||||
|
||||
The cleanest, fastest repository for training/finetuning medium-sized GPTs.
|
||||
|
||||
This repo currently requires reading the code, but it's not that bad. work ongoing...
|
||||
|
||||
Getting started:
|
||||
|
||||
We need a few dependencies:
|
||||
|
||||
- [pytorch](https://pytorch.org), of course
|
||||
- numpy
|
||||
- `pip install datasets` for huggingface datasets
|
||||
- `pip install tiktoken` for OpenAI's fast bpe code
|
||||
- `pip install wandb` for optional logging
|
||||
|
||||
```
|
||||
$ cd data/openwebtext
|
||||
$ python prepare.py
|
||||
```
|
||||
|
||||
To download and tokenize the [openwebtext](https://huggingface.co/datasets/openwebtext) dataset. It will create a `train.bin` and `val.bin` which holds the GPT2 BPE token ids in a massive sequence. Then we're ready to kick off training. First open up train.py and read it, make sure the settings look ok. Then:
|
||||
|
||||
```
|
||||
$ python train.py
|
||||
```
|
||||
|
||||
Once some checkpoints are written to the output directory `out`, we're ready to sample from the model:
|
||||
|
||||
```
|
||||
$ python sample.py
|
||||
```
|
||||
|
84
data/openwebtext/prepare.py
Normal file
84
data/openwebtext/prepare.py
Normal file
@ -0,0 +1,84 @@
|
||||
# saves the openwebtext dataset to a binary file for training. following was helpful:
|
||||
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
|
||||
|
||||
import mmap
|
||||
import subprocess
|
||||
import numpy as np
|
||||
import tiktoken
|
||||
from datasets import load_dataset # huggingface datasets
|
||||
|
||||
# number of workers in .map() calls
|
||||
# good number to use is ~order num_cpu_cores()
|
||||
num_proc = 16
|
||||
|
||||
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
|
||||
dataset = load_dataset("openwebtext")
|
||||
|
||||
# owt by default only contains the 'train' split, so create a test split
|
||||
split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
|
||||
split_dataset['val'] = split_dataset.pop('test') # rename the test split to val
|
||||
|
||||
# this results in:
|
||||
# >>> split_dataset
|
||||
# DatasetDict({
|
||||
# train: Dataset({
|
||||
# features: ['text'],
|
||||
# num_rows: 8009762
|
||||
# })
|
||||
# val: Dataset({
|
||||
# features: ['text'],
|
||||
# num_rows: 4007
|
||||
# })
|
||||
# })
|
||||
|
||||
# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
|
||||
enc = tiktoken.get_encoding("gpt2")
|
||||
def process(example):
|
||||
ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
|
||||
ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
|
||||
out = {'ids': ids, 'len': len(ids)}
|
||||
return out
|
||||
|
||||
# tokenize the dataset
|
||||
tokenized = split_dataset.map(
|
||||
process,
|
||||
remove_columns=['text'],
|
||||
desc="tokenizing the splits",
|
||||
num_proc=num_proc,
|
||||
)
|
||||
|
||||
# concatenate all the ids in each dataset into one large file we can use for training
|
||||
for split, dset in tokenized.items():
|
||||
|
||||
offset = np.cumsum(dset['len']).tolist()
|
||||
total = offset[-1] # total number of tokens in the dataset
|
||||
dset = dset.add_column('offset', offset)
|
||||
|
||||
# preallocate space in a temporary file to store the concatenated ids
|
||||
filename = f'{split}.bin'
|
||||
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
|
||||
bytes_per_token = 2 # i.e. np.dtype(dtype).itemsize
|
||||
subprocess.run(['truncate', '-s', str(total * bytes_per_token), filename], check=True)
|
||||
|
||||
# write the ids to the file
|
||||
def write_to_file(example):
|
||||
with open(filename, 'r+b') as f:
|
||||
arr_len = len(example['ids'])
|
||||
start = example['offset'] - arr_len
|
||||
mm = mmap.mmap(f.fileno(), 0)
|
||||
arr = np.ndarray((arr_len,), dtype=dtype, buffer=mm, offset=bytes_per_token * start)
|
||||
arr[:] = example['ids']
|
||||
mm.flush()
|
||||
|
||||
dset.map(
|
||||
write_to_file,
|
||||
desc=f"writing {split} split to file {filename}",
|
||||
num_proc=num_proc,
|
||||
)
|
||||
|
||||
# train.bin is ~17GB, val.bin ~8.5MB
|
||||
# train has ~9B tokens (9,035,582,198)
|
||||
# val has ~4M tokens (4,434,897)
|
||||
|
||||
# to read the bin files later, e.g. with numpy:
|
||||
# m = np.memmap('train.bin', dtype=np.uint16, mode='r')
|
15
data/openwebtext/readme.md
Normal file
15
data/openwebtext/readme.md
Normal file
@ -0,0 +1,15 @@
|
||||
|
||||
## openwebtext dataset
|
||||
|
||||
after running `prepare.py` (preprocess) we get:
|
||||
|
||||
- train.bin is ~17GB, val.bin ~8.5MB
|
||||
- train has ~9B tokens (9,035,582,198)
|
||||
- val has ~4M tokens (4,434,897)
|
||||
|
||||
this came from 8,013,769 documents in total.
|
||||
|
||||
references:
|
||||
|
||||
- OpenAI's WebText dataset is discussed in [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
|
||||
- [OpenWebText](https://skylion007.github.io/OpenWebTextCorpus/) dataset
|
267
model.py
Normal file
267
model.py
Normal file
@ -0,0 +1,267 @@
|
||||
"""
|
||||
Full definition of a GPT Language Model, all of it in this single file.
|
||||
References:
|
||||
1) the official GPT-2 TensorFlow implementation released by OpenAI:
|
||||
https://github.com/openai/gpt-2/blob/master/src/model.py
|
||||
2) huggingface/transformers PyTorch implementation:
|
||||
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
|
||||
"""
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
@torch.jit.script
|
||||
def fused_gelu(x):
|
||||
"""
|
||||
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
|
||||
Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
|
||||
|
||||
class CausalSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.n_embd % config.n_head == 0
|
||||
# key, query, value projections for all heads, but in a batch
|
||||
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
|
||||
# output projection
|
||||
self.c_proj = nn.Linear(config.n_embd, config.n_embd)
|
||||
# regularization
|
||||
self.attn_dropout = nn.Dropout(config.dropout)
|
||||
self.resid_dropout = nn.Dropout(config.dropout)
|
||||
# causal mask to ensure that attention is only applied to the left in the input sequence
|
||||
self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
|
||||
.view(1, 1, config.block_size, config.block_size))
|
||||
self.n_head = config.n_head
|
||||
self.n_embd = config.n_embd
|
||||
|
||||
def forward(self, x):
|
||||
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
||||
|
||||
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
||||
q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)
|
||||
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
|
||||
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
||||
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
||||
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
|
||||
att = F.softmax(att, dim=-1)
|
||||
att = self.attn_dropout(att)
|
||||
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
||||
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
||||
|
||||
# output projection
|
||||
y = self.resid_dropout(self.c_proj(y))
|
||||
return y
|
||||
|
||||
class MLP(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
|
||||
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.c_fc(x)
|
||||
x = fused_gelu(x)
|
||||
x = self.c_proj(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
class Block(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.ln_1 = nn.LayerNorm(config.n_embd)
|
||||
self.attn = CausalSelfAttention(config)
|
||||
self.ln_2 = nn.LayerNorm(config.n_embd)
|
||||
self.mlp = MLP(config)
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.attn(self.ln_1(x))
|
||||
x = x + self.mlp(self.ln_2(x))
|
||||
return x
|
||||
|
||||
@dataclass
|
||||
class GPTConfig:
|
||||
block_size: int = 1024
|
||||
vocab_size: int = 50257
|
||||
n_layer: int = 12
|
||||
n_head: int = 12
|
||||
n_embd: int = 768
|
||||
dropout: float = 0.1
|
||||
|
||||
class GPT(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.vocab_size is not None
|
||||
assert config.block_size is not None
|
||||
self.block_size = config.block_size
|
||||
|
||||
self.transformer = nn.ModuleDict(dict(
|
||||
wte = nn.Embedding(config.vocab_size, config.n_embd),
|
||||
wpe = nn.Embedding(config.block_size, config.n_embd),
|
||||
drop = nn.Dropout(config.dropout),
|
||||
h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
|
||||
ln_f = nn.LayerNorm(config.n_embd),
|
||||
))
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
|
||||
# report number of parameters (note we don't count the decoder parameters in lm_head)
|
||||
n_params = sum(p.numel() for p in self.transformer.parameters())
|
||||
print("number of parameters: %.2fM" % (n_params/1e6,))
|
||||
|
||||
def forward(self, idx, targets=None):
|
||||
device = idx.device
|
||||
b, t = idx.size()
|
||||
assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
|
||||
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
|
||||
|
||||
# forward the GPT model itself
|
||||
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
||||
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
|
||||
x = self.transformer.drop(tok_emb + pos_emb)
|
||||
for block in self.transformer.h:
|
||||
x = block(x)
|
||||
x = self.transformer.ln_f(x)
|
||||
logits = self.lm_head(x)
|
||||
|
||||
# if we are given some desired targets also calculate the loss
|
||||
loss = None
|
||||
if targets is not None:
|
||||
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
|
||||
|
||||
return logits, loss
|
||||
|
||||
def crop_block_size(self, block_size):
|
||||
# model surgery to decrease the block size if necessary
|
||||
# e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
|
||||
# but want to use a smaller block size for some smaller, simpler model
|
||||
assert block_size <= self.block_size
|
||||
self.block_size = block_size
|
||||
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
|
||||
for block in self.transformer.h:
|
||||
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, model_type):
|
||||
assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
|
||||
from transformers import GPT2LMHeadModel
|
||||
print("loading weights from pretrained gpt: %s" % model_type)
|
||||
|
||||
layer_config = {
|
||||
'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
|
||||
'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
|
||||
'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
|
||||
'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
|
||||
}[model_type]
|
||||
|
||||
# create a from-scratch initialized minGPT model
|
||||
config = GPTConfig(block_size=1024, **layer_config)
|
||||
model = GPT(config)
|
||||
sd = model.state_dict()
|
||||
|
||||
# init a huggingface/transformers model
|
||||
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
|
||||
sd_hf = model_hf.state_dict()
|
||||
|
||||
# copy while ensuring all of the parameters are aligned and match in names and shapes
|
||||
keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
|
||||
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
|
||||
# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
|
||||
# this means that we have to transpose these weights when we import them
|
||||
assert len(keys) == len(sd)
|
||||
for k in keys:
|
||||
if any(k.endswith(w) for w in transposed):
|
||||
# special treatment for the Conv1D weights we need to transpose
|
||||
assert sd_hf[k].shape[::-1] == sd[k].shape
|
||||
with torch.no_grad():
|
||||
sd[k].copy_(sd_hf[k].t())
|
||||
else:
|
||||
# vanilla copy over the other parameters
|
||||
assert sd_hf[k].shape == sd[k].shape
|
||||
with torch.no_grad():
|
||||
sd[k].copy_(sd_hf[k])
|
||||
|
||||
return model
|
||||
|
||||
def configure_optimizers(self, weight_decay, learning_rate, betas):
|
||||
"""
|
||||
This long function is unfortunately doing something very simple and is being very defensive:
|
||||
We are separating out all parameters of the model into two buckets: those that will experience
|
||||
weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
|
||||
We are then returning the PyTorch optimizer object.
|
||||
"""
|
||||
|
||||
# separate out all parameters to those that will and won't experience regularizing weight decay
|
||||
decay = set()
|
||||
no_decay = set()
|
||||
whitelist_weight_modules = (torch.nn.Linear, )
|
||||
blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
|
||||
for mn, m in self.named_modules():
|
||||
for pn, p in m.named_parameters():
|
||||
fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
|
||||
# random note: because named_modules and named_parameters are recursive
|
||||
# we will see the same tensors p many many times. but doing it this way
|
||||
# allows us to know which parent module any tensor p belongs to...
|
||||
if pn.endswith('bias'):
|
||||
# all biases will not be decayed
|
||||
no_decay.add(fpn)
|
||||
elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
|
||||
# weights of whitelist modules will be weight decayed
|
||||
decay.add(fpn)
|
||||
elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
|
||||
# weights of blacklist modules will NOT be weight decayed
|
||||
no_decay.add(fpn)
|
||||
|
||||
# validate that we considered every parameter
|
||||
param_dict = {pn: p for pn, p in self.named_parameters()}
|
||||
inter_params = decay & no_decay
|
||||
union_params = decay | no_decay
|
||||
assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
|
||||
assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
|
||||
% (str(param_dict.keys() - union_params), )
|
||||
|
||||
# create the pytorch optimizer object
|
||||
optim_groups = [
|
||||
{"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
|
||||
{"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas)
|
||||
return optimizer
|
||||
|
||||
@torch.no_grad()
|
||||
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
|
||||
"""
|
||||
Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
|
||||
the sequence max_new_tokens times, feeding the predictions back into the model each time.
|
||||
Most likely you'll want to make sure to be in model.eval() mode of operation for this.
|
||||
"""
|
||||
for _ in range(max_new_tokens):
|
||||
# if the sequence context is growing too long we must crop it at block_size
|
||||
idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
|
||||
# forward the model to get the logits for the index in the sequence
|
||||
logits, _ = self(idx_cond)
|
||||
# pluck the logits at the final step and scale by desired temperature
|
||||
logits = logits[:, -1, :] / temperature
|
||||
# optionally crop the logits to only the top k options
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, top_k)
|
||||
logits[logits < v[:, [-1]]] = -float('Inf')
|
||||
# apply softmax to convert logits to (normalized) probabilities
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
# sample from the distribution
|
||||
idx_next = torch.multinomial(probs, num_samples=1)
|
||||
# append sampled index to the running sequence and continue
|
||||
idx = torch.cat((idx, idx_next), dim=1)
|
||||
|
||||
return idx
|
37
sample.py
Normal file
37
sample.py
Normal file
@ -0,0 +1,37 @@
|
||||
"""
|
||||
Sample from a trained model
|
||||
"""
|
||||
import os
|
||||
import torch
|
||||
import tiktoken
|
||||
from model import GPTConfig, GPT
|
||||
|
||||
device = 'cuda:2'
|
||||
torch.manual_seed(1337)
|
||||
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
||||
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
||||
|
||||
out_dir = 'out'
|
||||
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
|
||||
checkpoint = torch.load(ckpt_path, map_location=device)
|
||||
|
||||
# model
|
||||
gptconf = GPTConfig(**checkpoint['model_args'])
|
||||
model = GPT(gptconf)
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
model.eval()
|
||||
model.to(device)
|
||||
|
||||
enc = tiktoken.get_encoding("gpt2")
|
||||
#start = enc.encode("\n")
|
||||
start = [enc.eot_token]
|
||||
x = (torch.tensor(start, dtype=torch.long, device=device)[None, ...])
|
||||
|
||||
for k in range(1):
|
||||
|
||||
with torch.no_grad():
|
||||
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
|
||||
y = model.generate(x, 300, temperature=0.8, top_k=200)
|
||||
|
||||
print(enc.decode(y[0].tolist()))
|
||||
print('---------------')
|
191
train.py
Normal file
191
train.py
Normal file
@ -0,0 +1,191 @@
|
||||
"""
|
||||
Train a GPT model on a dataset of text. One GPU version.
|
||||
The text is assumed to pre-tokenized and inside files train.pt and val.pt
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import wandb
|
||||
|
||||
from model import GPTConfig, GPT
|
||||
# -----------------------------------------------------------------------------
|
||||
# settings, todo argparse or something
|
||||
# I/O
|
||||
out_dir = 'out'
|
||||
eval_interval = 500
|
||||
log_interval = 1
|
||||
# wandb logging
|
||||
wandb_log = False
|
||||
wandb_entity = 'karpathy'
|
||||
wandb_project = 'owt'
|
||||
wandb_run_name = 'owt1' # 'run' + str(time.time())
|
||||
# data
|
||||
dataset = 'openwebtext'
|
||||
batch_size = 32
|
||||
block_size = 512
|
||||
# model
|
||||
device = 'cuda:0'
|
||||
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
|
||||
dropout = 0.1
|
||||
n_layer = 12
|
||||
n_head = 12
|
||||
n_embd = 768
|
||||
# adamw optimizer
|
||||
learning_rate = 2.5e-4 # max learning rate
|
||||
max_iters = 500000 # total number of training iterations
|
||||
weight_decay = 1e-2
|
||||
betas = (0.9, 0.95)
|
||||
# learning rate decay settings
|
||||
decay_lr = True # whether to decay the learning rate
|
||||
warmup_iters = 2000 # how many steps to warm up for
|
||||
lr_decay_iters = 320000 # how many steps to decay the learning rate for
|
||||
min_lr = 1e-5 # minimum learning rate
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
torch.manual_seed(1337)
|
||||
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
||||
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
||||
|
||||
# poor man's data loader, TODO use real DataLoader...
|
||||
data_dir = os.path.join('data', dataset)
|
||||
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
|
||||
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
|
||||
def get_batch(split):
|
||||
data = train_data if split == 'train' else val_data
|
||||
ix = torch.randint(len(data) - block_size, (batch_size,))
|
||||
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
|
||||
y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
|
||||
x, y = x.to(device), y.to(device)
|
||||
return x, y
|
||||
|
||||
# model init
|
||||
# TODO I don't love this whole part/API yet
|
||||
model_args = dict(n_layer = n_layer, n_head = n_head, n_embd = n_embd, block_size = block_size, dropout = dropout)
|
||||
if init_from == 'scratch':
|
||||
# init a new model from scratch
|
||||
gptconf = GPTConfig(**model_args)
|
||||
model = GPT(gptconf)
|
||||
elif init_from == 'resume':
|
||||
# resume training from a checkpoint. TODO: do we resume iter_num etc too? (yes...)
|
||||
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
|
||||
checkpoint = torch.load(ckpt_path)
|
||||
checkpoint_model_args = checkpoint['model_args']
|
||||
for k, v in model_args.items():
|
||||
assert checkpoint_model_args[k] == v, "for now"
|
||||
gptconf = GPTConfig(**model_args)
|
||||
model = GPT(gptconf)
|
||||
model.load_state_dict(checkpoint['model'])
|
||||
elif init_from.startswith('gpt2'):
|
||||
# initialize from OpenAI GPT-2 weights
|
||||
model = GPT.from_pretrained(init_from)
|
||||
if block_size < model.block_size:
|
||||
model.crop_block_size(block_size)
|
||||
model.to(device)
|
||||
|
||||
@torch.no_grad()
|
||||
def estimate_loss(eval_iters=50):
|
||||
out = {}
|
||||
model.eval()
|
||||
for split in ['train', 'val']:
|
||||
losses = torch.zeros(eval_iters)
|
||||
for k in range(eval_iters):
|
||||
X, Y = get_batch(split)
|
||||
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
|
||||
logits, loss = model(X, Y)
|
||||
losses[k] = loss.item()
|
||||
out[split] = losses.mean()
|
||||
model.train()
|
||||
return out
|
||||
|
||||
# optimizer
|
||||
optimizer = model.configure_optimizers(weight_decay, learning_rate, betas)
|
||||
if init_from == 'resume':
|
||||
optimizer.load_state_dict(checkpoint['optimizer'])
|
||||
|
||||
# learning rate decay scheduler (cosine with warmup)
|
||||
def get_lr(iter):
|
||||
# 1) linear warmup for warmup_iters steps
|
||||
if iter < warmup_iters:
|
||||
return learning_rate * iter / warmup_iters
|
||||
# 2) if iter > lr_decay_iters, return min learning rate
|
||||
if iter > lr_decay_iters:
|
||||
return min_lr
|
||||
# 3) in between, use cosine decay down to min learning rate
|
||||
decay_ratio = (iter - warmup_iters) / (lr_decay_iters - warmup_iters)
|
||||
assert 0 <= decay_ratio <= 1
|
||||
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # ranges 0..1
|
||||
return min_lr + coeff * (learning_rate - min_lr)
|
||||
|
||||
# logging
|
||||
if wandb_log:
|
||||
wandb.init(project=wandb_project, entity=wandb_entity, name=wandb_run_name)
|
||||
wandb.config = {
|
||||
"batch_size": batch_size,
|
||||
"block_size": block_size,
|
||||
"learning_rate": learning_rate, # TODO log everything else too
|
||||
}
|
||||
|
||||
# training loop
|
||||
iter_num = 0
|
||||
num_tokens = 0
|
||||
best_val_loss = 1e9
|
||||
t0 = time.time()
|
||||
while True:
|
||||
|
||||
# determine the learning rate for this iteration
|
||||
if decay_lr:
|
||||
lr = get_lr(iter_num)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = lr
|
||||
else:
|
||||
lr = learning_rate
|
||||
|
||||
if iter_num % eval_interval == 0:
|
||||
losses = estimate_loss()
|
||||
print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
|
||||
if wandb_log:
|
||||
wandb.log({
|
||||
"iter": iter_num,
|
||||
"num_tokens": num_tokens,
|
||||
"train/loss": losses['train'],
|
||||
"val/loss": losses['val'],
|
||||
"lr": lr,
|
||||
})
|
||||
if losses['val'] < best_val_loss:
|
||||
best_val_loss = losses['val']
|
||||
if iter_num > 0: # don't save checkpoints on very first iteration...
|
||||
checkpoint = {
|
||||
'model': model.state_dict(),
|
||||
'optimizer': optimizer.state_dict(),
|
||||
'model_args': model_args,
|
||||
'iter_num': iter_num,
|
||||
}
|
||||
torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
|
||||
|
||||
X, Y = get_batch('train')
|
||||
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
|
||||
logits, loss = model(X, Y)
|
||||
|
||||
optimizer.zero_grad(set_to_none=True)
|
||||
loss.backward()
|
||||
# TODO: gradient clipping
|
||||
optimizer.step()
|
||||
|
||||
t1 = time.time()
|
||||
dt = t1 - t0
|
||||
t0 = t1
|
||||
if iter_num % log_interval == 0:
|
||||
lossf = loss.item() # loss as float. TODO CPU-GPU sync: profile, make sure not slow af
|
||||
print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
|
||||
iter_num += 1
|
||||
num_tokens += X.numel()
|
||||
|
||||
# termination conditions
|
||||
if iter_num >= max_iters:
|
||||
break
|
||||
|
Loading…
Reference in New Issue
Block a user