mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-12-18 14:10:28 +00:00

add data loading into benchmarking as well, just for completeness

Andrej Karpathy 2022-12-29 00:05:32 +00:00
parent 70b5d93aee
commit b760ef1358
2 changed files with 26 additions and 7 deletions

README.md

@@ -60,4 +60,4 @@ I briefly tried finetuning gpt2 a bit more on our OWT and didn't notice dramatic
 
 ## benchmarking
 
-For model benchmarking `bench.py` might be useful. It's identical to what happens in `train.py` except we're measuring just the fwd+bwd+update time of the model on a fixed random batch of data.
+For model benchmarking `bench.py` might be useful. It's identical to what happens in the meat of the training loop of `train.py`, but omits much of the other complexities.
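
As a rough orientation for what "just the fwd+bwd+update time" means, the sketch below shows a single such step under bfloat16 autocast. It is generic PyTorch, not the literal bench.py code (which is excerpted in the hunks below), and it assumes a nanoGPT-style model whose forward returns (logits, loss):

    import torch

    def fwd_bwd_update(model, optimizer, x, y):
        # one forward pass, backward pass and parameter update (the unit bench.py times)
        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            logits, loss = model(x, y)  # nanoGPT's GPT returns (logits, loss)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        return loss.item()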

bench.py

@@ -1,7 +1,8 @@
 """
-A much shorter version of train.py for benchmarking the model
+A much shorter version of train.py for benchmarking
 """
+import os
+import numpy as np
 import time
 import torch
 from model import GPTConfig, GPT
@@ -14,6 +15,26 @@ torch.manual_seed(1337)
 batch_size = 8
 block_size = 1024
+# data loading init
+real_data = True
+if real_data:
+    dataset = 'openwebtext'
+    data_dir = os.path.join('data', dataset)
+    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+    def get_batch(split):
+        data = train_data # note ignore split in benchmarking script
+        ix = torch.randint(len(data) - block_size, (batch_size,))
+        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+        x, y = x.to(device), y.to(device)
+        return x, y
+else:
+    # alternatively, if fixed data is desired to not care about data loading
+    x = torch.randint(50257, (batch_size, block_size), device=device)
+    y = torch.randint(50257, (batch_size, block_size), device=device)
+    get_batch = lambda split: (x, y)
+# model init
 gptconf = GPTConfig(
     block_size = block_size, # how far back does the model look? i.e. context size
     n_layer = 12, n_head = 12, n_embd = 768, # size of the model
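
The real_data branch above reads train.bin as a flat np.memmap of uint16 token ids and samples random windows of block_size tokens. If you only want to exercise that code path without running the full openwebtext preparation, a dummy file in the same format is enough; a minimal sketch (the directory layout matches the hard-coded path above, the token values are arbitrary placeholders):

    import os
    import numpy as np

    data_dir = os.path.join('data', 'openwebtext')
    os.makedirs(data_dir, exist_ok=True)
    # random fake GPT-2 BPE ids (vocab size 50257), stored exactly as the memmap loader expects
    tokens = np.random.randint(0, 50257, size=1_000_000).astype(np.uint16)
    tokens.tofile(os.path.join(data_dir, 'train.bin'))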
@@ -22,9 +43,6 @@ gptconf = GPTConfig(
 model = GPT(gptconf)
 model.to(device)
 
-x = torch.randint(50257, (batch_size, block_size), device=device)
-y = torch.randint(50257, (batch_size, block_size), device=device)
-
 optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95))
 
 burn_in = 10 # number of burn in steps where we don't measure time
@@ -34,8 +52,9 @@ for k in range(num_steps):
     if k == burn_in:
         t0 = time.time() # start the timer
+    X, Y = get_batch('train')
     with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
-        logits, loss = model(x, y)
+        logits, loss = model(X, Y)
     optimizer.zero_grad(set_to_none=True)
     loss.backward()
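
The burn_in handling in this last hunk is a standard warmup-then-measure pattern: the first iterations absorb one-time costs (CUDA context setup, caching allocator warmup) and are excluded from the timing. A standalone sketch of the same idea, with a hypothetical step_fn standing in for the fwd+bwd+update step:

    import time

    def time_steps(step_fn, burn_in=10, num_steps=30):
        # run burn_in warmup iterations, then time the remaining ones
        for k in range(num_steps):
            if k == burn_in:
                t0 = time.time()  # start the clock only once warmup is done
            step_fn()
        return (time.time() - t0) / (num_steps - burn_in)  # average seconds per measured step

Since CUDA kernels are launched asynchronously, calling torch.cuda.synchronize() right before each time.time() would make the per-step numbers more precise; the sketch above just mirrors the structure of the diff.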