mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-11 04:19:57 +00:00

copy pasting what seems to work to bench,sample as well. ty @lantiga

Andrej Karpathy 2023-01-08 19:32:13 +00:00
parent a855d316fd
commit b77c2e86d3
2 changed files with 25 additions and 15 deletions

bench.py

@@ -2,23 +2,29 @@
 A much shorter version of train.py for benchmarking
 """
 import os
+from contextlib import nullcontext
 import numpy as np
 import time
 import torch
 from model import GPTConfig, GPT
 
 # -----------------------------------------------------------------------------
-device = 'cuda'
 batch_size = 8
 block_size = 1024
-compile = True
+seed = 1337
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
+compile = True # use PyTorch 2.0 to compile the model to be faster
 exec(open('configurator.py').read()) # overrides from command line or config file
 # -----------------------------------------------------------------------------
 
-dtype = torch.bfloat16 # todo make configurable
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
 torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
 torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
-torch.manual_seed(1337)
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
 
 # data loading init
 real_data = True

@@ -74,7 +80,7 @@ if profile:
 
         for k in range(num_steps):
             X, Y = get_batch('train')
-            with torch.autocast(device_type='cuda', dtype=dtype):
+            with ctx:
                 logits, loss = model(X, Y)
             optimizer.zero_grad(set_to_none=True)
             loss.backward()

@@ -92,7 +98,7 @@ else:
         t0 = time.time()
         for k in range(num_steps):
             X, Y = get_batch('train')
-            with torch.autocast(device_type='cuda', dtype=dtype):
+            with ctx:
                 logits, loss = model(X, Y)
             optimizer.zero_grad(set_to_none=True)
             loss.backward()
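
The pattern this commit copies from train.py is easier to see outside a diff. Below is a minimal, self-contained sketch (not part of the commit; the toy Linear model, random batch, and learning rate are stand-ins for GPT, get_batch, and the real optimizer config): on CPU, ctx is a do-nothing nullcontext, while on CUDA the forward pass runs under torch.amp.autocast in the configured dtype, and the backward pass stays outside the autocast region.

import torch
from contextlib import nullcontext

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16'  # 'float32', 'bfloat16', or 'float16'
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# no-op on CPU; mixed-precision autocast on CUDA
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

model = torch.nn.Linear(1024, 1024).to(device)              # stand-in for the GPT model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)  # stand-in optimizer config

for step in range(3):
    X = torch.randn(8, 1024, device=device)  # stand-in for get_batch('train')
    with ctx:                                # only the forward pass runs under autocast
        loss = model(X).pow(2).mean()
    optimizer.zero_grad(set_to_none=True)
    loss.backward()                          # backward stays outside the autocast block
    optimizer.step()
    print(f"step {step}: loss {loss.item():.4f}")

Keeping dtype as a string until the ptdtype lookup is what lets configurator.py override it from the command line, which the old "todo make configurable" line could not do.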

sample.py

@@ -2,20 +2,22 @@
 Sample from a trained model
 """
 import os
+from contextlib import nullcontext
 import torch
 import tiktoken
 from model import GPTConfig, GPT
 
 # -----------------------------------------------------------------------------
 out_dir = 'out'
-device = 'cuda'
-compile = False
 start = "\n" # or "<|endoftext|>" or whatever you like
 num_samples = 10 # number of samples to draw
 max_new_tokens = 500 # number of tokens generated in each sample
 temperature = 0.8 # higher temperature (up to 1) is more random, lower (down to 0) means more greedy
 top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
 seed = 1337
+device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
+compile = False # use PyTorch 2.0 to compile the model to be faster
 exec(open('configurator.py').read()) # overrides from command line or config file
 # -----------------------------------------------------------------------------
 

@@ -23,6 +25,9 @@ torch.manual_seed(seed)
 torch.cuda.manual_seed(seed)
 torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
 torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
 
 # model
 ckpt_path = os.path.join(out_dir, 'ckpt.pt')

@@ -45,11 +50,10 @@ enc = tiktoken.get_encoding("gpt2")
 start_ids = enc.encode(start, allowed_special={"<|endoftext|>"})
 x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
 
-for k in range(num_samples):
-
-    with torch.no_grad():
-        with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+# run generation
+with torch.no_grad():
+    with ctx:
+        for k in range(num_samples):
             y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
-
-    print(enc.decode(y[0].tolist()))
-    print('---------------')
+            print(enc.decode(y[0].tolist()))
+            print('---------------')
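
A usage note, not part of the commit: sample.py now enters torch.no_grad() and the autocast context once and draws all num_samples completions inside them, instead of re-entering both contexts on every iteration, and because ctx falls back to nullcontext on CPU, sampling no longer hard-codes CUDA-only bfloat16 autocast.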