diff --git a/bench.py b/bench.py index c28d1df..9ebb280 100644 --- a/bench.py +++ b/bench.py @@ -7,7 +7,7 @@ import time import torch from model import GPTConfig, GPT -device = 'cuda:3' +device = 'cuda' torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn torch.manual_seed(1337) @@ -45,23 +45,52 @@ model.to(device) optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95)) -burn_in = 10 # number of burn in steps where we don't measure time -num_steps = 30 -for k in range(num_steps): +profile = False # use pytorch profiler, or just simple benchmarking? +if profile: + # useful docs on pytorch profiler: + # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html + # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile + wait, warmup, active = 5, 5, 5 + num_steps = wait + warmup + active + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], + schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'), + record_shapes=True, + profile_memory=True, + with_stack=True, # incurs an additional overhead, disable if not needed + with_flops=True, + with_modules=False, # only for torchscript models atm + ) as prof: - if k == burn_in: - t0 = time.time() # start the timer + for k in range(num_steps): + X, Y = get_batch('train') + with torch.autocast(device_type='cuda', dtype=torch.bfloat16): + logits, loss = model(X, Y) + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") - X, Y = get_batch('train') - with torch.autocast(device_type='cuda', dtype=torch.bfloat16): - logits, loss = model(X, Y) + prof.step() # notify the profiler at end of each step - optimizer.zero_grad(set_to_none=True) - loss.backward() - optimizer.step() - lossf = loss.item() - print(f"{k}/{num_steps} loss: {lossf:.4f}") +else: -torch.cuda.synchronize() -t1 = time.time() -print("time in ms per iteration: %.2f" % ((t1 - t0) / (num_steps - burn_in) * 1000)) + # simple benchmarking + torch.cuda.synchronize() + for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark + t0 = time.time() + for k in range(num_steps): + X, Y = get_batch('train') + with torch.autocast(device_type='cuda', dtype=torch.bfloat16): + logits, loss = model(X, Y) + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + lossf = loss.item() + print(f"{k}/{num_steps} loss: {lossf:.4f}") + torch.cuda.synchronize() + t1 = time.time() + if stage == 1: + print(f"time per iteration: {(t1-t0)/num_steps*1000:.4f}ms")