Add torch.compile by default; it shows an almost 1.8x improvement in throughput. Nice.

2025-10-19 17:47:39 +00:00 · 2022-12-30 00:07:13 +00:00
parent fb52554ca8
commit 5a725d9098
4 changed files with 19 additions and 3 deletions
--- a/train.py
+++ b/train.py
@@ -59,6 +59,7 @@ lr_decay_iters = 320000 # how many steps to decay the learning rate for
 min_lr = 1e-5 # minimum learning rate
 # DDP settings
 backend = 'nccl' # 'nccl', 'gloo', etc.
+compile_model = True # use PyTorch 2.0 to compile the model to be faster
 # -----------------------------------------------------------------------------
 # poor man's Configurator. Potentially a bad idea. Example usage:
 # $ python train.py override_file --batch_size=32
@@ -156,6 +157,12 @@ optimizer = model.configure_optimizers(weight_decay, learning_rate, betas)
 if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])

+# optionally compile the model with torch.compile (controlled by the compile_model flag)
+if compile_model:
+    print("compiling the model... (takes a ~minute)")
+    unoptimized_model = model # keep a reference to the original, uncompiled module before `model` is rebound
+    model = torch.compile(model) # requires PyTorch 2.0
+
 # wrap model into DDP container
 if ddp:
    model = DDP(model, device_ids=[gpu_id])