Merge pull request #71 from cchan/patch-1

Zero-grad more aggressively to save memory
2025-07-01 09:32:50 +00:00 · 2023-01-20 14:38:10 -08:00 · 2023-01-20 14:38:10 -08:00 · 3611338959
commit 3611338959
parent 1f77d03024 67166079c9
1 changed file with 1 addition and 1 deletion
--- a/train.py
+++ b/train.py
@@ -259,7 +259,6 @@ while True:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
-    optimizer.zero_grad(set_to_none=True)
    for micro_step in range(gradient_accumulation_steps):
        X, Y = get_batch('train')
        if ddp:
@@ -272,6 +271,7 @@ while True:
            logits, loss = model(X, Y)
        loss.backward()
    optimizer.step()
+    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()