1 star, 0 forks — mirror of https://github.com/osmarks/nanogpt-experiments.git, synced 2025-08-29 17:07:56 +00:00

Zero-grad more aggressively to save memory

This commit is contained in:
Author: Clive Chan
Date: 2023-01-19 22:10:44 -08:00
Committed by: GitHub
Parent: 2c7806db6e
Commit: 67166079c9

View File

@@ -259,7 +259,6 @@ while True:
break
# forward backward update, with optional gradient accumulation to simulate larger batch size
optimizer.zero_grad(set_to_none=True)
for micro_step in range(gradient_accumulation_steps):
X, Y = get_batch('train')
if ddp:
@@ -272,6 +271,7 @@ while True:
logits, loss = model(X, Y)
loss.backward()
optimizer.step()
optimizer.zero_grad(set_to_none=True)
# timing and logging
t1 = time.time()