1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-10 20:09:58 +00:00

Merge pull request #71 from cchan/patch-1

Zero-grad more aggressively to save memory
This commit is contained in:
Andrej 2023-01-20 14:38:10 -08:00 committed by GitHub
commit 3611338959
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -259,7 +259,6 @@ while True:
 break
 # forward backward update, with optional gradient accumulation to simulate larger batch size
-optimizer.zero_grad(set_to_none=True)
 for micro_step in range(gradient_accumulation_steps):
 X, Y = get_batch('train')
 if ddp:
@@ -272,6 +271,7 @@ while True:
 logits, loss = model(X, Y)
 loss.backward()
 optimizer.step()
+optimizer.zero_grad(set_to_none=True)
 # timing and logging
 t1 = time.time()