Mirror of https://github.com/osmarks/nanogpt-experiments.git, synced 2025-10-31 07:13:01 +00:00

	add gradient accumulation support to simulate larger batch sizes. ty @VHellendoorn for original PR
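
For context: each call to loss.backward() adds into the parameters' .grad buffers, so running gradient_accumulation_steps forward/backward passes before a single optimizer.step() approximates one update on a batch of batch_size * gradient_accumulation_steps samples (with the config in this diff, 12 * 1024 * gradient_accumulation_steps tokens per step and per process). A minimal standalone sketch of the idea, using a hypothetical toy model and data loader rather than the repo's GPT and get_batch:

import torch
import torch.nn as nn

model = nn.Linear(16, 1)          # stand-in for the GPT model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
gradient_accumulation_steps = 4   # micro-batches per optimizer step
micro_batch_size = 12

def get_batch():
    # stand-in for get_batch('train'); random data just for illustration
    return torch.randn(micro_batch_size, 16), torch.randn(micro_batch_size, 1)

optimizer.zero_grad(set_to_none=True)
for micro_step in range(gradient_accumulation_steps):
    x, y = get_batch()
    loss = nn.functional.mse_loss(model(x), y)
    # scaling by 1/gradient_accumulation_steps keeps the accumulated gradient
    # comparable to one backward pass over the full batch; the diff below
    # accumulates the unscaled losses instead
    (loss / gradient_accumulation_steps).backward()
optimizer.step()  # one update from the accumulated gradients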
Changed files: train.py (25 lines changed)
@@ -38,7 +38,8 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-batch_size = 12
+gradient_accumulation_steps = 1 # used to simulate larger batch sizes
+batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
 n_layer = 12
@@ -217,6 +218,7 @@ while True:
     else:
         lr = learning_rate
 
+    # evaluate the loss on train/val sets and write checkpoints
     if iter_num % eval_interval == 0 and gpu_id == 0:
         losses = estimate_loss()
         print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
@@ -244,20 +246,27 @@ while True:
     if iter_num == 0 and eval_only:
         break
 
-    X, Y = get_batch('train')
-    with ctx:
-        logits, loss = model(X, Y)
-
+    # forward backward update, with optional gradient accumulation to simulate larger batch size
     optimizer.zero_grad(set_to_none=True)
-    loss.backward()
-    # TODO: gradient clipping evaluate need for
+    for micro_step in range(gradient_accumulation_steps):
+        X, Y = get_batch('train')
+        if ddp:
+            # in DDP training we only need to sync gradients at the last micro step.
+            # the official way to do this is with model.no_sync() context manager, but
+            # I really dislike that this bloats the code and forces us to repeat code
+            # looking at the source of that context manager, it just toggles this variable
+            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
+        with ctx:
+            logits, loss = model(X, Y)
+        loss.backward()
     optimizer.step()
 
+    # timing and logging
     t1 = time.time()
     dt = t1 - t0
     t0 = t1
     if iter_num % log_interval == 0 and gpu_id == 0:
-        lossf = loss.item() # loss as float. TODO CPU-GPU sync: profile, make sure not slow af
+        lossf = loss.item() # loss as float. TODO note CPU-GPU sync! profile, make sure not too slow
         print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
     iter_num += 1
 
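
The inline comment above notes that model.no_sync() is the official way to skip the gradient all-reduce on the intermediate micro-steps. For comparison, a sketch of that route; this is not code from the commit, and ddp_model, get_batch, ctx and optimizer are stand-ins for the corresponding objects in train.py:

import contextlib

def train_step(ddp_model, optimizer, get_batch, ctx, gradient_accumulation_steps):
    # ddp_model is assumed to be a torch.nn.parallel.DistributedDataParallel wrapper
    optimizer.zero_grad(set_to_none=True)
    for micro_step in range(gradient_accumulation_steps):
        X, Y = get_batch('train')
        # the gradient all-reduce runs inside backward(), so no_sync() has to wrap
        # the backward call; only the final micro-step synchronizes across ranks
        sync_ctx = (
            contextlib.nullcontext()
            if micro_step == gradient_accumulation_steps - 1
            else ddp_model.no_sync()
        )
        with sync_ctx:
            with ctx:
                logits, loss = ddp_model(X, Y)
            loss.backward()
    optimizer.step()
    return loss

Toggling require_backward_grad_sync, as the diff does, has the same effect without the extra nesting, though it relies on a DDP internal rather than the public context manager.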
Author: Andrej Karpathy