mirror of
				https://github.com/osmarks/nanogpt-experiments.git
				synced 2025-10-26 12:57:41 +00:00 
			
		
		
		
	small tweaks, make default WD be 0.1 as is often cited, and remove spurious init of LayerNorm, which is already initialized at 1,0
This commit is contained in:
		
							
								
								
									
										4
									
								
								model.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								model.py
									
									
									
									
									
								
							| @@ -173,10 +173,6 @@ class GPT(nn.Module): | ||||
|                 torch.nn.init.zeros_(module.bias) | ||||
|         elif isinstance(module, nn.Embedding): | ||||
|             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) | ||||
|-        elif isinstance(module, (LayerNorm, nn.LayerNorm)): | ||||
|-            torch.nn.init.ones_(module.weight) | ||||
|-            if module.bias is not None: | ||||
|-                torch.nn.init.zeros_(module.bias) | ||||
|  | ||||
|     def forward(self, idx, targets=None): | ||||
|         device = idx.device | ||||
|   | ||||
							
								
								
									
										2
									
								
								train.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								train.py
									
									
									
									
									
								
							| @@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers? | ||||
| # adamw optimizer | ||||
| learning_rate = 6e-4 # max learning rate | ||||
| max_iters = 600000 # total number of training iterations | ||||
|-weight_decay = 1e-2 | ||||
|+weight_decay = 1e-1 | ||||
| beta1 = 0.9 | ||||
| beta2 = 0.95 | ||||
| grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Andrej Karpathy
					Andrej Karpathy