minor args re-arranging and removing some spurious ones like wandb entity, ty @tcapelle

2025-01-05 15:00:28 +00:00 · 2023-01-05 01:14:02 +00:00 · 2023-01-05 01:14:02 +00:00 · 9629093e53
commit 9629093e53
parent 529c967a65
2 changed files with 8 additions and 9 deletions
--- a/sample.py
+++ b/sample.py
@@ -8,7 +8,7 @@ from model import GPTConfig, GPT

 # -----------------------------------------------------------------------------
 out_dir = 'out'
-device = 'cuda:2'
+device = 'cuda'
 compile = False
 start = "\n" # or "<|endoftext|>" or whatever you like
 num_samples = 10 # number of samples to draw
--- a/train.py
+++ b/train.py
@@ -10,7 +10,6 @@ $ torchrun --standalone --nproc_per_node=4 train.py
 """

 import os
-import sys
 import time
 import math

@@ -31,9 +30,9 @@ log_interval = 1
 eval_iters = 200
 eval_only = False # if True, script exits right after the first eval
 always_save_checkpoint = True # if True, always save a checkpoint after each eval
+init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
 # wandb logging
 wandb_log = False # disabled by default
-wandb_entity = 'karpathy'
 wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
@@ -41,24 +40,24 @@ dataset = 'openwebtext'
 batch_size = 12
 block_size = 1024
 # model
-device = 'cuda:0'
-init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
-dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
 n_layer = 12
 n_head = 12
 n_embd = 768
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
-max_iters = 400000 # total number of training iterations
+max_iters = 600000 # total number of training iterations
 weight_decay = 1e-2
 betas = (0.9, 0.95)
 # learning rate decay settings
 decay_lr = True # whether to decay the learning rate
 warmup_iters = 2000 # how many steps to warm up for
-lr_decay_iters = 400000 # should be ~= max_iters per Chinchilla
+lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
 min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
 # DDP settings
 backend = 'nccl' # 'nccl', 'gloo', etc.
+# system
+device = 'cuda'
 compile = True # use PyTorch 2.0 to compile the model to be faster
 # -----------------------------------------------------------------------------
 exec(open('configurator.py').read()) # overrides from command line or config file
@@ -181,7 +180,7 @@ def get_lr(iter):

 # logging
 if wandb_log and gpu_id == 0:
-    wandb.init(project=wandb_project, entity=wandb_entity, name=wandb_run_name)
+    wandb.init(project=wandb_project, name=wandb_run_name)
    wandb.config = {
        "batch_size": batch_size,
        "block_size": block_size,