diff --git a/sample.py b/sample.py index a6cbbd9..5ccbbcf 100644 --- a/sample.py +++ b/sample.py @@ -8,7 +8,7 @@ from model import GPTConfig, GPT # ----------------------------------------------------------------------------- out_dir = 'out' -device = 'cuda:2' +device = 'cuda' compile = False start = "\n" # or "<|endoftext|>" or whatever you like num_samples = 10 # number of samples to draw diff --git a/train.py b/train.py index 2fbb231..ed346db 100644 --- a/train.py +++ b/train.py @@ -10,7 +10,6 @@ $ torchrun --standalone --nproc_per_node=4 train.py """ import os -import sys import time import math @@ -31,9 +30,9 @@ log_interval = 1 eval_iters = 200 eval_only = False # if True, script exits right after the first eval always_save_checkpoint = True # if True, always save a checkpoint after each eval +init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' # wandb logging wandb_log = False # disabled by default -wandb_entity = 'karpathy' wandb_project = 'owt' wandb_run_name = 'gpt2' # 'run' + str(time.time()) # data @@ -41,24 +40,24 @@ dataset = 'openwebtext' batch_size = 12 block_size = 1024 # model -device = 'cuda:0' -init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' -dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ n_layer = 12 n_head = 12 n_embd = 768 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ # adamw optimizer learning_rate = 6e-4 # max learning rate -max_iters = 400000 # total number of training iterations +max_iters = 600000 # total number of training iterations weight_decay = 1e-2 betas = (0.9, 0.95) # learning rate decay settings decay_lr = True # whether to decay the learning rate warmup_iters = 2000 # how many steps to warm up for -lr_decay_iters = 400000 # should be ~= max_iters per Chinchilla +lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla # DDP settings backend = 'nccl' # 'nccl', 'gloo', etc. +# system +device = 'cuda' compile = True # use PyTorch 2.0 to compile the model to be faster # ----------------------------------------------------------------------------- exec(open('configurator.py').read()) # overrides from command line or config file @@ -181,7 +180,7 @@ def get_lr(iter): # logging if wandb_log and gpu_id == 0: - wandb.init(project=wandb_project, entity=wandb_entity, name=wandb_run_name) + wandb.init(project=wandb_project, name=wandb_run_name) wandb.config = { "batch_size": batch_size, "block_size": block_size,