
add support for character-level language models: a new character-level shakespeare dataset, a new config file that shows how to train a character-level baby GPT on it, and an adjusted sample function that figures out whether to decode with characters or GPT-2 BPE tokens. The current implementation is a bit hacky and basically assumes just these two possibilities. In the future we may want to support more general encoders or decoders.
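The decode selection described above can be sketched roughly as follows. This is a minimal illustration rather than the exact source: it assumes the character-level prepare step saves a meta.pkl with stoi/itos mappings under the dataset directory, that the training checkpoint dict stores its config (including the dataset name), and the checkpoint path here is only illustrative.

import os
import pickle
import torch
import tiktoken

# illustrative checkpoint path; the real script derives it from out_dir
checkpoint = torch.load('out-shakespeare-char/ckpt.pt', map_location='cpu')

# if the checkpoint's dataset directory carries a meta.pkl, treat the model
# as character-level and decode with the saved stoi/itos tables
load_meta = False
if 'config' in checkpoint and 'dataset' in checkpoint['config']:
    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
    load_meta = os.path.exists(meta_path)

if load_meta:
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join(itos[i] for i in l)
else:
    # fall back to GPT-2 BPE, the only other case assumed for now
    enc = tiktoken.get_encoding("gpt2")
    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
    decode = lambda l: enc.decode(l)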

Andrej Karpathy
2023-01-11 05:27:19 +00:00
parent c2a402f7f7
commit d17350a31d
5 changed files with 137 additions and 4 deletions
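For context, the character-level dataset amounts to mapping each unique character in the text to an integer and saving those mappings so the sampler can find them later. A condensed sketch, assuming input.txt holds the raw shakespeare text and following the repo's train.bin/val.bin/meta.pkl convention (the 90/10 split is the usual default here, not a quote of the actual script):

import pickle
import numpy as np

with open('input.txt', 'r') as f:
    data = f.read()

chars = sorted(set(data))                     # the vocabulary: all unique characters
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer
itos = {i: ch for i, ch in enumerate(chars)}  # integer -> character

# encode the whole text and write a 90/10 train/val split
n = len(data)
train_ids = np.array([stoi[c] for c in data[:int(n * 0.9)]], dtype=np.uint16)
val_ids = np.array([stoi[c] for c in data[int(n * 0.9):]], dtype=np.uint16)
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

# meta.pkl is what the sample script later checks for to pick character-level decoding
with open('meta.pkl', 'wb') as f:
    pickle.dump({'vocab_size': len(chars), 'stoi': stoi, 'itos': itos}, f)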

config/train_shakespeare_char.py

@@ -0,0 +1,36 @@
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such
out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often
# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False
wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'
dataset = 'shakespeare_char'
batch_size = 64
block_size = 128 # context of up to 128 previous characters
# baby GPT model :)
n_layer = 4
n_head = 4
n_embd = 128
dropout = 0.0
learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
warmup_iters = 100 # not super necessary potentially
# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
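Assuming the repo's usual entry points, preparing the data and training with this config would look something like:

python data/shakespeare_char/prepare.py    # writes train.bin, val.bin, meta.pkl
python train.py config/train_shakespeare_char.py
# on a macbook, apply the cpu overrides from the comments above:
python train.py config/train_shakespeare_char.py --device=cpu --compile=False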