
Reverse the order of the weight-tying assignment, making sure that the final layer's init is preserved and becomes the token embedding, instead of the other way around. Otherwise the loss can be all messed up from a bad init.

Andrej Karpathy 2023-01-14 02:16:10 +00:00
parent 7c8288552b
commit 43b37fd568


@@ -115,7 +115,7 @@ class GPT(nn.Module):
             ln_f = nn.LayerNorm(config.n_embd),
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.lm_head.weight = self.transformer.wte.weight # https://paperswithcode.com/method/weight-tying
+        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
         # report number of parameters
         n_params = sum(p.numel() for p in self.parameters())
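
For context, here is a minimal sketch (not from the repo) of why the direction of this assignment matters. PyTorch's nn.Embedding defaults to an N(0, 1) init while nn.Linear uses a much smaller uniform init, and whichever tensor sits on the right-hand side of the tying assignment is the one whose values both modules end up sharing. The dimensions are the GPT-2 defaults; the rest is illustrative only.

import torch
import torch.nn as nn

n_embd, vocab_size = 768, 50257                       # GPT-2 defaults
wte = nn.Embedding(vocab_size, n_embd)                # default init ~ N(0, 1), std ~ 1.0
lm_head = nn.Linear(n_embd, vocab_size, bias=False)   # default init is uniform, std ~ 0.02

# Tie the weights as in this commit: the embedding now points at the lm_head
# parameter, so the small Linear init is the one that survives.
wte.weight = lm_head.weight
print(wte.weight.std().item())                        # ~0.02

# With roughly unit-variance hidden states (as after the final LayerNorm), small head
# weights keep the logits small, so the initial loss sits near -log(1/50257) ~ 10.8.
# The old direction, lm_head.weight = wte.weight, would instead hand the head the
# N(0, 1) embedding init, blowing up the logits and the initial loss.
x = torch.randn(8, n_embd)
loss = nn.functional.cross_entropy(lm_head(x), torch.randint(vocab_size, (8,)))
print(loss.item())

Either direction keeps the two matrices tied from then on; the commit only changes which module's initialization is the one that survives at construction time.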