Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-12-18 14:10:28 +00:00)
reverse the order, making sure that the final layer init is preserved, and becomes the token embedding instead of the other way around. otherwise the loss can be all messed up from a bad init
parent 7c8288552b
commit 43b37fd568

model.py: 2 changed lines (1 addition, 1 deletion)
@@ -115,7 +115,7 @@ class GPT(nn.Module):
             ln_f = nn.LayerNorm(config.n_embd),
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.lm_head.weight = self.transformer.wte.weight # https://paperswithcode.com/method/weight-tying
+        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
 
         # report number of parameters
         n_params = sum(p.numel() for p in self.parameters())
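For context, a minimal standalone sketch of the PyTorch semantics behind this change (not code from the repository; the toy sizes and bare modules below are hypothetical). Assigning one module's .weight to another makes both modules share a single Parameter, and the Parameter on the right-hand side of the assignment is the one whose initialization survives. Hence the new order, self.transformer.wte.weight = self.lm_head.weight, preserves the final layer's init and reuses it as the token embedding, whereas the old order silently discarded it.

import torch
import torch.nn as nn

torch.manual_seed(0)

# Hypothetical toy sizes, just to illustrate the tying semantics.
n_embd, vocab_size = 8, 16

wte = nn.Embedding(vocab_size, n_embd)               # token embedding, weight shape (vocab_size, n_embd)
lm_head = nn.Linear(n_embd, vocab_size, bias=False)  # final layer, weight shape (vocab_size, n_embd)

original_lm_head_init = lm_head.weight.detach().clone()

# New order from this commit: point the embedding at the linear layer's Parameter.
wte.weight = lm_head.weight

assert wte.weight is lm_head.weight                    # one shared Parameter
assert torch.equal(wte.weight, original_lm_head_init)  # lm_head's init is the one preserved

# The old order, lm_head.weight = wte.weight, would instead keep the embedding's
# initialization and discard the final layer's, which is what the commit message
# describes as producing a bad initial loss.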