mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-11-10 20:09:58 +00:00
Merge pull request #220 from python273/patch-1
Fix GPT.crop_block_size when flash attention is available
This commit is contained in:
commit
ea24604b29
3
model.py
3
model.py
@@ -207,7 +207,8 @@ class GPT(nn.Module):
|
|||||||
self.config.block_size = block_size
|
self.config.block_size = block_size
|
||||||
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
|
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
|
||||||
for block in self.transformer.h:
|
for block in self.transformer.h:
|
||||||
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
|
if hasattr(block.attn, 'bias'):
|
||||||
|
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, model_type, override_args=None):
|
def from_pretrained(cls, model_type, override_args=None):
|
||||||
|
Loading…
Reference in New Issue
Block a user