mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-11-10 20:09:58 +00:00
np.sum overflows on Windows (where NumPy's default integer has historically been 32-bit); sum the lengths as uint64 instead
This commit is contained in:
parent
7fe4a099ad
commit
6649b299eb
@ -50,7 +50,7 @@ tokenized = split_dataset.map(
|
|||||||
|
|
||||||
# concatenate all the ids in each dataset into one large file we can use for training
|
# concatenate all the ids in each dataset into one large file we can use for training
|
||||||
for split, dset in tokenized.items():
|
for split, dset in tokenized.items():
|
||||||
arr_len = np.sum(dset['len'])
|
arr_len = np.sum(dset['len'], dtype=np.uint64)
|
||||||
filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
|
filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
|
||||||
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
|
dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
|
||||||
arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
|
arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
|
||||||
|
Loading…
Reference in New Issue
Block a user