1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-09-21 03:39:44 +00:00
nanogpt-experiments/find_unused_tokens.py

13 lines
334 B
Python
Raw Permalink Normal View History

2024-07-08 18:36:49 +00:00
import numpy as np
import os
2024-07-23 09:56:47 +00:00
from collections import Counter
2024-07-08 18:36:49 +00:00
# Directory that holds the binary file to inspect.
data_dir = "."
# Map the file read-only as a flat uint16 array; pages are only pulled in as
# they are touched, so the whole file never has to fit in RAM.
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
# Distinct values present in the file, as a set of plain Python ints.
# set(data) would walk the array one element at a time in the interpreter
# (boxing every value as a NumPy scalar), which is impractically slow for a
# large file.  Instead, mark presence in a 65536-slot mask (one slot per
# possible uint16 value) using vectorised fancy indexing over bounded slices,
# so temporary memory stays small no matter how big the file is.
_seen = np.zeros(1 << 16, dtype=bool)
_chunk = 1 << 24  # elements per slice (32 MiB of uint16)
for _start in range(0, len(data), _chunk):
    _seen[data[_start:_start + _chunk]] = True
datas = set(np.flatnonzero(_seen).tolist())
2024-07-23 09:56:47 +00:00
# Occurrence count of every value present in `data`, as a Counter keyed by
# plain Python ints.
# Counter(data) would iterate the array element by element in the interpreter
# and produce NumPy-scalar keys; np.bincount does the tally in native code.
# The data is processed in bounded slices because bincount converts its input
# to a wider integer type, which would otherwise need several times the file
# size in memory.  The 65536-slot accumulator relies on `data` being uint16.
# NOTE: ties in most_common() are now ordered by ascending value rather than
# by first appearance in the file.
_totals = np.zeros(1 << 16, dtype=np.int64)
_chunk = 1 << 24  # elements per slice
for _start in range(0, len(data), _chunk):
    _totals += np.bincount(data[_start:_start + _chunk], minlength=1 << 16)
_present = np.flatnonzero(_totals)
counts = Counter(dict(zip(_present.tolist(), _totals[_present].tolist())))
2024-07-08 18:36:49 +00:00
# All candidate ids, 0 through 50256 inclusive (presumably the size of the
# tokenizer vocabulary -- confirm against the tokenizer that wrote the data).
vocab = set(range(50257))
# Ids that never occur in the data, in ascending order.
unused = sorted(vocab.difference(datas))
# Report how many ids are unused.
print(len(unused))
2024-07-23 09:56:47 +00:00
# Print the unused ids themselves, then the 100 most frequent
# (value, count) pairs from most to least common.
print(unused)
print(counts.most_common(n=100))