# Mirror of https://github.com/osmarks/nanogpt-experiments.git
# Synced 2024-11-10 20:09:58 +00:00 (13 lines, 334 B, Python)
import numpy as np
|
|
import os
|
|
from collections import Counter
|
|
# Report which token ids in the vocabulary never occur in the tokenized
# training data, and show the 100 most frequent ids with their counts.

data_dir = "."
# train.bin is read as a flat array of uint16 token ids; memmap keeps the file
# on disk instead of loading it into RAM.
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

# Build the histogram with vectorized NumPy rather than iterating the memmap
# element by element in Python (set(data) / Counter(data) box every token as a
# NumPy scalar, which is impractically slow on a large corpus).
# The file is walked in fixed-size chunks because np.bincount converts its
# input to a wider integer type, which would otherwise copy the entire dataset.
_id_space = int(np.iinfo(np.uint16).max) + 1  # every value a uint16 can hold
_chunk_len = 1 << 24  # tokens per chunk; bounds the temporary copy size
_histogram = np.zeros(_id_space, dtype=np.int64)
for _start in range(0, len(data), _chunk_len):
    _histogram += np.bincount(data[_start:_start + _chunk_len], minlength=_id_space)

# Ids that occur at least once, as plain Python ints so the printed output does
# not depend on the NumPy scalar repr.
_seen_ids = np.flatnonzero(_histogram)
datas = set(_seen_ids.tolist())
# NOTE: entries are inserted in ascending id order, so equal counts in
# most_common() are ordered by id rather than by first occurrence in the file.
counts = Counter(dict(zip(_seen_ids.tolist(), _histogram[_seen_ids].tolist())))

# 50257 is presumably the GPT-2 BPE vocabulary size -- confirm against the
# tokenizer that produced train.bin.
vocab = set(range(50257))
unused = sorted(vocab - datas)

print(len(unused))
print(unused)
print(counts.most_common(100))