mirror of https://github.com/osmarks/meme-search-engine.git
synced 2025-02-21 21:40:08 +00:00
75 lines · 2.1 KiB · Python
import torch.nn
|
|
import torch.nn.functional as F
|
|
import torch
|
|
import sqlite3
|
|
import random
|
|
import numpy
|
|
import json
|
|
import time
|
|
from tqdm import tqdm
|
|
import sys
|
|
from collections import defaultdict
|
|
import msgpack
|
|
|
|
from model import Config, BradleyTerry
|
|
import shared
|
|
|
|
def fetch_files_with_timestamps():
    """Load every indexed file from the shared database.

    Returns a list of (filename, embedding, timestamp) tuples; the embedding
    blob is decoded into a writable float16 numpy array. Rows with a NULL
    embedding are excluded by the query itself.
    """
    cursor = shared.db.execute(
        "SELECT filename, embedding, timestamp FROM files WHERE embedding IS NOT NULL"
    )
    rows = cursor.fetchall()
    cursor.close()
    # .copy() detaches the array from the read-only sqlite blob buffer.
    return [
        (filename, numpy.frombuffer(blob, dtype="float16").copy(), timestamp)
        for filename, blob, timestamp in rows
    ]
|
|
|
|
# Number of files scored per forward pass.
batch_size = 2048
# NOTE(review): hard-coded CUDA device — assumes a GPU is available.
device = "cuda"

# Model hyperparameters; must match the checkpoint being loaded.
config = Config(
    d_emb=1152,          # embedding dimensionality of the stored vectors
    n_hidden=1,
    n_ensemble=16,       # number of ensemble members averaged at inference
    device=device,
    dtype=torch.float32,
    output_channels=3,   # score channels; one extra timestamp channel is added later
    dropout=0.1
)
model = BradleyTerry(config)
model.eval()
# Checkpoint path is resolved from the step number given on the command line.
modelc, _ = shared.checkpoint_for(int(sys.argv[1]))
model.load_state_dict(torch.load(modelc))
params = sum(p.numel() for p in model.parameters())
print(f"{params/1e6:.1f}M parameters")
print(model)

# Zero the output biases of every ensemble member.
for x in model.ensemble.models:
    x.output.bias.data.fill_(0) # hack to match behaviour of cut-down implementation
|
|
|
|
# results[channel] collects every file's score for that channel, in file order.
results = defaultdict(list)
# (model.eval() was already applied right after construction; the redundant
# second call that used to live here has been dropped.)

files = fetch_files_with_timestamps()

# Score every file with the full ensemble, batch by batch.
with torch.inference_mode():
    for bstart in tqdm(range(0, len(files), batch_size)):
        batch = files[bstart:bstart + batch_size]
        timestamps = [ t for _, _, t in batch ]
        # Stack the embeddings once in numpy and convert with a single
        # torch.from_numpy call instead of one legacy torch.Tensor(...)
        # construction per row — same values, far fewer copies.
        embs = torch.from_numpy(numpy.stack([ e for _, e, _ in batch ])).to(config.dtype)
        # Broadcast the batch across the ensemble dimension:
        # (n_ensemble, len(batch), d_emb).
        inputs = embs.unsqueeze(0).expand((config.n_ensemble, len(batch), config.d_emb)).to(device)
        # Average the ensemble members; scores is (len(batch), output_channels).
        scores = model.ensemble(inputs).mean(dim=0).cpu().numpy()
        for row in scores:
            for channel, score in enumerate(row):
                results[channel].append(score)
        # add an extra timestamp channel
        results[config.output_channels].extend(timestamps)
|
|
|
|
cdfs = []
# Scores are later encoded in one byte; 255/0xFF is reserved to mean
# "greater than the maximum bucket", leaving 255 usable quantile buckets.
cdf_bins = 255
for channel, samples in results.items():
    # Empirical CDF: the score value at each of cdf_bins evenly spaced quantiles.
    quantile_points = numpy.linspace(0, 1, cdf_bins)
    channel_cdf = numpy.quantile(numpy.array(samples), quantile_points)
    print(channel_cdf)
    cdfs.append(channel_cdf.tolist())

# Serialize all per-channel CDFs for the serving side to consume.
with open("cdfs.msgpack", "wb") as f:
    msgpack.pack(cdfs, f)
|