# nanogpt-experiments/image_model_test.py
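"""Probe how strongly a single low-gradient pixel influences a SigLIP embedding.

Backprops the summed absolute image features to find the pixel the embedding is
least sensitive to, then sweeps that pixel's value and measures feature drift.
"""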
import torch
from PIL import Image
import open_clip
import numpy as np
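# Load SigLIP ViT-SO400M-14 (384px) via open_clip in fp16 on the GPU;
# "webli" selects the WebLI-pretrained weights.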
model_name = "ViT-SO400M-14-SigLIP-384"
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained="webli", precision="fp16", device="cuda")
model.eval()
tokenizer = open_clip.get_tokenizer(model_name)
print(model)
print("preprocess")
image = preprocess(Image.open("siglip.jpg")).unsqueeze(0).half().cuda()
image.requires_grad = True
print("fwd")
features = model.encode_image(image)
print("bwd")
s = features.abs().sum()
s.backward()
# Due to some nonsense, the model actually cuts off exactly six pixels from the right and bottom of the image.
# (6 = 384 - (14*27))
# Those can be varied arbitrarily without affecting the output, but that isn't interesting.
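# Sanity check of that arithmetic: 384 // 14 = 27 patches per side, so
# 384 - 14 * 27 = 6 pixels are unused.
assert 384 - (384 // 14) * 14 == 6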
# Layout is B C H W; the crop below is symmetric, so the dim order doesn't matter here.
real_grad = image.grad[:, :, :378, :378].abs()
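# Chain torch.min over dims 3, 2, 1: each reduction returns (values, indices),
# so unwrapping the indices in reverse recovers the full coordinate of the
# pixel with the smallest gradient magnitude.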
x = torch.min(real_grad, dim=3)
print(x)
y = torch.min(x.values, dim=2)
print(y)
z = torch.min(y.values, dim=1)
print(z)
l_chan = z.indices[0]
l_x = y.indices[0][l_chan]
l_y = x.indices[0][l_chan][l_x]
least_affecting_index = 0, l_chan, l_x, l_y
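# Stop tracking gradients so the chosen pixel can be overwritten in place below.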
image.requires_grad = False
print(real_grad[least_affecting_index], image[least_affecting_index])
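# Running averages of the embedding drift over the sweep below.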
avgmean = 0
avgmax = 0
n = 500
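# Sweep the chosen pixel across [-1, 1] (roughly the normalized pixel range
# under SigLIP's 0.5-mean/0.5-std preprocessing) and measure how far the
# embedding moves from the original features.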
with torch.no_grad():
    # Every np.linspace(-1, 1, n) value already lies in [-1, 1], so no range guard is needed.
    for some_float in np.linspace(-1, 1, n):
        image[least_affecting_index] = float(some_float)
        altered_features = model.encode_image(image)
        mean_diff = (features - altered_features).abs().mean().item()
        max_diff = (features - altered_features).abs().max().item()
        print(f"{some_float:0.3f}: {mean_diff:.3f}, {max_diff:.3f}")
        avgmean += mean_diff / n
        avgmax += max_diff / n
print(f"avg mean diff: {avgmean}, avg max diff: {avgmax}")