# nanogpt-experiments/image_model_test.py

import torch
from PIL import Image
import open_clip
import numpy as np

# open_clip architecture name; the matching weights are selected by the
# "webli" pretrained tag below.
model_name = "ViT-SO400M-14-SigLIP-384"

# Build the model in half precision directly on the GPU. Only the first and
# third returned values are used; the middle one is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name,
    pretrained="webli",
    precision="fp16",
    device="cuda",
)
# Switch to inference mode (affects layers such as dropout / batch norm).
model.eval()
# Tokenizer matching the checkpoint; not used by the image-only code below.
tokenizer = open_clip.get_tokenizer(model_name)

# Dump the module tree for inspection.
print(model)

print("preprocess")
# Load the test image, run the checkpoint's preprocessing, add a batch
# dimension, then cast to fp16 and move to the GPU to match the model.
image = preprocess(Image.open("siglip.jpg")).unsqueeze(0).half().cuda()
# Track gradients on the input itself so backward() fills image.grad with
# d(scalar)/d(pixel).
image.requires_grad = True

print("fwd")
features = model.encode_image(image)
print("bwd")
# Collapse the embedding to one scalar (sum of absolute values) so it can be
# backpropagated to the input pixels.
s = features.abs().sum()
# Tensor.backward() returns None and only populates .grad, so there is
# nothing useful to print here.
s.backward()
# The model ignores exactly six pixels at the right and bottom edges of the
# image (6 = 384 - (14*27)): only 27 patches of 14 pixels are covered.
# Those pixels can be varied arbitrarily without affecting the output, but
# that isn't interesting, so they are excluded from the search.
# Layout note: torchvision-style transforms normally yield C x H x W, so this
# tensor is most likely B C H W rather than B C W H -- TODO confirm. The crop
# is square, so the slice is the same under either reading.
real_grad = image.grad[:, :, :378, :378].abs()

# Find the position of the smallest absolute gradient by reducing one axis at
# a time (last spatial axis, then the other spatial axis, then channels) and
# keeping the argmin indices from every stage.
min_over_dim3 = real_grad.min(dim=3)
print(min_over_dim3)
min_over_dim2 = min_over_dim3.values.min(dim=2)
print(min_over_dim2)
min_over_chan = min_over_dim2.values.min(dim=1)
print(min_over_chan)

# Walk the saved indices back down for batch element 0: first the winning
# channel, then the dim-2 index within that channel, then the dim-3 index
# within that row.
chan_at_min = min_over_chan.indices[0]
dim2_at_min = min_over_dim2.indices[0, chan_at_min]
dim3_at_min = min_over_dim3.indices[0, chan_at_min, dim2_at_min]

# Full 4-D index (batch, channel, dim2, dim3) of the least influential pixel.
least_affecting_index = (0, chan_at_min, dim2_at_min, dim3_at_min)

# Gradients are no longer needed; the image is edited in place from here on.
image.requires_grad_(False)

# Show the gradient magnitude at that pixel together with its current value.
print(real_grad[least_affecting_index], image[least_affecting_index])

# Sweep the least-influential pixel across [-1, 1] and measure how far the
# image embedding moves from the unperturbed `features`.
avgmean = 0.0
avgmax = 0.0
n = 500
with torch.no_grad():
    # np.linspace(-1, 1, n) includes both endpoints and never leaves
    # [-1, 1], so no extra range check is needed.
    # NOTE(review): [-1, 1] is presumably the range of normalized pixel
    # values produced by `preprocess` -- confirm.
    for some_float in np.linspace(-1, 1, n):
        image[least_affecting_index] = float(some_float)
        altered_features = model.encode_image(image)
        # Use the absolute difference for BOTH statistics; taking max() of
        # the signed difference would ignore large negative deviations.
        abs_diff = (features - altered_features).abs()
        mean_diff = abs_diff.mean().item()
        max_diff = abs_diff.max().item()
        # `:3f` is a minimum width of 3 with the default six decimals.
        print(f"{some_float:0.3f}: {mean_diff:3f}, {max_diff:3f}")
        # Accumulate running averages over the n sweep points.
        avgmean += mean_diff / n
        avgmax += max_diff / n

print(f"avg mean diff: {avgmean}, avg max diff: {avgmax}")
fix things 2024-07-23 09:56:47 +00:00			`import torch`
			`from PIL import Image`
			`import open_clip`
			`import numpy as np`

			`model_name = "ViT-SO400M-14-SigLIP-384"`
			`model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained="webli", precision="fp16", device="cuda")`
			`model.eval()`
			`tokenizer = open_clip.get_tokenizer(model_name)`

			`print(model)`

			`print("preprocess")`
			`image = preprocess(Image.open("siglip.jpg")).unsqueeze(0).half().cuda()`
			`image.requires_grad = True`

			`print("fwd")`
			`features = model.encode_image(image)`
			`print("bwd")`
			`s = features.abs().sum()`
			`print(s.backward())`
			`# Due to some nonsense, the model actually cuts off exactly six pixels from the right and bottom of the image.`
			`# (6 = 384 - (14*27))`
			`# Those can be varied arbitrarily without affecting the output, but that isn't interesting.`
			`# B C W H, probably`
			`real_grad = image.grad[:, :, :378, :378].abs()`

			`x = torch.min(real_grad, dim=3)`
			`print(x)`
			`y = torch.min(x.values, dim=2)`
			`print(y)`
			`z = torch.min(y.values, dim=1)`
			`print(z)`

			`l_chan = z.indices[0]`
			`l_x = y.indices[0][l_chan]`
			`l_y = x.indices[0][l_chan][l_x]`

			`least_affecting_index = 0, l_chan, l_x, l_y`

			`image.requires_grad = False`

			`print(real_grad[least_affecting_index], image[least_affecting_index])`

			`avgmean = 0`
			`avgmax = 0`
			`n = 500`
			`with torch.no_grad():`
			`for some_float in np.linspace(-1, 1, n):`
			`if -1 <= some_float <= 1:`
			`image[least_affecting_index] = float(some_float)`
			`altered_features = model.encode_image(image)`
			`mean_diff = (features - altered_features).abs().mean().item()`
			`max_diff = (features - altered_features).max().item()`
			`print(f"{some_float:0.3f}: {mean_diff:3f}, {max_diff:3f}")`
			`avgmean += mean_diff / n`
			`avgmax += max_diff / n`

			`print(f"avg mean diff: {avgmean}, avg max diff: {avgmax}")`