mirror of
https://github.com/osmarks/meme-search-engine.git
synced 2025-01-21 22:46:59 +00:00
thumbnailer system
This commit is contained in:
parent
5b5ef271aa
commit
74bb1bc343
@ -54,13 +54,16 @@ def do_inference(params: InferenceParameters):
|
||||
items_ctr.labels(MODELNAME, "text").inc(text.shape[0])
|
||||
with inference_time_hist.labels(MODELNAME + "-text", text.shape[0]).time():
|
||||
features = model.text_model.forward(input_ids=torch.tensor(text, device=DEVICE)).pooler_output
|
||||
features /= features.norm(dim=-1, keepdim=True)
|
||||
features = features.cpu().numpy()
|
||||
elif images is not None:
|
||||
items_ctr.labels(MODELNAME, "image").inc(images.shape[0])
|
||||
with inference_time_hist.labels(MODELNAME + "-image", images.shape[0]).time():
|
||||
features = model.vision_model.forward(torch.tensor(images, device=DEVICE)).pooler_output
|
||||
features /= features.norm(dim=-1, keepdim=True)
|
||||
features /= features.norm(dim=-1, keepdim=True)
|
||||
features = features.cpu().numpy()
|
||||
batch_count_ctr.labels(MODELNAME).inc()
|
||||
callback(True, features.cpu().numpy())
|
||||
callback(True, features)
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
callback(False, str(e))
|
||||
@ -77,8 +80,7 @@ def preprocessing_thread():
|
||||
try:
|
||||
if text:
|
||||
assert len(text) <= BS, f"max batch size is {BS}"
|
||||
# I feel like this ought to be batchable but I can't see how to do that
|
||||
text = numpy.array(tokenizer(text, padding="max_length", truncation=True)["input_ids"])
|
||||
text = numpy.array(tokenizer([ t.lower() for t in text ], padding="max_length", truncation=True)["input_ids"])
|
||||
elif images:
|
||||
assert len(images) <= BS, f"max batch size is {BS}"
|
||||
images = numpy.array(image_processor([ Image.open(io.BytesIO(bs)) for bs in images ])["pixel_values"]).astype("float16")
|
||||
|
@ -112,7 +112,17 @@
|
||||
<Masonry bind:refreshLayout={refreshLayout} colWidth="minmax(Min(20em, 100%), 1fr)" items={displayedResults}>
|
||||
{#each displayedResults as result}
|
||||
{#key result.file}
|
||||
<div class="result"><a href={util.getURL(result.file)}><img src={util.getURL(result.file)} on:load={updateCounter} on:error={updateCounter} alt={result.caption || result.file}></a></div>
|
||||
<div class="result">
|
||||
<a href={util.getURL(result.file)}>
|
||||
<picture>
|
||||
{#if util.hasThumbnails(result.file)}
|
||||
<source srcset={util.thumbnailPath(result.file, "avif-lq") + ", " + util.thumbnailPath(result.file, "avif-hq") + " 2x"} type="image/avif" />
|
||||
<source srcset={util.thumbnailPath(result.file, "jpeg-800") + " 800w, " + util.thumbnailPath(result.file, "jpeg-fullscale")} type="image/jpeg" />
|
||||
{/if}
|
||||
<img src={util.getURL(result.file)} on:load={updateCounter} on:error={updateCounter} alt={result.caption || result.file}>
|
||||
</picture>
|
||||
</a>
|
||||
</div>
|
||||
{/key}
|
||||
{/each}
|
||||
</Masonry>
|
||||
|
@ -1,4 +1,5 @@
|
||||
import * as config from "../../frontend_config.json"
|
||||
import * as formats from "../../formats.json"
|
||||
|
||||
export const getURL = x => config.image_path + x
|
||||
|
||||
@ -8,4 +9,17 @@ export const doQuery = args => fetch(config.backend_url, {
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
body: JSON.stringify(args)
|
||||
}).then(x => x.json())
|
||||
}).then(x => x.json())
|
||||
|
||||
const filesafeCharset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-"
|
||||
/**
 * Build the URL of the thumbnail of `originalPath` in the given `format`.
 *
 * `format` must be a key of `formats.formats`; the first element of its entry
 * is the file extension appended after the format name.
 */
export const thumbnailPath = (originalPath, format) => {
    const [extension] = formats.formats[format]
    // Walk the path by code point (for...of, like Array.from, never splits a
    // surrogate pair) so that every astral character turns into exactly one
    // underscore, matching how Python iterates over strings.
    let safeName = ""
    for (const character of originalPath) {
        safeName += filesafeCharset.includes(character) ? character : "_"
    }
    return config.thumb_path + `${safeName}.${format}${extension}`
}
|
||||
|
||||
const thumbedExtensions = formats.extensions
|
||||
/**
 * Report whether thumbnails are expected to exist for the file path `t`.
 *
 * The decision is made on the text after the last "." in `t`, compared
 * case-insensitively against `thumbedExtensions` (the thumbnailer lowercases
 * extensions before matching, so "FOO.PNG" is thumbnailed like "foo.png").
 */
export const hasThumbnails = t => {
    const dotIndex = t.lastIndexOf(".")
    // A name without any dot has no extension; do not treat e.g. "png" as ".png".
    if (dotIndex === -1) return false
    return thumbedExtensions.includes(t.slice(dotIndex).toLowerCase())
}
|
1
formats.json
Normal file
1
formats.json
Normal file
@ -0,0 +1 @@
|
||||
{"formats": {"avif-lq": [".avif", "image/avif"], "avif-hq": [".avif", "image/avif"], "jpeg-800": [".jpeg", "image/jpeg"], "jpeg-fullscale": [".jpeg", "image/jpeg"], "jpeg-256k": [".jpeg", "image/jpeg"]}, "extensions": [".jpeg", ".webp", ".png", ".jpg"]}
|
@ -1,4 +1,5 @@
|
||||
{
|
||||
"backend_url": "https://mse.osmarks.net/backend",
|
||||
"image_path": "https://i2.osmarks.net/memes-or-something/"
|
||||
"image_path": "https://i2.osmarks.net/memes-or-something/",
|
||||
"thumb_path": "https://i2.osmarks.net/thumbs/memes-or-something_"
|
||||
}
|
1
mse.py
1
mse.py
@ -177,6 +177,7 @@ class Index:
|
||||
if filename not in filenames_set:
|
||||
new_data.append(numpy.frombuffer(embedding_vector, dtype="float16"))
|
||||
new_filenames.append(filename)
|
||||
if not new_data: return
|
||||
new_data = numpy.array(new_data)
|
||||
self.associated_filenames.extend(new_filenames)
|
||||
self.faiss_index.add(new_data)
|
||||
|
132
thumbnailer.py
Normal file
132
thumbnailer.py
Normal file
@ -0,0 +1,132 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
import string
|
||||
import subprocess
|
||||
from PIL import Image
|
||||
import tempfile
|
||||
import shutil
|
||||
import math
|
||||
import sys
|
||||
|
||||
with open(sys.argv[1], "r") as config_file:
|
||||
CONFIG = json.load(config_file)
|
||||
|
||||
filesafe_charset = string.ascii_letters + string.digits + "-"
|
||||
def avif_format(quality):
    """Return a converter ``fn(inpath, outpath)`` that encodes an image to AVIF.

    quality: value passed to ``avifenc -q``.

    Inputs whose extension is .jpg, .jpeg, .png or .avif are given to
    ``avifenc`` directly; anything else is first converted to a temporary PNG
    with ImageMagick ``convert``.

    The converter raises subprocess.CalledProcessError if either external
    command exits with a non-zero status.
    """
    avif_speed = "4"
    direct_extensions = {".jpg", ".png", ".jpeg", ".avif"}

    def encode(source, outpath):
        # Single place for the avifenc command line used by both branches.
        subprocess.run(
            ["avifenc", "-s", avif_speed, "-j", "all", "-q", str(quality), source, outpath],
            capture_output=True,
        ).check_returncode()

    def fn(inpath, outpath):
        if os.path.splitext(inpath)[-1].lower() in direct_extensions:
            encode(inpath, outpath)
            return
        with tempfile.NamedTemporaryFile() as tf:
            # The "png:" prefix selects the output format explicitly.
            # Check the exit status so a failed conversion stops here instead
            # of handing avifenc an empty temporary file.
            subprocess.run(["convert", inpath, "png:" + tf.name]).check_returncode()
            encode(tf.name, outpath)

    return fn
|
||||
|
||||
def jpeg_format(quality=None, maxwidth=None, maxheight=None, target_size=None):
    """Return a converter ``fn(inpath, outpath)`` that writes a JPEG via ImageMagick ``convert``.

    quality: JPEG quality used when ``target_size`` is None. If it is None as
        well, no ``-quality`` option is passed to ``convert``.
    maxwidth / maxheight: optional bounds in pixels. Larger images are scaled
        down keeping their aspect ratio; the computed size never exceeds the
        original.
    target_size: optional size in bytes. When given, ``quality`` is ignored
        and the highest quality in 1..100 whose output is smaller than
        ``target_size`` is used. If even quality 1 is not small enough, the
        quality-1 result is written.

    The converter raises subprocess.CalledProcessError if ``convert`` fails.
    """
    def do_convert(size, quality, source, dest):
        command = ["convert", source, "-resize", "x".join(map(str, size))]
        if quality is not None:
            command += ["-quality", str(quality)]
        command.append(dest)
        subprocess.run(command).check_returncode()

    def scaled_size(width, height):
        # Apply the width bound first, then the height bound to the result.
        if maxwidth and width > maxwidth:
            height = max(1, math.floor(height / (width / maxwidth)))
            width = maxwidth
        if maxheight and height > maxheight:
            width = max(1, math.floor(width / (height / maxheight)))
            height = maxheight
        return width, height

    def fn(inpath, outpath):
        # Only the dimensions are needed; close the file handle straight away.
        with Image.open(inpath) as im:
            width, height = im.size
        size = scaled_size(width, height)

        if target_size is None:
            do_convert(size, quality, inpath, outpath)
            return

        with tempfile.TemporaryDirectory() as workdir:
            # The .jpeg suffix matters: without an extension or an explicit
            # format prefix ImageMagick does not know the output should be JPEG.
            candidate = os.path.join(workdir, "candidate.jpeg")
            best = os.path.join(workdir, "best.jpeg")
            found = False
            q_min, q_max = 1, 100
            while q_min <= q_max:
                test_quality = (q_min + q_max) // 2
                do_convert(size, test_quality, inpath, candidate)
                if os.stat(candidate).st_size >= target_size:
                    # too big
                    q_max = test_quality - 1
                else:
                    # Fits: keep it and try for a higher quality.
                    os.replace(candidate, best)
                    found = True
                    q_min = test_quality + 1
            # If nothing fitted, every probe was too big and the search ended
            # on quality 1, so ``candidate`` holds the smallest attempt.
            shutil.copy(best if found else candidate, outpath)

    return fn
|
||||
|
||||
input_path = CONFIG["input"]
|
||||
output_path = CONFIG["output"]
|
||||
exts = {".webp", ".png", ".jpg", ".jpeg"}
|
||||
output_formats = {
|
||||
"avif-lq": (avif_format(quality=30), ".avif", "image/avif"),
|
||||
"avif-hq": (avif_format(quality=80), ".avif", "image/avif"),
|
||||
"jpeg-800": (jpeg_format(maxwidth=800, quality=80), ".jpeg", "image/jpeg"),
|
||||
"jpeg-fullscale": (jpeg_format(quality=80), ".jpeg", "image/jpeg"),
|
||||
"jpeg-256k": (jpeg_format(target_size=256_000, maxwidth=600, maxheight=600), ".jpeg", "image/jpeg")
|
||||
}
|
||||
|
||||
# Write the format table (extension and MIME type per format name) and the set
# of thumbnailed input extensions to formats.json in the working directory.
with open("formats.json", "w") as f:
    json.dump({
        "formats": { k: v[1:] for k, v in output_formats.items() },
        # Sorted because iteration order of a set of strings can differ between
        # interpreter runs, which would make the generated file change for no reason.
        "extensions": sorted(exts)
    }, f)
|
||||
|
||||
if "gen-formats" in sys.argv: raise SystemExit
|
||||
|
||||
con = sqlite3.connect(CONFIG["database"])
|
||||
con.executescript("""
|
||||
CREATE TABLE IF NOT EXISTS thumb (
|
||||
file TEXT PRIMARY KEY,
|
||||
mtime REAL NOT NULL,
|
||||
formats BLOB NOT NULL
|
||||
);
|
||||
""")
|
||||
con.row_factory = sqlite3.Row
|
||||
|
||||
out_formats_set = set(output_formats)
|
||||
def generate_output_format_string(formats):
    """Serialise an iterable of format names as a JSON array in sorted order."""
    ordered = list(formats)
    ordered.sort()
    return json.dumps(ordered)
|
||||
def to_outpath(input, format):
    """Return the flat thumbnail file name for source path ``input`` in ``format``.

    Every character outside ``filesafe_charset`` is replaced by an underscore,
    then ".<format><extension>" is appended, the extension being the second
    element of ``output_formats[format]``.
    """
    extension = output_formats[format][1]
    safe_chars = []
    for character in input:
        safe_chars.append(character if character in filesafe_charset else "_")
    return "".join(safe_chars) + "." + format + extension
|
||||
full_formats = generate_output_format_string(output_formats.keys())
|
||||
|
||||
# Walk the input tree and generate every thumbnail that is missing or stale,
# recording per source file which formats exist and for which mtime.
output_root = os.path.abspath(output_path)
for directory, subdirectories, files in os.walk(input_path):
    # os.walk already yields paths rooted at input_path; joining input_path on
    # again would double the prefix for relative input paths.
    # Skip the output tree itself. Compare whole path components so that a
    # sibling such as "<output>-old" is not excluded by accident.
    if os.path.commonpath([os.path.abspath(directory), output_root]) == output_root:
        continue
    for file in files:
        ext = os.path.splitext(file)[-1].lower()
        if ext not in exts:
            continue
        path = os.path.join(directory, file)
        rawname = path.removeprefix(input_path).removeprefix("/")
        st = os.stat(path)
        row = con.execute("SELECT mtime, formats FROM thumb WHERE file = ?", (rawname,)).fetchone()
        if not row:
            mtime, formats = None, "[]"
        else:
            mtime, formats = row
        source_changed = st.st_mtime != mtime
        if not source_changed and formats == full_formats:
            continue
        # A changed source invalidates every existing thumbnail, not only the
        # formats that were never generated.
        formats = set() if source_changed else set(json.loads(formats))
        for new_format in sorted(out_formats_set - formats):
            new_path = os.path.join(output_path, to_outpath(rawname, new_format))
            # An earlier run may have left a symlink to the source here; remove
            # it first so the encoder cannot write through it onto the source.
            if os.path.lexists(new_path):
                os.unlink(new_path)
            try:
                output_formats[new_format][0](path, new_path)
            except BaseException:
                # Say which file and format failed, then let the error propagate.
                print("working on", new_format, rawname)
                raise
            nst = os.stat(new_path)
            if nst.st_size > st.st_size: # bigger, so redundant
                os.unlink(new_path)
                os.symlink(os.path.relpath(path, output_path), new_path)
            formats.add(new_format)
            # NOTE(review): progress is committed after each generated format so
            # an interrupted run resumes where it stopped - confirm this matches
            # the intended placement of the INSERT.
            con.execute("INSERT OR REPLACE INTO thumb VALUES (?, ?, ?)", (rawname, st.st_mtime, generate_output_format_string(formats)))
            con.commit()
            sys.stdout.write(".")
            sys.stdout.flush()
|
5
thumbnailer_config.json
Normal file
5
thumbnailer_config.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"database": "/srv/mse/thumb.sqlite3",
|
||||
"input": "/data/public",
|
||||
"output": "/data/public/thumbs"
|
||||
}
|
Loading…
Reference in New Issue
Block a user