From 74bb1bc343e49fbab9567c7308f857235ee8653b Mon Sep 17 00:00:00 2001 From: osmarks Date: Fri, 27 Oct 2023 15:50:21 +0100 Subject: [PATCH] thumbnailer system --- clip_server.py | 10 +-- clipfront2/src/App.svelte | 12 +++- clipfront2/src/util.js | 16 ++++- formats.json | 1 + frontend_config.json | 3 +- mse.py | 1 + thumbnailer.py | 132 ++++++++++++++++++++++++++++++++++++++ thumbnailer_config.json | 5 ++ 8 files changed, 173 insertions(+), 7 deletions(-) create mode 100644 formats.json create mode 100644 thumbnailer.py create mode 100644 thumbnailer_config.json diff --git a/clip_server.py b/clip_server.py index 6a2ea7e..34fc7af 100644 --- a/clip_server.py +++ b/clip_server.py @@ -54,13 +54,16 @@ def do_inference(params: InferenceParameters): items_ctr.labels(MODELNAME, "text").inc(text.shape[0]) with inference_time_hist.labels(MODELNAME + "-text", text.shape[0]).time(): features = model.text_model.forward(input_ids=torch.tensor(text, device=DEVICE)).pooler_output + features /= features.norm(dim=-1, keepdim=True) + features = features.cpu().numpy() elif images is not None: items_ctr.labels(MODELNAME, "image").inc(images.shape[0]) with inference_time_hist.labels(MODELNAME + "-image", images.shape[0]).time(): features = model.vision_model.forward(torch.tensor(images, device=DEVICE)).pooler_output - features /= features.norm(dim=-1, keepdim=True) + features /= features.norm(dim=-1, keepdim=True) + features = features.cpu().numpy() batch_count_ctr.labels(MODELNAME).inc() - callback(True, features.cpu().numpy()) + callback(True, features) except Exception as e: traceback.print_exc() callback(False, str(e)) @@ -77,8 +80,7 @@ def preprocessing_thread(): try: if text: assert len(text) <= BS, f"max batch size is {BS}" - # I feel like this ought to be batchable but I can't see how to do that - text = numpy.array(tokenizer(text, padding="max_length", truncation=True)["input_ids"]) + text = numpy.array(tokenizer([ t.lower() for t in text ], padding="max_length", truncation=True)["input_ids"]) elif images: assert len(images) <= BS, f"max batch size is {BS}" images = numpy.array(image_processor([ Image.open(io.BytesIO(bs)) for bs in images ])["pixel_values"]).astype("float16") diff --git a/clipfront2/src/App.svelte b/clipfront2/src/App.svelte index 6782d50..f0e637a 100644 --- a/clipfront2/src/App.svelte +++ b/clipfront2/src/App.svelte @@ -112,7 +112,17 @@ {#each displayedResults as result} {#key result.file} -
{result.caption
+ {/key} {/each}
diff --git a/clipfront2/src/util.js b/clipfront2/src/util.js index 5568cf1..502916c 100644 --- a/clipfront2/src/util.js +++ b/clipfront2/src/util.js @@ -1,4 +1,5 @@ import * as config from "../../frontend_config.json" +import * as formats from "../../formats.json" export const getURL = x => config.image_path + x @@ -8,4 +9,17 @@ export const doQuery = args => fetch(config.backend_url, { "Content-Type": "application/json" }, body: JSON.stringify(args) -}).then(x => x.json()) \ No newline at end of file +}).then(x => x.json()) + +const filesafeCharset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-" +export const thumbnailPath = (originalPath, format) => { + const extension = formats.formats[format][0] + // Python and JS have minor differences in string handling wrt. astral characters which could result in incorrect quantities of dashes. Fortunately, Array.from handles this correctly. + return config.thumb_path + `${Array.from(originalPath).map(x => filesafeCharset.includes(x) ? x : "_").join("")}.${format}${extension}` +} + +const thumbedExtensions = formats.extensions +export const hasThumbnails = t => { + const parts = t.split(".") + return thumbedExtensions.includes("." + parts[parts.length - 1]) +} \ No newline at end of file diff --git a/formats.json b/formats.json new file mode 100644 index 0000000..af7117c --- /dev/null +++ b/formats.json @@ -0,0 +1 @@ +{"formats": {"avif-lq": [".avif", "image/avif"], "avif-hq": [".avif", "image/avif"], "jpeg-800": [".jpeg", "image/jpeg"], "jpeg-fullscale": [".jpeg", "image/jpeg"], "jpeg-256k": [".jpeg", "image/jpeg"]}, "extensions": [".jpeg", ".webp", ".png", ".jpg"]} \ No newline at end of file diff --git a/frontend_config.json b/frontend_config.json index a32f72c..2eb8b12 100644 --- a/frontend_config.json +++ b/frontend_config.json @@ -1,4 +1,5 @@ { "backend_url": "https://mse.osmarks.net/backend", - "image_path": "https://i2.osmarks.net/memes-or-something/" + "image_path": "https://i2.osmarks.net/memes-or-something/", + "thumb_path": "https://i2.osmarks.net/thumbs/memes-or-something_" } \ No newline at end of file diff --git a/mse.py b/mse.py index 4c60cb2..5594d5c 100644 --- a/mse.py +++ b/mse.py @@ -177,6 +177,7 @@ class Index: if filename not in filenames_set: new_data.append(numpy.frombuffer(embedding_vector, dtype="float16")) new_filenames.append(filename) + if not new_data: return new_data = numpy.array(new_data) self.associated_filenames.extend(new_filenames) self.faiss_index.add(new_data) diff --git a/thumbnailer.py b/thumbnailer.py new file mode 100644 index 0000000..7ae366f --- /dev/null +++ b/thumbnailer.py @@ -0,0 +1,132 @@ +import sqlite3 +import os +import hashlib +import json +import string +import subprocess +from PIL import Image +import tempfile +import shutil +import math +import sys + +with open(sys.argv[1], "r") as config_file: + CONFIG = json.load(config_file) + +filesafe_charset = string.ascii_letters + string.digits + "-" +def avif_format(quality): + avif_speed = "4" + def fn(inpath, outpath): + if os.path.splitext(inpath)[-1].lower() not in {".jpg", ".png", ".jpeg", ".avif"}: + with tempfile.NamedTemporaryFile() as tf: + subprocess.run(["convert", inpath, "png:" + tf.name]) + subprocess.run(["avifenc", "-s", avif_speed, "-j", "all", "-q", str(quality), tf.name, outpath], capture_output=True).check_returncode() + else: + subprocess.run(["avifenc", "-s", avif_speed, "-j", "all", "-q", str(quality), inpath, outpath], capture_output=True).check_returncode() + return fn + +def jpeg_format(quality=None, maxwidth=None, maxheight=None, target_size=None): + def do_convert(size, quality, input, output): + subprocess.run(["convert", input, "-resize", "x".join(map(str, size)), "-quality", str(quality), output]).check_returncode() + def fn(inpath, outpath): + im = Image.open(inpath) + width, height = im.size + if maxwidth and width > maxwidth: + height /= width / maxwidth + height = math.floor(height) + width = maxwidth + if maxheight and height > maxheight: + width /= height / maxheight + width = math.floor(width) + height = maxheight + if target_size is None: + do_convert((width, height), quality, inpath, outpath) + else: + q_min = 1 + q_max = 100 + while True: + with tempfile.NamedTemporaryFile() as tf: + test_quality = (q_min + q_max) // 2 + do_convert((width, height), test_quality, inpath, tf.name) + stat = os.stat(tf.name) + if stat.st_size >= target_size: + # too big + q_max = test_quality + else: + q_min = test_quality + 1 + if q_min >= q_max: + shutil.copy(tf.name, outpath) + break + + return fn + +input_path = CONFIG["input"] +output_path = CONFIG["output"] +exts = {".webp", ".png", ".jpg", ".jpeg"} +output_formats = { + "avif-lq": (avif_format(quality=30), ".avif", "image/avif"), + "avif-hq": (avif_format(quality=80), ".avif", "image/avif"), + "jpeg-800": (jpeg_format(maxwidth=800, quality=80), ".jpeg", "image/jpeg"), + "jpeg-fullscale": (jpeg_format(quality=80), ".jpeg", "image/jpeg"), + "jpeg-256k": (jpeg_format(target_size=256_000, maxwidth=600, maxheight=600), ".jpeg", "image/jpeg") +} + +with open("formats.json", "w") as f: + json.dump({ + "formats": { k: v[1:] for k, v in output_formats.items() }, + "extensions": list(exts) + }, f) + +if "gen-formats" in sys.argv: raise SystemExit + +con = sqlite3.connect(CONFIG["database"]) +con.executescript(""" +CREATE TABLE IF NOT EXISTS thumb ( + file TEXT PRIMARY KEY, + mtime REAL NOT NULL, + formats BLOB NOT NULL +); +""") +con.row_factory = sqlite3.Row + +out_formats_set = set(output_formats) +def generate_output_format_string(formats): + return json.dumps(sorted(formats)) +def to_outpath(input, format): + format_ext = output_formats[format][1] + return f"{''.join([ i if i in filesafe_charset else '_' for i in input ])}" + "." + format + format_ext +full_formats = generate_output_format_string(output_formats.keys()) + +for directory, subdirectories, files in os.walk(input_path): + directory = os.path.join(input_path, directory) + if directory.startswith(output_path): continue + for file in os.listdir(directory): + ext = os.path.splitext(file)[-1].lower() + if ext in exts: + path = os.path.join(directory, file) + rawname = path.removeprefix(input_path).removeprefix("/") + st = os.stat(path) + csr = con.execute("SELECT mtime, formats FROM thumb WHERE file = ?", (rawname,)) + row = csr.fetchone() + if not row: + mtime, formats = None, "[]" + else: + mtime, formats = row + if st.st_mtime != mtime or formats != full_formats: + formats = set(json.loads(formats)) + for new_format in out_formats_set - formats: + new_path = os.path.join(output_path, to_outpath(rawname, new_format)) + try: + output_formats[new_format][0](path, new_path) + except: + print("working on", new_format, rawname) + raise + nst = os.stat(new_path) + if nst.st_size > st.st_size: # bigger, so redundant + os.unlink(new_path) + os.symlink(os.path.relpath(path, output_path), new_path) + formats.add(new_format) + con.execute("INSERT OR REPLACE INTO thumb VALUES (?, ?, ?)", (rawname, st.st_mtime, generate_output_format_string(formats))) + con.commit() + sys.stdout.write(".") + sys.stdout.flush() \ No newline at end of file diff --git a/thumbnailer_config.json b/thumbnailer_config.json new file mode 100644 index 0000000..d476313 --- /dev/null +++ b/thumbnailer_config.json @@ -0,0 +1,5 @@ +{ + "database": "/srv/mse/thumb.sqlite3", + "input": "/data/public", + "output": "/data/public/thumbs" +} \ No newline at end of file