thumbnailer system

2024-09-21 01:59:37 +00:00 · 2023-10-27 15:50:21 +01:00 · 2023-10-27 15:50:21 +01:00 · 74bb1bc343
commit 74bb1bc343
parent 5b5ef271aa
8 changed files with 173 additions and 7 deletions
--- a/clip_server.py
+++ b/clip_server.py
@ -54,13 +54,16 @@ def do_inference(params: InferenceParameters):
                items_ctr.labels(MODELNAME, "text").inc(text.shape[0])
                with inference_time_hist.labels(MODELNAME + "-text", text.shape[0]).time():
                    features = model.text_model.forward(input_ids=torch.tensor(text, device=DEVICE)).pooler_output
+                    features /= features.norm(dim=-1, keepdim=True)
+                    features = features.cpu().numpy()
            elif images is not None:
                items_ctr.labels(MODELNAME, "image").inc(images.shape[0])
                with inference_time_hist.labels(MODELNAME + "-image", images.shape[0]).time():
                    features = model.vision_model.forward(torch.tensor(images, device=DEVICE)).pooler_output
                    features /= features.norm(dim=-1, keepdim=True)
+                    features = features.cpu().numpy()
            batch_count_ctr.labels(MODELNAME).inc()
-            callback(True, features.cpu().numpy())
+            callback(True, features)
        except Exception as e:
            traceback.print_exc()
            callback(False, str(e))
@ -77,8 +80,7 @@ def preprocessing_thread():
        try:
            if text:
                assert len(text) <= BS, f"max batch size is {BS}"
-                # I feel like this ought to be batchable but I can't see how to do that
-                text = numpy.array(tokenizer(text, padding="max_length", truncation=True)["input_ids"])
+                text = numpy.array(tokenizer([ t.lower() for t in text ], padding="max_length", truncation=True)["input_ids"])
            elif images:
                assert len(images) <= BS, f"max batch size is {BS}"
                images = numpy.array(image_processor([ Image.open(io.BytesIO(bs)) for bs in images ])["pixel_values"]).astype("float16")
--- a/clipfront2/src/App.svelte
+++ b/clipfront2/src/App.svelte
@ -112,7 +112,17 @@
    <Masonry bind:refreshLayout={refreshLayout} colWidth="minmax(Min(20em, 100%), 1fr)" items={displayedResults}>
        {#each displayedResults as result}
            {#key result.file}
-                <div class="result"><a href={util.getURL(result.file)}><img src={util.getURL(result.file)} on:load={updateCounter} on:error={updateCounter} alt={result.caption || result.file}></a></div>
+                <div class="result">
+                    <a href={util.getURL(result.file)}>
+                        <picture>
+                            {#if util.hasThumbnails(result.file)}
+                                <source srcset={util.thumbnailPath(result.file, "avif-lq") + ", " + util.thumbnailPath(result.file, "avif-hq") + " 2x"} type="image/avif" />
+                                <source srcset={util.thumbnailPath(result.file, "jpeg-800") + " 800w, " + util.thumbnailPath(result.file, "jpeg-fullscale")} type="image/jpeg" />
+                            {/if}
+                            <img src={util.getURL(result.file)} on:load={updateCounter} on:error={updateCounter} alt={result.caption || result.file}>
+                        </picture>
+                    </a>
+                </div>
            {/key}
        {/each}
    </Masonry>
--- a/clipfront2/src/util.js
+++ b/clipfront2/src/util.js
@ -1,4 +1,5 @@
 import * as config from "../../frontend_config.json"
+import * as formats from "../../formats.json"

 export const getURL = x => config.image_path + x

@ -9,3 +10,16 @@ export const doQuery = args => fetch(config.backend_url, {
    },
    body: JSON.stringify(args)
 }).then(x => x.json())
+
+const filesafeCharset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-"
+// Builds the URL of the thumbnail of `originalPath` in the given format (a key of formats.json's "formats").
+export const thumbnailPath = (originalPath, format) => {
+    const [extension] = formats.formats[format]
+    // Python iterates strings by code point while JS indexes by UTF-16 code unit, so a naive loop could emit
+    // a different number of underscores for astral characters. Spreading the string iterates by code point.
+    const safeName = [...originalPath].map(ch => filesafeCharset.includes(ch) ? ch : "_").join("")
+    return `${config.thumb_path}${safeName}.${format}${extension}`
+}
+
+const thumbedExtensions = formats.extensions
+// Whether thumbnails are expected to exist for the file at path `t`.
+// The extension is compared case-insensitively because the thumbnailer lower-cases it before
+// checking, so "FOO.PNG" is thumbnailed too.
+export const hasThumbnails = t => {
+    const basename = t.slice(t.lastIndexOf("/") + 1)
+    const dot = basename.lastIndexOf(".")
+    // No extension at all, or a dotfile such as ".png": the thumbnailer sees an empty extension for these.
+    if (dot <= 0) return false
+    return thumbedExtensions.includes(basename.slice(dot).toLowerCase())
+}
--- a/formats.json
+++ b/formats.json
@ -0,0 +1 @@
+{"formats": {"avif-lq": [".avif", "image/avif"], "avif-hq": [".avif", "image/avif"], "jpeg-800": [".jpeg", "image/jpeg"], "jpeg-fullscale": [".jpeg", "image/jpeg"], "jpeg-256k": [".jpeg", "image/jpeg"]}, "extensions": [".jpeg", ".webp", ".png", ".jpg"]}
--- a/frontend_config.json
+++ b/frontend_config.json
@ -1,4 +1,5 @@
 {
    "backend_url": "https://mse.osmarks.net/backend",
-    "image_path": "https://i2.osmarks.net/memes-or-something/"
+    "image_path": "https://i2.osmarks.net/memes-or-something/",
+    "thumb_path": "https://i2.osmarks.net/thumbs/memes-or-something_"
 }
--- a/mse.py
+++ b/mse.py
@ -177,6 +177,7 @@ class Index:
                            if filename not in filenames_set:
                                new_data.append(numpy.frombuffer(embedding_vector, dtype="float16"))
                                new_filenames.append(filename)
+                    if not new_data: return
                    new_data = numpy.array(new_data)
                    self.associated_filenames.extend(new_filenames)
                    self.faiss_index.add(new_data)
--- a/thumbnailer.py
+++ b/thumbnailer.py
@ -0,0 +1,132 @@
+import sqlite3
+import os
+import hashlib
+import json
+import string
+import subprocess
+from PIL import Image
+import tempfile
+import shutil
+import math
+import sys
+
+# The first command-line argument is the path of the JSON configuration file.
+with open(sys.argv[1]) as config_handle:
+    CONFIG = json.load(config_handle)
+
+# Characters kept verbatim in thumbnail filenames; to_outpath replaces every other character with "_".
+filesafe_charset = f"{string.ascii_letters}{string.digits}-"
+def avif_format(quality):
+    """Build a converter that encodes an image to AVIF with ``avifenc``.
+
+    quality: value passed to ``avifenc -q``.
+
+    Returns ``fn(inpath, outpath)``. Inputs whose extension is not .jpg/.jpeg/.png/.avif are first
+    transcoded to a temporary PNG with ImageMagick ``convert``; the others are handed to avifenc
+    unchanged. Raises subprocess.CalledProcessError if either external command fails.
+    """
+    avif_speed = "4"
+
+    def encode(source, outpath):
+        subprocess.run(
+            ["avifenc", "-s", avif_speed, "-j", "all", "-q", str(quality), source, outpath],
+            capture_output=True,
+        ).check_returncode()
+
+    def fn(inpath, outpath):
+        if os.path.splitext(inpath)[-1].lower() in {".jpg", ".png", ".jpeg", ".avif"}:
+            encode(inpath, outpath)
+            return
+        with tempfile.NamedTemporaryFile() as tf:
+            # Check the intermediate conversion too: otherwise a failed convert only shows up as a
+            # confusing avifenc error about an empty input file.
+            subprocess.run(["convert", inpath, "png:" + tf.name]).check_returncode()
+            encode(tf.name, outpath)
+    return fn
+
+def jpeg_format(quality=None, maxwidth=None, maxheight=None, target_size=None):
+    """Build a converter that writes a (possibly downscaled) JPEG with ImageMagick ``convert``.
+
+    quality: value passed to ``convert -quality``; not used when target_size is given.
+    maxwidth, maxheight: optional pixel bounds. An image exceeding a bound is scaled down to it,
+        with the other dimension scaled by the same factor.
+    target_size: optional size budget in bytes. When given, the JPEG quality is binary-searched in
+        1..100 for the highest value whose output is smaller than the budget.
+
+    Returns ``fn(inpath, outpath)``. Raises subprocess.CalledProcessError if convert fails.
+    """
+    def do_convert(size, quality, input, output):
+        subprocess.run(["convert", input, "-resize", "x".join(map(str, size)), "-quality", str(quality), output]).check_returncode()
+
+    def fn(inpath, outpath):
+        # Only the dimensions are needed, so release the file handle straight away.
+        with Image.open(inpath) as im:
+            width, height = im.size
+        if maxwidth and width > maxwidth:
+            # max(1, ...) keeps extreme aspect ratios from producing a zero-pixel dimension.
+            height = max(1, math.floor(height / (width / maxwidth)))
+            width = maxwidth
+        if maxheight and height > maxheight:
+            width = max(1, math.floor(width / (height / maxheight)))
+            height = maxheight
+        if target_size is None:
+            do_convert((width, height), quality, inpath, outpath)
+            return
+
+        q_min = 1
+        q_max = 100
+        found_fit = False
+        while q_min < q_max:
+            test_quality = (q_min + q_max) // 2
+            with tempfile.NamedTemporaryFile() as tf:
+                # The temporary name has no extension, so name the format explicitly; otherwise
+                # ImageMagick is expected to fall back to the input's own format (e.g. PNG).
+                do_convert((width, height), test_quality, inpath, "jpeg:" + tf.name)
+                if os.stat(tf.name).st_size < target_size:
+                    # Fits. Later fitting attempts always use a higher quality, so simply
+                    # overwriting leaves the best fitting result in place when the search ends.
+                    shutil.copy(tf.name, outpath)
+                    found_fit = True
+                    q_min = test_quality + 1
+                else:
+                    # too big
+                    q_max = test_quality
+                    if not found_fit and q_min >= q_max:
+                        # Nothing fits the budget; this last attempt used the lowest quality (1),
+                        # so keep it as the best effort rather than producing no output.
+                        shutil.copy(tf.name, outpath)
+
+    return fn
+
+input_path = CONFIG["input"]
+output_path = CONFIG["output"]
+# Source file extensions that get thumbnails (matched against the lower-cased extension).
+exts = {".webp", ".png", ".jpg", ".jpeg"}
+# format name -> (converter taking (inpath, outpath), output file extension, MIME type)
+output_formats = {
+    "avif-lq": (avif_format(quality=30), ".avif", "image/avif"),
+    "avif-hq": (avif_format(quality=80), ".avif", "image/avif"),
+    "jpeg-800": (jpeg_format(maxwidth=800, quality=80), ".jpeg", "image/jpeg"),
+    "jpeg-fullscale": (jpeg_format(quality=80), ".jpeg", "image/jpeg"),
+    "jpeg-256k": (jpeg_format(target_size=256_000, maxwidth=600, maxheight=600), ".jpeg", "image/jpeg")
+}
+
+# Publish the format table (minus the converter callables) and the handled extensions as JSON.
+# The extensions are sorted because set iteration order differs between runs, which would
+# otherwise rewrite formats.json with a different ordering every time.
+with open("formats.json", "w") as f:
+    json.dump({
+        "formats": { k: v[1:] for k, v in output_formats.items() },
+        "extensions": sorted(exts)
+    }, f)
+
+# With "gen-formats" on the command line, stop after writing formats.json.
+if "gen-formats" in sys.argv: raise SystemExit
+
+# Bookkeeping database: one row per source file holding the mtime it had when it was processed
+# and a JSON list of the formats generated for it.
+con = sqlite3.connect(CONFIG["database"])
+con.row_factory = sqlite3.Row
+con.executescript("""
+CREATE TABLE IF NOT EXISTS thumb (
+    file TEXT PRIMARY KEY,
+    mtime REAL NOT NULL,
+    formats BLOB NOT NULL
+);
+""")
+
+out_formats_set = set(output_formats)
+
+def generate_output_format_string(formats):
+    """Serialise a collection of format names as a JSON list in sorted order."""
+    ordered = list(formats)
+    ordered.sort()
+    return json.dumps(ordered)
+
+def to_outpath(input, format):
+    """Return the thumbnail filename for source path `input` in output format `format`.
+
+    Characters outside filesafe_charset become "_"; the format name and that format's file
+    extension are appended.
+    """
+    extension = output_formats[format][1]
+    safe_name = "".join(ch if ch in filesafe_charset else "_" for ch in input)
+    return f"{safe_name}.{format}{extension}"
+
+# Stored value of a row for which every configured format has been generated.
+full_formats = generate_output_format_string(output_formats)
+
+# Walk the input tree and (re)generate whatever thumbnails are missing or stale for each image.
+for directory, subdirectories, files in os.walk(input_path):
+    # os.walk already yields paths rooted at input_path, so no further joining is needed
+    # (joining again would double the prefix when input_path is relative).
+    if directory.startswith(output_path):
+        # Never thumbnail the thumbnails; clearing the list in place stops os.walk descending.
+        subdirectories.clear()
+        continue
+    # Use the walk's own file list so directories with image-like names are not picked up.
+    for file in files:
+        ext = os.path.splitext(file)[-1].lower()
+        if ext not in exts: continue
+        path = os.path.join(directory, file)
+        rawname = path.removeprefix(input_path).removeprefix("/")
+        st = os.stat(path)
+        row = con.execute("SELECT mtime, formats FROM thumb WHERE file = ?", (rawname,)).fetchone()
+        if row:
+            mtime, formats = row
+        else:
+            mtime, formats = None, "[]"
+        if st.st_mtime == mtime and formats == full_formats: continue
+        if st.st_mtime != mtime:
+            # New or modified source: every existing thumbnail is stale, so redo all formats.
+            formats = set()
+        else:
+            # Unchanged source: only fill in missing formats. Names of formats that are no longer
+            # configured are dropped so the stored row can match full_formats again.
+            formats = set(json.loads(formats)) & out_formats_set
+        for new_format in out_formats_set - formats:
+            new_path = os.path.join(output_path, to_outpath(rawname, new_format))
+            # Remove any previous output first. It may be a symlink to the source image (see
+            # below), and letting the converter write through it would clobber the original.
+            if os.path.lexists(new_path):
+                os.unlink(new_path)
+            try:
+                output_formats[new_format][0](path, new_path)
+            except BaseException:
+                # Say which file and format failed, then let the error propagate.
+                print("working on", new_format, rawname)
+                raise
+            nst = os.stat(new_path)
+            if nst.st_size > st.st_size: # bigger, so redundant
+                os.unlink(new_path)
+                os.symlink(os.path.relpath(path, output_path), new_path)
+            formats.add(new_format)
+        con.execute("INSERT OR REPLACE INTO thumb VALUES (?, ?, ?)", (rawname, st.st_mtime, generate_output_format_string(formats)))
+        # Commit per file so an interrupted run keeps the progress made so far.
+        con.commit()
+        sys.stdout.write(".")
+        sys.stdout.flush()
--- a/thumbnailer_config.json
+++ b/thumbnailer_config.json
@ -0,0 +1,5 @@
+{
+    "database": "/srv/mse/thumb.sqlite3",
+    "input": "/data/public",
+    "output": "/data/public/thumbs"
+}
				`@ -0,0 +1 @@`
				`{"formats": {"avif-lq": [".avif", "image/avif"], "avif-hq": [".avif", "image/avif"], "jpeg-800": [".jpeg", "image/jpeg"], "jpeg-fullscale": [".jpeg", "image/jpeg"], "jpeg-256k": [".jpeg", "image/jpeg"]}, "extensions": [".jpeg", ".webp", ".png", ".jpg"]}`