commit 9d89e6e4f5c7f110c8dabdb5df40731912fea44c Author: osmarks Date: Thu Sep 28 17:30:20 2023 +0100 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9364963 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*/node_modules +*/static/app.* \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..09526de --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright © 2023 osmarks + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..8027ff6 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +# Meme Search Engine + +Do you have a large folder of memes you want to search semantically? Do you have a Linux server with an Nvidia GPU? You do; this is now mandatory. + +## Features + +They say a picture is worth a thousand words. Unfortunately, not all (most?) sets of words can be adequately described by pictures. Regardless, here is a picture. 
You can use a running instance [here](https://mse.osmarks.net/). + +![Meme Search Engine's frontend.](/demo-image.png) + +* Infinite-scroll masonry UI for dense meme viewing. +* Online reindexing (a good reason to use it over [clip-retrieval](https://github.com/rom1504/clip-retrieval)) - reload memes without a slow expensive rebuild step. +* Complex query support - query using text and images, including multiple terms at once, with weighting (including negative). +* Reasonably fast. + +## Setup + +* Serve your meme library from a static webserver. + * I use nginx. If you're in a hurry, you can use `python -m http.server`. +* Install Python dependencies with `pip` from `requirements.txt` (the versions probably shouldn't need to match exactly if you need to change them; I just put in what I currently have installed). +* Run `clip_server.py` (as a background service). + * It is configured with a JSON file given to it as its first argument. An example is in `clip_server_config.json`. + * `device` should probably be `cuda` or `cpu`. The model will run on this device. + * `model` is the [OpenCLIP](https://github.com/mlfoundations/open_clip) model to use. + * `model_name` is the name of the model for metrics purposes. + * `max_batch_size` controls the maximum allowed batch size. Higher values generally result in somewhat better performance (the bottleneck in most cases is elsewhere right now though) at the cost of higher VRAM use. + * `port` is the port to run the HTTP server on. +* Run `mse.py` (also as a background service). + * This needs to be exposed somewhere the frontend can reach it. Configure your reverse proxy appropriately. + * It has a JSON config file as well. + * `clip_server` is the full URL for the backend server (with trailing slash). + * `db_path` is the path for the SQLite database of images and embedding vectors. + * `files` is where meme files will be read from. Subdirectories are indexed. + * `port` is the port to serve HTTP on. 
+* Build clipfront2, host on your favourite static webserver. + * `npm install`, `node src/build.js`. + * You will need to rebuild it whenever you edit `frontend_config.json`. + * `image_path` is the base URL of your meme webserver (with trailing slash). + * `backend_url` is the URL `mse.py` is exposed on (trailing slash probably optional). +* If you want, configure Prometheus to monitor `mse.py` and `clip_server.py`. + +## Scaling + +Meme Search Engine uses an in-memory FAISS index to hold its embedding vectors, because I was lazy and it works fine (~100MB total RAM used for my 8000 memes). If you want to store significantly more than that you will have to switch to a more efficient/compact index (see [here](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index)). As vector indices are held exclusively in memory, you will need to either persist them to disk or use ones which are fast to build/remove from/add to (presumably PCA/PQ indices). At some point if you increase total traffic the CLIP model may also become a bottleneck, as I also have no batching strategy. Indexing appears to actually be CPU-bound (specifically, it's limited by single-threaded image decoding and serialization) - improving that would require a lot of redesigns so I haven't. You may also want to scale down displayed memes to cut bandwidth needs. 
\ No newline at end of file
diff --git a/clip_server.py b/clip_server.py
new file mode 100644
index 0000000..7145805
--- /dev/null
+++ b/clip_server.py
@@ -0,0 +1,169 @@
+import torch
+import time
+import threading
+from aiohttp import web
+import aiohttp
+import asyncio
+import traceback
+import umsgpack
+import collections
+import queue
+import open_clip
+from PIL import Image
+from prometheus_client import Counter, Histogram, REGISTRY, generate_latest
+import io
+import json
+import sys
+
+# The path of the JSON config file is the first command-line argument.
+with open(sys.argv[1], "r") as config_file:
+    CONFIG = json.load(config_file)
+
+device = torch.device(CONFIG["device"])
+model, _, preprocess = open_clip.create_model_and_transforms(CONFIG["model"], device=device, pretrained=dict(open_clip.list_pretrained())[CONFIG["model"]], precision="fp16")
+model.eval()
+tokenizer = open_clip.get_tokenizer(CONFIG["model"])
+print("Model loaded")
+
+BS = CONFIG["max_batch_size"]
+MODELNAME = CONFIG["model_name"]
+
+InferenceParameters = collections.namedtuple("InferenceParameters", ["text", "images", "callback"])
+
+items_ctr = Counter("modelserver_total_items", "Items run through model server", ["model", "modality"])
+inference_time_hist = Histogram("modelserver_inftime", "Time running inference", ["model", "batch_size"])
+batch_count_ctr = Counter("modelserver_batchcount", "Inference batches run", ["model"])
+
+torch.set_grad_enabled(False)
+def do_inference(params: InferenceParameters):
+    """Encode one preprocessed batch and hand the outcome to its callback.
+
+    The callback is called with (True, features as a NumPy array) on success
+    or (False, error message) on failure.
+    """
+    with torch.no_grad():
+        try:
+            text, images, callback = params
+            if text is not None:
+                items_ctr.labels(MODELNAME, "text").inc(text.shape[0])
+                with inference_time_hist.labels(MODELNAME + "-text", text.shape[0]).time():
+                    features = model.encode_text(text)
+            elif images is not None:
+                with inference_time_hist.labels(MODELNAME + "-image", images.shape[0]).time():
+                    items_ctr.labels(MODELNAME, "image").inc(images.shape[0])
+                    features = model.encode_image(images)
+            batch_count_ctr.labels(MODELNAME).inc()
+            # Scale every embedding to unit length.
+            features /= features.norm(dim=-1, keepdim=True)
+            callback(True, features.cpu().numpy())
+        except Exception as e:
+            traceback.print_exc()
+            callback(False, str(e))
+        finally:
+            torch.cuda.empty_cache()
+
+# Preprocessed batches waiting for the model.
+iq = queue.Queue(10)
+def infer_thread():
+    """Run inference on batches from `iq` forever."""
+    while True:
+        do_inference(iq.get())
+
+# Raw requests waiting to be preprocessed.
+pq = queue.Queue(10)
+def preprocessing_thread():
+    """Tokenize text or decode images taken from `pq` and queue the tensors on `iq`."""
+    while True:
+        text, images, callback = pq.get()
+        try:
+            if text:
+                assert len(text) <= BS, f"max batch size is {BS}"
+                text = tokenizer(text).to(device)
+            elif images:
+                assert len(images) <= BS, f"max batch size is {BS}"
+                # do_inference picks the text branch whenever text is not None,
+                # so an empty (but non-None) text value must not be passed on.
+                text = None
+                images = torch.stack([ preprocess(Image.open(io.BytesIO(im))).half() for im in images ]).to(device)
+            else:
+                assert False, "images or text required"
+            iq.put(InferenceParameters(text, images, callback))
+        except Exception as e:
+            traceback.print_exc()
+            callback(False, str(e))
+
+app = web.Application(client_max_size=2**26)
+routes = web.RouteTableDef()
+
+@routes.post("/")
+async def run_inference(request):
+    """Embed a msgpack-encoded request holding either `text` or `images`.
+
+    Responds with a msgpack list of float16 embedding buffers (200), an error
+    message (500), or a queue-full message (503).
+    """
+    loop = asyncio.get_running_loop()
+    data = umsgpack.loads(await request.read())
+    event = asyncio.Event()
+    results = None
+    def callback(*argv):
+        nonlocal results
+        # Store the result before waking the handler so it never reads None.
+        results = argv
+        loop.call_soon_threadsafe(event.set)
+    try:
+        pq.put_nowait(InferenceParameters(data.get("text"), data.get("images"), callback))
+    except queue.Full:
+        return web.Response(body=umsgpack.dumps("inference queue is full"), status=503, content_type="application/msgpack")
+    await event.wait()
+    body_data = results[1]
+    if results[0]:
+        status = 200
+        body_data = [x.astype("float16").tobytes() for x in body_data]
+    else:
+        status = 500
+        print(results[1])
+    return web.Response(body=umsgpack.dumps(body_data), status=status, content_type="application/msgpack")
+
+@routes.get("/config")
+async def config(request):
+    """Report the model name, batch limit, input image size and embedding size."""
+    return web.Response(body=umsgpack.dumps({
+        "model": CONFIG["model"],
+        "batch": BS,
+        "image_size": model.visual.image_size,
+        "embedding_size": model.visual.output_dim
+    }), status=200, content_type="application/msgpack")
+
+@routes.get("/")
+async def health(request):
+    """Health check: always responds 204."""
+    return web.Response(status=204)
+
+@routes.get("/metrics")
+async def metrics(request):
+    """Serve the Prometheus metrics of this process."""
+    return web.Response(body=generate_latest(REGISTRY))
+
+app.router.add_routes(routes)
+
+async def run_webserver():
+    """Start the HTTP server on the configured port."""
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, "", CONFIG["port"])
+    print("Ready")
+    await site.start()
+
+try:
+    # Daemon threads: their endless loops must not keep the process alive on exit.
+    th = threading.Thread(target=infer_thread, daemon=True)
+    th.start()
+    th = threading.Thread(target=preprocessing_thread, daemon=True)
+    th.start()
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(run_webserver())
+    loop.run_forever()
+except KeyboardInterrupt:
+    sys.exit(0)
\ No newline at end of file
diff --git a/clip_server_config.json b/clip_server_config.json new file mode 100644 index 0000000..251d312 --- /dev/null +++ b/clip_server_config.json @@ -0,0 +1,7 @@ +{ + "device": "cuda:0", + "model": "ViT-H-14", + "model_name": "openclip-ViT-H-14", + "max_batch_size": 128, + "port": 1708 +} \ No newline at end of file diff --git a/clipfront2/package-lock.json b/clipfront2/package-lock.json new file mode 100644 index 0000000..14a9597 --- /dev/null +++ b/clipfront2/package-lock.json @@ -0,0 +1,463 @@ +{ + "name": "clipfront2", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "devDependencies": { + "esbuild": "^0.12.15", + "esbuild-svelte": "^0.5.3", + "sass": "^1.68.0", + "svelte-preprocess-sass": "^2.0.1" + } + }, + "node_modules/anymatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.2.tgz", + "integrity": "sha512-P43ePfOAIupkguHUycrc4qJ9kz8ZiuOUijaETwX7THt0Y/GNK7v0aa8rY816xWjZ7rJdA5XdMcpVFTKMq+RvWg==", + "dev": true, + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/binary-extensions": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", + "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", + "dev": true, + 
"engines": { + "node": ">=8" + } + }, + "node_modules/braces": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "dev": true, + "dependencies": { + "fill-range": "^7.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar": { + "version": "3.5.2", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.2.tgz", + "integrity": "sha512-ekGhOnNVPgT77r4K/U3GDhu+FQ2S8TnK/s2KbIGXi0SZWuwkZ2QNyfWdZW+TVfn84DpEP7rLeCt2UI6bJ8GwbQ==", + "dev": true, + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/esbuild": { + "version": "0.12.15", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.12.15.tgz", + "integrity": "sha512-72V4JNd2+48eOVCXx49xoSWHgC3/cCy96e7mbXKY+WOWghN00cCmlGnwVLRhRHorvv0dgCyuMYBZlM2xDM5OQw==", + "dev": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + } + }, + "node_modules/esbuild-svelte": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/esbuild-svelte/-/esbuild-svelte-0.5.3.tgz", + "integrity": "sha512-KByKD/yt8QaqKjLu32MG3MXBExJYlDM0QwzW3pzKLJR4eev0923DrUKRHPBBjB+OVirUtZnEJE/qitjdW/WyAw==", + "dev": true, + "dependencies": { + "svelte": "^3.38.3" + }, + "peerDependencies": { + "esbuild": ">=0.9.6" + } + }, + "node_modules/fill-range": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "dev": true, + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + 
"node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/immutable": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-4.0.0.tgz", + "integrity": "sha512-zIE9hX70qew5qTUjSS7wi1iwj/l7+m54KWU247nhM3v806UdGj1yDndXj+IOYxxtW9zyLI+xqFNZjTuDaLUqFw==", + "dev": true + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.1.tgz", + "integrity": "sha512-5G0tKtBTFImOqDnLB2hG6Bp2qcKEFduo4tZu9MT/H6NQv/ghhy30o55ufafxJ/LdH79LLs2Kfrn85TLKyA7BUg==", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-number": { + 
"version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/picomatch": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.0.tgz", + "integrity": "sha512-lY1Q/PiJGC2zOv/z391WOTD+Z02bCgsFfvxoXXf6h7kv9o+WmsmzYqrAwY63sNgOxE4xEdq0WyUnXfKeBrSvYw==", + "dev": true, + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/sass": { + "version": "1.68.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.68.0.tgz", + "integrity": "sha512-Lmj9lM/fef0nQswm1J2HJcEsBUba4wgNx2fea6yJHODREoMFnwRpZydBnX/RjyXw2REIwdkbqE4hrTo4qfDBUA==", + "dev": true, + "dependencies": { + "chokidar": ">=3.0.0 <4.0.0", + "immutable": "^4.0.0", + "source-map-js": ">=0.6.2 <2.0.0" + }, + "bin": { + "sass": "sass.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/source-map-js": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz", + "integrity": 
"sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/svelte": { + "version": "3.38.3", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-3.38.3.tgz", + "integrity": "sha512-N7bBZJH0iF24wsalFZF+fVYMUOigaAUQMIcEKHO3jstK/iL8VmP9xE+P0/a76+FkNcWt+TDv2Gx1taUoUscrvw==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/svelte-preprocess-filter": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/svelte-preprocess-filter/-/svelte-preprocess-filter-1.0.0.tgz", + "integrity": "sha512-92innv59nyEx24xbfcSurB5ocwC8qFdDtGli/JVMHzJsxyvV2yjQKIcbUqU9VIV5mKUWO2PoY93nncS2yF4ULQ==", + "dev": true + }, + "node_modules/svelte-preprocess-sass": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/svelte-preprocess-sass/-/svelte-preprocess-sass-2.0.1.tgz", + "integrity": "sha512-0y4FjRsRWcN7rJeNJnSfZ7LVAz6S7/j9Dg24XFRelr/rjMMjXORdEvXy4r38fUYmyk9Y7yjwlHCiqyGxMHhEbg==", + "dev": true, + "dependencies": { + "svelte-preprocess-filter": "^1.0.0" + }, + "peerDependencies": { + "sass": "^1.35.2" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + } + }, + "dependencies": { + "anymatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.2.tgz", + "integrity": "sha512-P43ePfOAIupkguHUycrc4qJ9kz8ZiuOUijaETwX7THt0Y/GNK7v0aa8rY816xWjZ7rJdA5XdMcpVFTKMq+RvWg==", + "dev": true, + "requires": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + } + }, + "binary-extensions": { + "version": "2.2.0", + "resolved": 
"https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", + "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", + "dev": true + }, + "braces": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "dev": true, + "requires": { + "fill-range": "^7.0.1" + } + }, + "chokidar": { + "version": "3.5.2", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.2.tgz", + "integrity": "sha512-ekGhOnNVPgT77r4K/U3GDhu+FQ2S8TnK/s2KbIGXi0SZWuwkZ2QNyfWdZW+TVfn84DpEP7rLeCt2UI6bJ8GwbQ==", + "dev": true, + "requires": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "fsevents": "~2.3.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + } + }, + "esbuild": { + "version": "0.12.15", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.12.15.tgz", + "integrity": "sha512-72V4JNd2+48eOVCXx49xoSWHgC3/cCy96e7mbXKY+WOWghN00cCmlGnwVLRhRHorvv0dgCyuMYBZlM2xDM5OQw==", + "dev": true + }, + "esbuild-svelte": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/esbuild-svelte/-/esbuild-svelte-0.5.3.tgz", + "integrity": "sha512-KByKD/yt8QaqKjLu32MG3MXBExJYlDM0QwzW3pzKLJR4eev0923DrUKRHPBBjB+OVirUtZnEJE/qitjdW/WyAw==", + "dev": true, + "requires": { + "svelte": "^3.38.3" + } + }, + "fill-range": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "dev": true, + "requires": { + "to-regex-range": "^5.0.1" + } + }, + "fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": 
"sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "optional": true + }, + "glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "requires": { + "is-glob": "^4.0.1" + } + }, + "immutable": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-4.0.0.tgz", + "integrity": "sha512-zIE9hX70qew5qTUjSS7wi1iwj/l7+m54KWU247nhM3v806UdGj1yDndXj+IOYxxtW9zyLI+xqFNZjTuDaLUqFw==", + "dev": true + }, + "is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "requires": { + "binary-extensions": "^2.0.0" + } + }, + "is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + "dev": true + }, + "is-glob": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.1.tgz", + "integrity": "sha512-5G0tKtBTFImOqDnLB2hG6Bp2qcKEFduo4tZu9MT/H6NQv/ghhy30o55ufafxJ/LdH79LLs2Kfrn85TLKyA7BUg==", + "dev": true, + "requires": { + "is-extglob": "^2.1.1" + } + }, + "is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true + }, + "normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true + }, + 
"picomatch": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.0.tgz", + "integrity": "sha512-lY1Q/PiJGC2zOv/z391WOTD+Z02bCgsFfvxoXXf6h7kv9o+WmsmzYqrAwY63sNgOxE4xEdq0WyUnXfKeBrSvYw==", + "dev": true + }, + "readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "requires": { + "picomatch": "^2.2.1" + } + }, + "sass": { + "version": "1.68.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.68.0.tgz", + "integrity": "sha512-Lmj9lM/fef0nQswm1J2HJcEsBUba4wgNx2fea6yJHODREoMFnwRpZydBnX/RjyXw2REIwdkbqE4hrTo4qfDBUA==", + "dev": true, + "requires": { + "chokidar": ">=3.0.0 <4.0.0", + "immutable": "^4.0.0", + "source-map-js": ">=0.6.2 <2.0.0" + } + }, + "source-map-js": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz", + "integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==", + "dev": true + }, + "svelte": { + "version": "3.38.3", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-3.38.3.tgz", + "integrity": "sha512-N7bBZJH0iF24wsalFZF+fVYMUOigaAUQMIcEKHO3jstK/iL8VmP9xE+P0/a76+FkNcWt+TDv2Gx1taUoUscrvw==", + "dev": true + }, + "svelte-preprocess-filter": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/svelte-preprocess-filter/-/svelte-preprocess-filter-1.0.0.tgz", + "integrity": "sha512-92innv59nyEx24xbfcSurB5ocwC8qFdDtGli/JVMHzJsxyvV2yjQKIcbUqU9VIV5mKUWO2PoY93nncS2yF4ULQ==", + "dev": true + }, + "svelte-preprocess-sass": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/svelte-preprocess-sass/-/svelte-preprocess-sass-2.0.1.tgz", + "integrity": "sha512-0y4FjRsRWcN7rJeNJnSfZ7LVAz6S7/j9Dg24XFRelr/rjMMjXORdEvXy4r38fUYmyk9Y7yjwlHCiqyGxMHhEbg==", + "dev": true, + "requires": { + 
"svelte-preprocess-filter": "^1.0.0" + } + }, + "to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "requires": { + "is-number": "^7.0.0" + } + } + } +} diff --git a/clipfront2/package.json b/clipfront2/package.json new file mode 100644 index 0000000..52255ed --- /dev/null +++ b/clipfront2/package.json @@ -0,0 +1,8 @@ +{ + "devDependencies": { + "esbuild": "^0.12.15", + "esbuild-svelte": "^0.5.3", + "sass": "^1.68.0", + "svelte-preprocess-sass": "^2.0.1" + } +} diff --git a/clipfront2/src/App.svelte b/clipfront2/src/App.svelte new file mode 100644 index 0000000..1a1d7e1 --- /dev/null +++ b/clipfront2/src/App.svelte @@ -0,0 +1,217 @@ + + +

Meme Search Engine

+
+ +
+ + + +
+
+ +{#if error} +
{error}
+{/if} +{#if resultPromise} + +{/if} +{#if results} + + {#each displayedResults as result} + {#key result.file} +
{result.caption
+ {/key} + {/each} +
+{/if} + + + + diff --git a/clipfront2/src/Loading.svelte b/clipfront2/src/Loading.svelte new file mode 100644 index 0000000..8fefc06 --- /dev/null +++ b/clipfront2/src/Loading.svelte @@ -0,0 +1,36 @@ + + + + +{operation} \ No newline at end of file diff --git a/clipfront2/src/Masonry.svelte b/clipfront2/src/Masonry.svelte new file mode 100644 index 0000000..c3a55b8 --- /dev/null +++ b/clipfront2/src/Masonry.svelte @@ -0,0 +1,119 @@ + + + + +
+ +
+ + + + + + \ No newline at end of file
diff --git a/clipfront2/src/app.js b/clipfront2/src/app.js
new file mode 100644
index 0000000..cab2363
--- /dev/null
+++ b/clipfront2/src/app.js
@@ -0,0 +1,6 @@
+import App from "./App.svelte"
+
+// Mount the Svelte application directly into the page body.
+new App({
+    target: document.body,
+})
\ No newline at end of file
diff --git a/clipfront2/src/build.js b/clipfront2/src/build.js
new file mode 100644
index 0000000..6037466
--- /dev/null
+++ b/clipfront2/src/build.js
@@ -0,0 +1,27 @@
+const esbuild = require("esbuild")
+const sveltePlugin = require("esbuild-svelte")
+const path = require("path")
+const { sass } = require("svelte-preprocess-sass")
+
+// Bundles src/app.js and its Svelte components into static/app.js.
+esbuild
+    .build({
+        entryPoints: [path.join(__dirname, "app.js")],
+        bundle: true,
+        minify: true,
+        outfile: path.join(__dirname, "../static/app.js"),
+        plugins: [sveltePlugin({
+            preprocess: {
+                style: sass()
+            }
+        })],
+        loader: {
+            ".woff": "file",
+            ".woff2": "file",
+            ".ttf": "file"
+        },
+        logLevel: "info",
+        // Watch mode is enabled when the command line contains "watch".
+        watch: process.argv.join(" ").includes("watch")
+    })
+    .catch(() => process.exit(1))
diff --git a/clipfront2/src/util.js b/clipfront2/src/util.js
new file mode 100644
index 0000000..5568cf1
--- /dev/null
+++ b/clipfront2/src/util.js
@@ -0,0 +1,19 @@
+import * as config from "../../frontend_config.json"
+
+// Builds the full URL of a meme from its path relative to the image root.
+export const getURL = x => config.image_path + x
+
+// POSTs a query to the backend and resolves to the parsed JSON results.
+// Rejects with a descriptive error on a non-2xx response instead of failing later in JSON parsing.
+export const doQuery = args => fetch(config.backend_url, {
+    method: "POST",
+    headers: {
+        "Content-Type": "application/json"
+    },
+    body: JSON.stringify(args)
+}).then(res => {
+    if (!res.ok) {
+        throw new Error(`Query failed: ${res.status} ${res.statusText}`)
+    }
+    return res.json()
+})
\ No newline at end of file
diff --git a/clipfront2/static/index.html b/clipfront2/static/index.html new file mode 100644 index 0000000..f86ca16 --- /dev/null +++ b/clipfront2/static/index.html @@ -0,0 +1,15 @@ + + + + + + + + Meme Search Engine + + + + + + + diff --git a/demo-image.png b/demo-image.png new file mode 100644 index 0000000..5a64dad Binary files /dev/null and b/demo-image.png differ diff --git a/frontend_config.json b/frontend_config.json new file mode 100644 index
0000000..a32f72c --- /dev/null +++ b/frontend_config.json @@ -0,0 +1,4 @@ +{ + "backend_url": "https://mse.osmarks.net/backend", + "image_path": "https://i2.osmarks.net/memes-or-something/" +} \ No newline at end of file
diff --git a/mse.py b/mse.py
new file mode 100644
index 0000000..600ab65
--- /dev/null
+++ b/mse.py
@@ -0,0 +1,236 @@
+from aiohttp import web
+import aiohttp
+import asyncio
+import traceback
+import umsgpack
+from PIL import Image
+import base64
+import aiosqlite
+import faiss
+import numpy
+import os
+import aiohttp_cors
+import io
+import json
+import sys
+
+# The path of the JSON config file is the first command-line argument.
+with open(sys.argv[1], "r") as config_file:
+    CONFIG = json.load(config_file)
+
+app = web.Application(client_max_size=32*1024**2)
+routes = web.RouteTableDef()
+
+async def clip_server(query, unpack_buffer=True):
+    """Send a msgpack-encoded query to the CLIP server and return the decoded reply.
+
+    With unpack_buffer set, each returned buffer becomes a float16 NumPy array.
+    Raises Exception carrying the server's error message on a non-200 response.
+    """
+    async with aiohttp.ClientSession() as sess:
+        async with sess.post(CONFIG["clip_server"], data=umsgpack.dumps(query)) as res:
+            body = await res.read()
+            if res.status == 200:
+                response = umsgpack.loads(body)
+                if unpack_buffer:
+                    response = [ numpy.frombuffer(x, dtype="float16") for x in response ]
+                return response
+            # Only decode an error body as msgpack when the server labels it as such.
+            if res.headers.get("content-type") == "application/msgpack":
+                raise Exception(umsgpack.loads(body))
+            raise Exception(body.decode("utf-8", errors="replace"))
+
+@routes.post("/")
+async def run_query(request):
+    """Search the index with the weighted sum of the query's image and text embeddings.
+
+    The JSON body may hold `images` (base64) and `text`, each a list of [item, weight] pairs.
+    """
+    data = await request.json()
+    embeddings = []
+    # `or []` also covers an explicit null in the request body.
+    if images := data.get("images") or []:
+        embeddings.extend(await clip_server({ "images": [ base64.b64decode(x) for x, w in images ] }))
+    if text := data.get("text") or []:
+        embeddings.extend(await clip_server({ "text": [ x for x, w in text ] }))
+    weights = [ w for x, w in images ] + [ w for x, w in text ]
+    embeddings = [ e * w for e, w in zip(embeddings, weights) ]
+    if not embeddings:
+        return web.json_response([])
+    return web.json_response(app["index"].search(sum(embeddings)))
+
+@routes.get("/")
+async def health_check(request):
+    """Health check: always responds OK."""
+    return web.Response(text="OK")
+
+@routes.post("/reload_index")
+async def reload_index_route(request):
+    """Rescan the meme directory and bring the index up to date."""
+    await request.app["index"].reload()
+    return web.json_response(True)
+
+class Index:
+    """In-memory FAISS inner-product index over the embeddings persisted in SQLite."""
+
+    def __init__(self, inference_server_config):
+        self.faiss_index = faiss.IndexFlatIP(inference_server_config["embedding_size"])
+        # associated_filenames[i] names the file whose vector is at position i of the FAISS index.
+        self.associated_filenames = []
+        self.inference_server_config = inference_server_config
+        self.lock = asyncio.Lock()
+
+    def search(self, query):
+        """Return up to 4000 {"score", "file"} results for a query vector."""
+        distances, indices = self.faiss_index.search(numpy.array([query]), 4000)
+        distances = distances[0]
+        indices = indices[0]
+        try:
+            # Cut the result at the first -1 index (presumably padding for missing results).
+            indices = indices[:numpy.where(indices==-1)[0][0]]
+        except IndexError: pass
+        return [ { "score": float(distance), "file": self.associated_filenames[index] } for index, distance in zip(indices, distances) ]
+
+    async def reload(self):
+        """Embed new or modified files, forget deleted ones, and sync the FAISS index with the database."""
+        async with self.lock:
+            print("Indexing")
+            # NOTE(review): confirm that the pinned aiosqlite accepts the parent_loop argument.
+            conn = await aiosqlite.connect(CONFIG["db_path"], parent_loop=asyncio.get_running_loop())
+            conn.row_factory = aiosqlite.Row
+            await conn.executescript("""
+            CREATE TABLE IF NOT EXISTS files (
+                filename TEXT PRIMARY KEY,
+                modtime REAL NOT NULL,
+                embedding_vector BLOB NOT NULL
+            );
+            """)
+            try:
+                # Files currently on disk, relative to CONFIG["files"].
+                present = set()
+                # Files (re-)embedded during this run.
+                modified = set()
+
+                async with asyncio.TaskGroup() as tg:
+                    # At most three embedding batches are in flight at once.
+                    batch_sem = asyncio.Semaphore(3)
+
+                    async def do_batch(batch):
+                        try:
+                            query = { "images": [ arg[2] for arg in batch ] }
+                            embeddings = await clip_server(query, False)
+                            await conn.executemany("INSERT OR REPLACE INTO files VALUES (?, ?, ?)", [
+                                (filename, modtime, embedding) for (filename, modtime, _), embedding in zip(batch, embeddings)
+                            ])
+                            await conn.commit()
+                            for filename, _, _ in batch:
+                                modified.add(filename)
+                            sys.stdout.write(".")
+                        finally:
+                            batch_sem.release()
+
+                    async def dispatch_batch(batch):
+                        await batch_sem.acquire()
+                        tg.create_task(do_batch(batch))
+
+                    # Modification times recorded in the database by earlier runs.
+                    files = {}
+                    for filename, modtime in await conn.execute_fetchall("SELECT filename, modtime FROM files"):
+                        files[filename] = modtime
+                    await conn.commit()
+                    batch = []
+
+                    for dirpath, _, filenames in os.walk(CONFIG["files"]):
+                        for file in filenames:
+                            path = os.path.join(dirpath, file)
+                            file = os.path.relpath(path, CONFIG["files"])
+                            present.add(file)
+                            st = os.stat(path)
+                            if st.st_mtime != files.get(file):
+                                try:
+                                    with Image.open(path) as raw:
+                                        im = raw.resize(self.inference_server_config["image_size"]).convert("RGB")
+                                    buf = io.BytesIO()
+                                    im.save(buf, format="BMP")
+                                    b = buf.getvalue()
+                                except Exception as e:
+                                    print(file, "failed", e)
+                                    continue
+                                batch.append((file, st.st_mtime, b))
+                                if len(batch) >= self.inference_server_config["batch"]:
+                                    await dispatch_batch(batch)
+                                    batch = []
+                    if batch:
+                        await dispatch_batch(batch)
+
+                # The task group has exited, so every batch is committed and `modified` is complete.
+                # Forget database rows of files that no longer exist on disk.
+                for filename in files:
+                    if filename not in present:
+                        await conn.execute("DELETE FROM files WHERE filename = ?", (filename,))
+                await conn.commit()
+
+                remove_indices = []
+                for index, filename in enumerate(self.associated_filenames):
+                    if filename not in present or filename in modified:
+                        remove_indices.append(index)
+                        self.associated_filenames[index] = None
+                # TODO concurrency
+                # TODO understand what that comment meant
+                if remove_indices:
+                    self.faiss_index.remove_ids(numpy.array(remove_indices))
+                    self.associated_filenames = [ x for x in self.associated_filenames if x is not None ]
+
+                filenames_set = set(self.associated_filenames)
+                new_data = []
+                new_filenames = []
+                async with conn.execute("SELECT * FROM files") as csr:
+                    while row := await csr.fetchone():
+                        filename, modtime, embedding_vector = row
+                        if filename not in filenames_set:
+                            new_data.append(numpy.frombuffer(embedding_vector, dtype="float16"))
+                            new_filenames.append(filename)
+                # numpy.array([]) is 1-D rather than an (n, d) matrix, so only add when something is new.
+                if new_filenames:
+                    self.associated_filenames.extend(new_filenames)
+                    self.faiss_index.add(numpy.array(new_data))
+            finally:
+                await conn.close()
+
+app.router.add_routes(routes)
+
+cors = aiohttp_cors.setup(app, defaults={
+    "*": aiohttp_cors.ResourceOptions(
+        allow_credentials=False,
+        expose_headers="*",
+        allow_headers="*",
+    )
+})
+for route in list(app.router.routes()):
+    cors.add(route)
+
+async def main():
+    """Wait for the CLIP server, build the index, then start serving HTTP."""
+    while True:
+        async with aiohttp.ClientSession() as sess:
+            try:
+                async with await sess.get(CONFIG["clip_server"] + "config") as res:
+                    inference_server_config = umsgpack.unpackb(await res.read())
+                print("Backend config:", inference_server_config)
+                break
+            except Exception:
+                traceback.print_exc()
+                await asyncio.sleep(1)
+    index = Index(inference_server_config)
+    app["index"] = index
+    await index.reload()
+    print("Ready")
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, "", CONFIG["port"])
+    await site.start()
+
+loop = asyncio.new_event_loop()
+asyncio.set_event_loop(loop)
+loop.run_until_complete(main())
+loop.run_forever()
\ No newline at end of file
diff --git a/mse_config.json b/mse_config.json new file mode 100644 index 0000000..254da66 --- /dev/null +++ b/mse_config.json @@ -0,0 +1,6 @@ +{ + "clip_server": "http://localhost:1708/", + "db_path": "/srv/mse/data.sqlite3", + "port": 1707, + "files": "/data/public/memes-or-something/" +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f0e2e73 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +open_clip_torch==2.20.0 +Pillow==10.0.1 +prometheus-client==0.17.1 +u-msgpack-python==2.8.0 +aiohttp==3.8.5 +aiohttp-cors==0.7.0 +faiss-cpu==1.7.4 +aiosqlite==0.19.0 \ No newline at end of file