1
0
mirror of https://github.com/osmarks/meme-search-engine.git synced 2024-11-10 22:09:54 +00:00

preliminary meme rater work

This commit is contained in:
osmarks 2024-04-20 16:55:11 +01:00
parent e9a7493343
commit 0b0261f625
4 changed files with 244 additions and 1 deletions

10
.gitignore vendored
View File

@ -1,2 +1,10 @@
*/node_modules
*/static/app.*
*/static/app.*
# Added by cargo
/target
meme-rater/images/
meme-rater/meta/
meme-rater/*.sqlite3*
meme-rater/deploy_for_training.sh

118
meme-rater/crawler.py Normal file
View File

@ -0,0 +1,118 @@
import aiohttp
import asyncio
import aiofiles
import os.path
import hashlib
import json
import time
import sys
async def fetch_list_seg(sess, list_url, query):
    """Fetch one page of a listing and return the decoded JSON body.

    ``.json`` is appended to ``list_url`` and ``query`` is sent as the URL
    query parameters. ``sess`` must offer an aiohttp-style ``get`` that is
    usable as an async context manager.
    """
    endpoint = list_url + ".json"
    async with sess.get(endpoint, params=query) as response:
        payload = await response.json()
    return payload
async def fetch_past(sess, list_url, n):
    """Page through a listing, yielding one list of item dicts per request.

    Async generator. Each yielded value is the list of ``child["data"]``
    dicts from one page. Paging continues from the ``name`` of the last item
    on the previous page (sent as the ``after`` parameter) until at least
    ``n`` items have been yielded or the listing returns an empty page.

    A response without a ``"data"`` key is printed and the same page is
    retried after a 400 second pause (presumably an error / rate-limit
    payload -- confirm against the API being crawled).
    """
    after = None
    count = 0
    while count < n:
        args = {"count": 25}
        if after is not None:
            args["after"] = after
        chunk = await fetch_list_seg(sess, list_url, args)
        if "data" not in chunk:
            print("\n", chunk)
            await asyncio.sleep(400)
            continue
        new_items = chunk["data"]["children"]
        if not new_items:
            # An empty page means there is nothing left to fetch. Without
            # this guard ``new_items[-1]`` below raises IndexError.
            break
        yield [i["data"] for i in new_items]
        count += len(new_items)
        print("\nup to", count)
        after = new_items[-1]["data"]["name"]
# Upper bound on how many item names fetch_stream remembers for de-duplication.
SEEN_ITEMS_SIZE = 200


async def fetch_stream(sess, list_url, default_interval=60):
    """Poll a listing forever, yielding each item dict not seen recently.

    Async generator that never terminates on its own. After every poll it
    sleeps for the average gap between the ``created`` values of adjacent
    items on the fetched page.

    ``default_interval`` (seconds) is used instead when that average cannot
    be computed (fewer than two items on the page) or is not positive, so
    the loop neither divides by zero nor re-polls without any delay.
    """
    # dicts preserve insertion order, so this doubles as an ordered set:
    # the first key is always the oldest remembered name.
    seen = {}
    while True:
        list_items = (await fetch_list_seg(sess, list_url, {}))["data"]["children"]
        new = [i["data"] for i in list_items if i["data"]["name"] not in seen]
        # yield the new items
        for n in new:
            yield n
        # add new items to the set of seen names
        seen.update(dict.fromkeys(i["name"] for i in new))
        # evict the oldest names until the set is back to a reasonable size
        while len(seen) > SEEN_ITEMS_SIZE:
            del seen[next(iter(seen))]
        # compute average time between posts and wait that long before the
        # next fetch cycle
        times = [i["data"]["created"] for i in list_items]
        timediffs = [earlier - later for earlier, later in zip(times, times[1:])]
        delay = sum(timediffs) / len(timediffs) if timediffs else default_interval
        if delay <= 0:
            delay = default_interval
        await asyncio.sleep(delay)
def bucket(id):
    """Return the first two hex digits of the MD5 of ``id`` (UTF-8 encoded).

    The crawler uses the result as a subdirectory name, which spreads files
    over at most 256 directories.
    """
    digest = hashlib.md5(id.encode("utf-8")).hexdigest()
    return digest[:2]
# Content-Type values the crawler will save, mapped to the file extension
# used for the saved file.
filetypes = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/webp": "webp",
    "image/avif": "avif",
}

# Number of bytes requested per read while streaming a response body to disk.
CHUNK_SIZE = 1 << 18  # entirely arbitrary
async def download(sess, url, file, max_size=8e6):
    """Stream ``url`` to disk if it is an acceptable image.

    Args:
        sess: aiohttp-style client session used for the request.
        url: address to fetch.
        file: destination path *without* extension; the extension is chosen
            from the response's media type via ``filetypes``.
        max_size: largest declared Content-Length (bytes) that is accepted.

    Returns:
        The response headers as a plain dict when the file was written, or
        ``None`` when the response was skipped: unknown or missing media
        type, missing Content-Length, or declared size above ``max_size``.

    Note:
        Only the *declared* length is checked; the body itself is not
        counted while streaming.
    """
    async with sess.get(url) as res:
        raw_ctype = res.headers.get("content-type")
        if raw_ctype is None:
            return None
        # Media types are case-insensitive and may carry parameters
        # ("image/jpeg; charset=binary"); compare only the bare type.
        ctype = raw_ctype.split(";", 1)[0].strip().lower()
        if ctype not in filetypes:
            return None
        declared_length = res.headers.get("content-length")
        # A response that does not declare its size is treated as too large.
        if declared_length is None or int(declared_length) > max_size:
            return None
        async with aiofiles.open(file + "." + filetypes[ctype], mode="wb") as fh:
            while chunk := await res.content.read(CHUNK_SIZE):
                await fh.write(chunk)
        return dict(res.headers)
if __name__ == "__main__":
    async def main():
        """Crawl the listing and store every acceptable image with its metadata.

        Images are written under ``images/<bucket>/`` and the item JSON (with
        the response headers added) under ``meta/<bucket>/``. Progress is
        printed as single characters: ``.`` per item considered, ``|`` when a
        download starts, ``!`` when the metadata file already exists.
        """
        sem = asyncio.Semaphore(16)
        async with aiohttp.ClientSession() as sess:
            async def download_item(item):
                # Fetch one item unless it is filtered out or already stored.
                print(".", end="", flush=True)
                if item["over_18"] or not item["is_robot_indexable"]:
                    return
                item_id = item["name"]
                shard = bucket(item_id)
                os.makedirs(os.path.join("images", shard), exist_ok=True)
                os.makedirs(os.path.join("meta", shard), exist_ok=True)
                if not item["url"].startswith("https://"):
                    return
                meta_path = os.path.join("meta", shard, item_id + ".json")
                if os.path.exists(meta_path):
                    # The metadata file marks an item as already handled.
                    print("!", end="", flush=True)
                    return
                print("|", end="", flush=True)
                image_base = os.path.join("images", shard, item_id)
                try:
                    result = await download(sess, item["url"], image_base)
                except Exception as e:
                    print("\nMeme acquisition failure:", e)
                    return
                if result:
                    item["headers"] = result
                    with open(meta_path, "w") as fh:
                        json.dump(item, fh)

            async def dl_task(item):
                # Limit concurrency with the semaphore and give up on any
                # single item after 30 seconds.
                async with sem:
                    try:
                        await asyncio.wait_for(download_item(item), timeout=30)
                    except asyncio.TimeoutError:
                        pass

            listing = "https://www.reddit.com/user/osmarks/m/memeharvesting/new"
            async for items in fetch_past(sess, listing, 20000):
                # Wait until at least one semaphore slot is free, then give
                # it straight back.
                async with sem:
                    pass
                # The task group waits for the whole batch before the next
                # page is requested.
                async with asyncio.TaskGroup() as tg:
                    for item in items:
                        tg.create_task(dl_task(item))
    asyncio.run(main())

View File

@ -0,0 +1,6 @@
{
"clip_server": "http://100.64.0.10:1708/",
"db_path": "data.sqlite3",
"port": 1707,
"files": "./images"
}

111
meme-rater/rater_server.py Normal file
View File

@ -0,0 +1,111 @@
from aiohttp import web
import aiosqlite
import asyncio
import random
import sys
# Exactly two command-line arguments are required: the TCP port to listen on
# and the path of the SQLite database. Any other count makes this unpacking
# raise ValueError at import time.
PORT, DATABASE = sys.argv[1:]
# client_max_size is set to 32 MiB (32 * 1024**2 bytes).
app = web.Application(client_max_size=32*1024**2)
# Route table filled by the @routes decorators below and registered in main().
routes = web.RouteTableDef()
async def get_pair(db):
    """Pick a random pair of filenames that has no row in ``ratings`` yet.

    Args:
        db: database connection exposing ``execute_fetchall`` and ``execute``
            (aiosqlite-style). It must contain a ``files`` table with a
            ``filename`` column and a ``ratings`` table.

    Returns:
        ``(meme1, meme2)`` with ``meme1 < meme2``. The pair is sorted so each
        unordered pair has a single canonical representation.

    Raises:
        ValueError: fewer than two filenames are available.

    Note:
        If every possible pair has already been rated this loops forever.
    """
    # The file list does not change between attempts, so load it only once.
    rows = await db.execute_fetchall("SELECT filename FROM files", ())
    filenames = [row[0] for row in rows]
    if len(filenames) < 2:
        raise ValueError("at least two files are required to form a pair")
    while True:
        m1, m2 = sorted(random.sample(filenames, 2))
        csr = await db.execute("SELECT 1 FROM ratings WHERE meme1 = ? AND meme2 = ?", (m1, m2))
        if not await csr.fetchone():
            return m1, m2
@routes.get("/")
async def index(request):
    """Serve the rating page for one randomly chosen, not-yet-rated pair.

    The page shows both images and a form that posts the two filenames and
    the chosen rating to ``/rate``. Pressing ``1``/``2`` or clicking an image
    selects the matching radio button and submits the form.

    Filenames come from the database and are therefore escaped before being
    placed in HTML attribute values and percent-encoded before being placed
    in image URLs.
    """
    from html import escape
    from urllib.parse import quote

    meme1, meme2 = await get_pair(request.app["db"])
    # Attribute values: the browser decodes the entities again, so the form
    # still posts the original filenames.
    meme1_value = escape(meme1)
    meme2_value = escape(meme2)
    # URL paths: "/" is left alone so filenames containing directories work.
    meme1_src = quote("/memes/" + meme1)
    meme2_src = quote("/memes/" + meme2)
    return web.Response(text=f"""
<!DOCTYPE html>
<html>
<style>
.memes img {{
width: 45%;
}}
@media (max-width: 768px) {{
.memes img {{
width: 100%;
}}
}}
.memes {{
margin-top: 2em;
}}
</style>
<body>
<h1>Meme Rating</h1>
<form action="/rate" method="POST">
<input type="radio" name="rating" value="1" id="rating1"> <label for="rating1">Meme 1 is better</label>
<input type="radio" name="rating" value="2" id="rating2"> <label for="rating2">Meme 2 is better</label>
<input type="hidden" name="meme1" value="{meme1_value}">
<input type="hidden" name="meme2" value="{meme2_value}">
<input type="submit" value="Submit">
<div class="memes">
<img src="{meme1_src}" id="meme1">
<img src="{meme2_src}" id="meme2">
</div>
</form>
<script>
document.addEventListener("keypress", function(event) {{
if (event.key === "1") {{
document.querySelector("input[name='rating'][value='1']").checked = true
document.querySelector("form").submit()
}} else if (event.key === "2") {{
document.querySelector("input[name='rating'][value='2']").checked = true
document.querySelector("form").submit()
}}
}});
document.querySelector("#meme1").addEventListener("click", function(event) {{
document.querySelector("input[name='rating'][value='1']").checked = true
document.querySelector("form").submit()
}})
document.querySelector("#meme2").addEventListener("click", function(event) {{
document.querySelector("input[name='rating'][value='2']").checked = true
document.querySelector("form").submit()
}})
</script>
</body>
</html>
""", content_type="text/html")
@routes.post("/rate")
async def rate(request):
    """Store one submitted rating and redirect back to the rating page.

    Expects the form fields ``meme1``, ``meme2`` and ``rating``.

    Raises:
        web.HTTPBadRequest: a required form field is missing (for example
            the form was submitted without choosing a rating).
        web.HTTPFound: always, on success, to redirect the browser to ``/``.
    """
    db = request.app["db"]
    post = await request.post()
    try:
        meme1 = post["meme1"]
        meme2 = post["meme2"]
        rating = post["rating"]
    except KeyError as err:
        # Report a client error instead of an unhandled-exception 500.
        raise web.HTTPBadRequest(text=f"missing form field: {err.args[0]}") from None
    await db.execute("INSERT INTO ratings (meme1, meme2, rating) VALUES (?, ?, ?)", (meme1, meme2, rating))
    await db.commit()
    # aiohttp HTTP exceptions are meant to be raised; returning one is
    # deprecated.
    raise web.HTTPFound("/")
async def main():
    """Open the database, ensure the schema exists, and start the HTTP server.

    Returns as soon as the site is listening; the caller has to keep the
    event loop running for requests to be served.
    """
    connection = await aiosqlite.connect(DATABASE)
    app["db"] = connection
    await connection.executescript("""
CREATE TABLE IF NOT EXISTS ratings (
meme1 TEXT NOT NULL,
meme2 TEXT NOT NULL,
rating TEXT NOT NULL,
UNIQUE (meme1, meme2)
);
""")
    router = app.router
    router.add_routes(routes)
    # Image files are served straight from the local ./images directory.
    router.add_static("/memes/", "./images")
    print("Ready")
    runner = web.AppRunner(app)
    await runner.setup()
    await web.TCPSite(runner, "", int(PORT)).start()
if __name__ == "__main__":
    # main() only starts the site and then returns, so the loop is kept
    # running afterwards to actually serve requests.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(main())
    event_loop.run_forever()