mirror of
https://github.com/osmarks/meme-search-engine.git
synced 2025-01-21 22:46:59 +00:00
preliminary meme rater work
This commit is contained in:
parent
e9a7493343
commit
0b0261f625
10
.gitignore
vendored
10
.gitignore
vendored
@ -1,2 +1,10 @@
|
||||
*/node_modules
|
||||
*/static/app.*
|
||||
*/static/app.*
|
||||
|
||||
# Added by cargo
|
||||
|
||||
/target
|
||||
meme-rater/images/
|
||||
meme-rater/meta/
|
||||
meme-rater/*.sqlite3*
|
||||
meme-rater/deploy_for_training.sh
|
118
meme-rater/crawler.py
Normal file
118
meme-rater/crawler.py
Normal file
@ -0,0 +1,118 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import aiofiles
|
||||
import os.path
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
|
||||
async def fetch_list_seg(sess, list_url, query):
    """Fetch one segment of a listing and return its decoded JSON body.

    A GET request is sent to ``list_url`` with ``.json`` appended, using
    ``query`` as the URL query parameters.
    """
    json_url = list_url + ".json"
    async with sess.get(json_url, params=query) as response:
        payload = await response.json()
    return payload
|
||||
|
||||
async def fetch_past(sess, list_url, n):
    """Page backwards through a listing, yielding one page of items at a time.

    Each yielded value is a list holding the ``"data"`` dict of every child on
    that page. Paging continues until at least ``n`` items have been seen or
    the listing returns an empty page, whichever comes first.

    A response without a ``"data"`` key is printed and the same page is
    retried after a 400 second pause.
    """
    after = None
    count = 0
    while count < n:
        # NOTE(review): Reddit's listing API appears to use `limit` for the
        # page size and `count` for "items already seen" - confirm that
        # sending a constant count of 25 is intended.
        args = {"count": 25}
        if after is not None:
            args["after"] = after
        chunk = await fetch_list_seg(sess, list_url, args)
        if "data" not in chunk:
            # No listing in the response: show it, back off, retry this page.
            print("\n", chunk)
            await asyncio.sleep(400)
            continue
        new_items = chunk["data"]["children"]
        if not new_items:
            # The listing ran out before n items were collected. Without this
            # guard the new_items[-1] lookup below raises IndexError.
            return
        yield [i["data"] for i in new_items]
        count += len(new_items)
        print("\nup to", count)
        # The last item's name is the cursor for the next (older) page.
        after = new_items[-1]["data"]["name"]
|
||||
|
||||
SEEN_ITEMS_SIZE = 200
# Seconds to wait before the next poll when the current page gives no usable
# estimate of the time between posts.
FALLBACK_POLL_INTERVAL = 60


async def fetch_stream(sess, list_url):
    """Poll a listing forever, yielding the data dict of each unseen item.

    The names of the most recent ``SEEN_ITEMS_SIZE`` yielded items are
    remembered so an item that is still on the page at the next poll is not
    yielded again. After every poll the generator sleeps for the average gap
    between the ``created`` values of consecutive items on the page.
    """
    # dicts are ordered, so this is a very janky ordered set implementation
    seen = {}
    while True:
        response = await fetch_list_seg(sess, list_url, {})
        if "data" not in response:
            # Same handling as fetch_past: show the response, back off, retry.
            print("\n", response)
            await asyncio.sleep(400)
            continue
        list_items = response["data"]["children"]
        new = [i["data"] for i in list_items if i["data"]["name"] not in seen]
        # yield the new items
        for n in new:
            yield n
        # add new items to list of seen things
        seen.update(dict.fromkeys(i["name"] for i in new))
        # remove old seen items until it's a reasonable size
        while len(seen) > SEEN_ITEMS_SIZE:
            del seen[next(iter(seen))]
        # compute average time between posts and wait that long for next fetch cycle
        times = [i["data"]["created"] for i in list_items]
        timediffs = [a - b for a, b in zip(times, times[1:])]
        # Fewer than two items leaves nothing to average (previously a
        # ZeroDivisionError), and a non-positive average would turn this
        # loop into back-to-back requests, so fall back to a fixed delay.
        average = sum(timediffs) / len(timediffs) if timediffs else 0
        await asyncio.sleep(average if average > 0 else FALLBACK_POLL_INTERVAL)
|
||||
|
||||
def bucket(id):
    """Return the two-character bucket name for ``id``.

    The bucket is the first two hex digits of the MD5 digest of the UTF-8
    encoded ``id``.
    """
    digest = hashlib.md5(id.encode("utf-8"))
    return digest.hexdigest()[:2]
|
||||
|
||||
# Accepted response media types and the file extension each is saved under.
filetypes = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/webp": "webp",
    "image/avif": "avif",
}

CHUNK_SIZE = 1 << 18  # entirely arbitrary
# Responses that declare a larger body than this (or declare none) are skipped.
MAX_DOWNLOAD_BYTES = 8e6


def _media_type(content_type):
    """Return the bare lowercase media type of a Content-Type value, or None."""
    if content_type is None:
        return None
    # Drop parameters such as "; charset=..." and normalise the case so the
    # lookup in `filetypes` does not miss an otherwise acceptable image.
    return content_type.split(";", 1)[0].strip().lower()


async def download(sess, url, file):
    """Download ``url`` to ``file`` plus an extension chosen by content type.

    Parameters:
        sess: session object whose ``get(url)`` is used for the request.
        url: address to fetch.
        file: destination path without extension.

    Returns:
        A dict of the response headers when the body was saved, or ``None``
        when the response was skipped because its content type is not in
        ``filetypes`` or its declared length is missing or above
        ``MAX_DOWNLOAD_BYTES``.

    Any error raised while transferring is propagated after the partially
    written file has been removed.
    """
    async with sess.get(url) as res:
        ctype = _media_type(res.headers.get("content-type"))
        if ctype not in filetypes:
            return
        # A missing content-length falls back to 1e9, so it is skipped too.
        if int(res.headers.get("content-length", 1e9)) > MAX_DOWNLOAD_BYTES:
            return
        path = file + "." + filetypes[ctype]
        try:
            async with aiofiles.open(path, mode="wb") as fh:
                while chunk := await res.content.read(CHUNK_SIZE):
                    await fh.write(chunk)
        except BaseException:
            # BaseException so that task cancellation is covered as well:
            # never leave a truncated image behind.
            try:
                os.remove(path)
            except OSError:
                # Nothing to clean up (or it cannot be removed); the original
                # error is the one worth reporting.
                pass
            raise
        return dict(res.headers)
|
||||
|
||||
if __name__ == "__main__":
    async def main():
        """Walk the listing page by page and archive every eligible image.

        Progress is reported on stdout: "." for every item looked at, "|" when
        a download starts and "!" when the item already has a metadata file.
        """
        # Caps the number of items being processed at the same time.
        sem = asyncio.Semaphore(16)

        async with aiohttp.ClientSession() as sess:
            async def download_item(item):
                """Save one item's image and metadata unless it is skipped.

                Items are skipped when flagged over_18, not robot indexable,
                not served over https, or already stored (metadata exists).
                """
                print(".", end="")
                sys.stdout.flush()
                if item["over_18"] or not item["is_robot_indexable"]:
                    return
                if not item["url"].startswith("https://"):
                    return
                item_id = item["name"]
                bck = bucket(item_id)
                meta_path = os.path.join("meta", bck, item_id + ".json")
                # The metadata file is written last, so its presence marks
                # an item as completely stored.
                if os.path.exists(meta_path):  # sorry
                    print("!", end="")
                    sys.stdout.flush()
                    return
                # Create the bucket directories only once a download is
                # actually going to happen.
                os.makedirs(os.path.join("images", bck), exist_ok=True)
                os.makedirs(os.path.join("meta", bck), exist_ok=True)
                print("|", end="")
                sys.stdout.flush()
                try:
                    result = await download(sess, item["url"], os.path.join("images", bck, item_id))
                except Exception as e:
                    # Best effort: one bad item must not stop the crawl.
                    print("\nMeme acquisition failure:", e)
                    return
                if result:
                    item["headers"] = result
                    with open(meta_path, "w") as fh:
                        json.dump(item, fh)

            async def dl_task(item):
                """Run download_item under the concurrency cap with a 30 s limit."""
                async with sem:
                    try:
                        await asyncio.wait_for(download_item(item), timeout=30)
                    except asyncio.TimeoutError:
                        # Report instead of dropping the item silently.
                        print("\nMeme acquisition timeout:", item["name"])

            async for items in fetch_past(sess, "https://www.reddit.com/user/osmarks/m/memeharvesting/new", 20000):
                # The task group waits for every task of this page before the
                # next page is requested, so no extra synchronisation on the
                # semaphore is needed here.
                async with asyncio.TaskGroup() as tg:
                    for item in items:
                        tg.create_task(dl_task(item))

    asyncio.run(main())
|
6
meme-rater/rater_mse_config.json
Normal file
6
meme-rater/rater_mse_config.json
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"clip_server": "http://100.64.0.10:1708/",
|
||||
"db_path": "data.sqlite3",
|
||||
"port": 1707,
|
||||
"files": "./images"
|
||||
}
|
111
meme-rater/rater_server.py
Normal file
111
meme-rater/rater_server.py
Normal file
@ -0,0 +1,111 @@
|
||||
from aiohttp import web
|
||||
import aiosqlite
|
||||
import asyncio
|
||||
import random
|
||||
import sys
|
||||
|
||||
# Exactly two command-line arguments are expected: the port to listen on and
# the path of the SQLite database. Any other count fails the unpacking here.
PORT, DATABASE = sys.argv[1:]

# Request bodies are capped at 32 MiB.
app = web.Application(client_max_size=32 * 1024 * 1024)

# Handlers register themselves on this table through its decorators.
routes = web.RouteTableDef()
|
||||
|
||||
async def get_pair(db):
    """Pick a random pair of distinct files that has no rating yet.

    Parameters:
        db: database connection providing ``execute_fetchall``.

    Returns:
        A tuple ``(m1, m2)`` of filenames with ``m1 <= m2``; the ordering
        matches the (meme1, meme2) lookup done against the ratings table.

    Raises:
        ValueError: if the files table holds fewer than two filenames.

    Pairs are sampled until an unrated one turns up, so this does not return
    if every possible pair has already been rated.
    """
    # Load the candidate list once instead of re-reading the whole table on
    # every retry.
    rows = await db.execute_fetchall("SELECT filename FROM files", ())
    filenames = [row[0] for row in rows]
    if len(filenames) < 2:
        raise ValueError("need at least two files to form a pair")
    while True:
        m1, m2 = sorted(random.sample(filenames, 2))
        # execute_fetchall also takes care of closing the cursor.
        already_rated = await db.execute_fetchall(
            "SELECT 1 FROM ratings WHERE meme1 = ? AND meme2 = ?", (m1, m2)
        )
        if not already_rated:
            return m1, m2
|
||||
|
||||
@routes.get("/")
async def index(request):
    """Serve the rating page for one randomly chosen, not yet rated pair.

    The page shows both images and a form that posts the two filenames plus
    the chosen rating ("1" or "2") to /rate. Pressing the 1 or 2 key, or
    clicking an image, selects the matching option and submits the form.
    """
    # Imported here so the handler carries its own dependencies.
    from html import escape
    from urllib.parse import quote

    meme1, meme2 = await get_pair(request.app["db"])
    # Filenames are data, not markup: escape them for the attribute values and
    # additionally percent-encode them where they become part of a URL, so
    # characters such as quotes, "&", "#", "?" or spaces cannot break the page.
    meme1_value = escape(meme1)
    meme2_value = escape(meme2)
    meme1_src = escape("/memes/" + quote(meme1))
    meme2_src = escape("/memes/" + quote(meme2))
    return web.Response(text=f"""
<!DOCTYPE html>
<html>
    <style>
        .memes img {{
            width: 45%;
        }}

        @media (max-width: 768px) {{
            .memes img {{
                width: 100%;
            }}
        }}

        .memes {{
            margin-top: 2em;
        }}
    </style>
    <body>
        <h1>Meme Rating</h1>
        <form action="/rate" method="POST">
            <input type="radio" name="rating" value="1" id="rating1"> <label for="rating1">Meme 1 is better</label>
            <input type="radio" name="rating" value="2" id="rating2"> <label for="rating2">Meme 2 is better</label>

            <input type="hidden" name="meme1" value="{meme1_value}">
            <input type="hidden" name="meme2" value="{meme2_value}">
            <input type="submit" value="Submit">
            <div class="memes">
                <img src="{meme1_src}" id="meme1">
                <img src="{meme2_src}" id="meme2">
            </div>
        </form>
        <script>
            document.addEventListener("keypress", function(event) {{
                if (event.key === "1") {{
                    document.querySelector("input[name='rating'][value='1']").checked = true
                    document.querySelector("form").submit()
                }} else if (event.key === "2") {{
                    document.querySelector("input[name='rating'][value='2']").checked = true
                    document.querySelector("form").submit()
                }}
            }});
            document.querySelector("#meme1").addEventListener("click", function(event) {{
                document.querySelector("input[name='rating'][value='1']").checked = true
                document.querySelector("form").submit()
            }})
            document.querySelector("#meme2").addEventListener("click", function(event) {{
                document.querySelector("input[name='rating'][value='2']").checked = true
                document.querySelector("form").submit()
            }})
        </script>
    </body>
</html>
""", content_type="text/html")
|
||||
|
||||
@routes.post("/rate")
async def rate(request):
    """Store the submitted rating for a pair and redirect to a new pair.

    Expects the form fields ``meme1``, ``meme2`` and ``rating`` ("1" or "2").

    Raises:
        web.HTTPBadRequest: if a field is missing, is not plain text, or the
            rating is not "1" or "2" (for example when the form is submitted
            without choosing an option).
        web.HTTPFound: always on success, redirecting the browser to "/".
    """
    db = request.app["db"]
    post = await request.post()
    meme1 = post.get("meme1")
    meme2 = post.get("meme2")
    rating = post.get("rating")
    # Reject incomplete or malformed submissions up front instead of failing
    # with a KeyError (and a 500 response) further down.
    fields_ok = isinstance(meme1, str) and isinstance(meme2, str) and meme1 and meme2
    if not fields_ok or rating not in ("1", "2"):
        raise web.HTTPBadRequest(text="meme1, meme2 and a rating of 1 or 2 are required")
    # (meme1, meme2) is UNIQUE in the ratings table; OR REPLACE lets a
    # re-submitted pair (browser back button, double submit) overwrite the
    # earlier rating rather than abort with an integrity error.
    await db.execute(
        "INSERT OR REPLACE INTO ratings (meme1, meme2, rating) VALUES (?, ?, ?)",
        (meme1, meme2, rating),
    )
    await db.commit()
    # HTTP exceptions are meant to be raised; returning them is deprecated.
    raise web.HTTPFound("/")
|
||||
|
||||
async def main():
    """Open the database, register the routes and start listening.

    The connection is stored on the application under the "db" key, the
    ratings table is created if it does not exist yet, images are served from
    ./images under /memes/, and a TCP site is started on PORT. The coroutine
    returns as soon as the site has been started.
    """
    connection = await aiosqlite.connect(DATABASE)
    app["db"] = connection
    await connection.executescript("""
CREATE TABLE IF NOT EXISTS ratings (
    meme1 TEXT NOT NULL,
    meme2 TEXT NOT NULL,
    rating TEXT NOT NULL,
    UNIQUE (meme1, meme2)
);
""")

    router = app.router
    router.add_routes(routes)
    router.add_static("/memes/", "./images")
    print("Ready")

    runner = web.AppRunner(app)
    await runner.setup()
    # An empty host string is passed through to TCPSite unchanged.
    await web.TCPSite(runner, "", int(PORT)).start()
|
||||
|
||||
if __name__ == "__main__":
    # main() only starts the site and then returns, so the event loop is kept
    # running afterwards to go on serving requests.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(main())
    event_loop.run_forever()
|
Loading…
Reference in New Issue
Block a user