Mirror of https://github.com/osmarks/meme-search-engine.git (synced 2024-09-21 10:09:36 +00:00)
meme-search-engine/meme-rater/crawler.py

import aiohttp
import asyncio
import aiofiles
import os.path
import hashlib
import json
import time
import sys
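
# Crawler: walks a Reddit listing (a multireddit's /new feed), downloads image
# posts into hash-bucketed directories under images/, and stores each post's
# metadata as JSON under meta/.

# Fetch one page of a listing as JSON, passing pagination parameters through.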
async def fetch_list_seg(sess, list_url, query):
    async with sess.get(list_url + ".json", params=query) as res:
        return await res.json()
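
# Page backwards through the listing with Reddit's "after" cursor until roughly
# n items have been yielded; on an error response (no "data" key, e.g. when
# rate-limited), back off and retry.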
async def fetch_past(sess, list_url, n):
    after = None
    count = 0
    while count < n:
        args = { "count": 25 }
        if after is not None: args["after"] = after
        chunk = await fetch_list_seg(sess, list_url, args)
        if "data" not in chunk:
            print("\n", chunk)
            await asyncio.sleep(400)
            continue
        new_items = chunk["data"]["children"]
        yield [ i["data"] for i in new_items ]
        count += len(new_items)
        print("\nup to", count)
        after = new_items[-1]["data"]["name"]
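
# Continuously poll the listing for new posts, deduplicating against a bounded
# "seen" set and pacing requests by the observed average gap between posts.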
SEEN_ITEMS_SIZE = 200
async def fetch_stream(sess, list_url):
    # dicts are ordered, so this is a very janky ordered set implementation
    seen = {}
    while True:
        list_items = (await fetch_list_seg(sess, list_url, {}))["data"]["children"]
        new = [ i["data"] for i in list_items if i["data"]["name"] not in seen ]
        # yield the new items
        for n in new: yield n
        # add new items to list of seen things
        seen.update(dict.fromkeys(i["name"] for i in new))
        # remove old seen items until it's a reasonable size
        while len(seen) > SEEN_ITEMS_SIZE: seen.pop(next(iter(seen.keys())))
        # compute average time between posts and wait that long for next fetch cycle
        times = [ i["data"]["created"] for i in list_items ]
        timediffs = list(map(lambda x: x[0] - x[1], zip(times, times[1:])))
        average = sum(timediffs) / len(timediffs)
        await asyncio.sleep(average)
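
# fetch_stream is defined above but not used by the entry point below. Purely
# as an illustrative sketch (hypothetical, not part of the original crawler),
# it could be driven like this; the handler just prints post names:
async def stream_demo(list_url):
    async with aiohttp.ClientSession() as sess:
        async for post in fetch_stream(sess, list_url):
            print(post["name"], post.get("url"))

# Posts are spread across 256 subdirectories keyed by the first two hex digits
# of the MD5 of the post ID, to keep directory sizes manageable.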
def bucket(id): return hashlib.md5(id.encode("utf-8")).hexdigest()[:2]
filetypes = {
    "image/png": "png",
    "image/jpeg": "jpg",
    "image/webp": "webp",
    "image/avif": "avif"
}
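
# Download a single image: only whitelisted content types are accepted, bodies
# larger than ~8 MB (or with no Content-Length) are skipped, and the file is
# streamed to disk in CHUNK_SIZE pieces. Returns the response headers on
# success, None otherwise.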
CHUNK_SIZE = 1<<18 # entirely arbitrary
async def download(sess, url, file):
    async with sess.get(url) as res:
        ctype = res.headers.get("content-type")
        if ctype not in filetypes: return
        if int(res.headers.get("content-length", 1e9)) > 8e6: return
        async with aiofiles.open(file + "." + filetypes[ctype], mode="wb") as fh:
            while chunk := await res.content.read(CHUNK_SIZE):
                await fh.write(chunk)
        return dict(res.headers)
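
# Entry point: page back through the multireddit's /new listing (up to ~20000
# posts), downloading each safe-for-work, still-indexable https post at most
# once (posts whose metadata JSON already exists are skipped). At most 16
# downloads run concurrently, each capped at 30 seconds; the acquire/release
# on the semaphore before each batch stalls further listing fetches while the
# download pool is saturated.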
if __name__ == "__main__":
    async def main():
        sem = asyncio.Semaphore(16)
        async with aiohttp.ClientSession() as sess:
            async def download_item(item):
                #print("starting on", item["name"])
                print(".", end="")
                sys.stdout.flush()
                if item["over_18"] or not item["is_robot_indexable"]: return
                id = item["name"]
                bck = bucket(id)
                os.makedirs(os.path.join("images", bck), exist_ok=True)
                os.makedirs(os.path.join("meta", bck), exist_ok=True)
                if not item["url"].startswith("https://"): return
                meta_path = os.path.join("meta", bck, id + ".json")
                if not os.path.exists(meta_path): # sorry
                    print("|", end="")
                    sys.stdout.flush()
                    try:
                        result = await download(sess, item["url"], os.path.join("images", bck, id))
                    except Exception as e:
                        print("\nMeme acquisition failure:", e)
                        return
                    if result:
                        item["headers"] = result
                        with open(meta_path, "w") as fh:
                            json.dump(item, fh)
                else:
                    print("!", end="")
                    sys.stdout.flush()
                #print("done on", id)
            async def dl_task(item):
                async with sem:
                    try:
                        await asyncio.wait_for(download_item(item), timeout=30)
                    except asyncio.TimeoutError: pass
            async for items in fetch_past(sess, "https://www.reddit.com/user/osmarks/m/memeharvesting/new", 20000):
                #print("got new chunk")
                await sem.acquire()
                sem.release()
                #print("downloading new set")
                async with asyncio.TaskGroup() as tg:
                    for item in items:
                        tg.create_task(dl_task(item))

    asyncio.run(main())