mirror of https://github.com/osmarks/random-stuff synced 2026-05-14 17:42:10 +00:00

stigler problem

osmarks
2026-04-13 15:44:44 +01:00
parent cea6c91903
commit e7a198e8b8
7 changed files with 15674 additions and 0 deletions
+5907
File diff suppressed because it is too large
+177
@@ -0,0 +1,177 @@
import scipy.optimize
import json
import numpy as np
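# Stigler-diet-style linear program: pick nonnegative amounts of each food to
# minimise total cost subject to lower/upper bounds on daily nutrient intake.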
foods = []
constraint_metrics = {
"energy": (2000, 3000),
"fat": (50, 90),
"saturate": (0, 25),
"carbohydrate": (0, 5000),
"sugar": (0, 5000),
"protein": (150, 500),
"salt": (0, 6),
"fibre": (30, 60),
"starch": (0, 5000)
}
# thanks, Codex
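# Rough protein-quality multipliers by food class (PDCAAS-like heuristics,
# not taken from a published table); labeled protein is discounted by these.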
protein_quality = {
"protein_supplement": 1.00,
"egg": 1.00,
"dairy": 0.98,
"meat_fish": 1.00,
"soy": 0.95,
"legume": 0.80,
"nuts_seeds": 0.70,
"pseudo_meat": 0.75,
"cereal": 0.55,
"mixed_or_ambiguous": 0.70,
"unknown": 0.65,
}
protein_keywords = {
"protein_supplement": [
"protein powder", "protein shake", "protein bar", "clear whey",
"mass gainer", "meal replacement", "rtd protein",
],
"egg": [
"egg", "eggs", "omelette", "omelet", "frittata", "quiche",
"egg mayo", "egg salad",
],
"dairy": [
"milk", "whole milk", "semi skimmed", "skimmed", "yoghurt", "yogurt",
"greek yoghurt", "greek yogurt", "skyr", "kefir", "buttermilk",
"cheese", "cheddar", "mozzarella", "parmesan", "grana padano",
"pecorino", "edam", "gouda", "brie", "camembert", "stilton",
"halloumi", "paneer", "cottage cheese", "cream cheese", "quark",
"ricotta", "mascarpone", "whey", "casein", "custard",
],
"meat_fish": [
"chicken", "turkey", "beef", "steak", "mince", "pork", "ham",
"bacon", "gammon", "sausage", "salami", "pepperoni", "chorizo",
"prosciutto", "lamb", "mutton", "duck", "venison", "veal",
"liver", "kidney", "black pudding", "pate", "paté",
"fish", "salmon", "tuna", "cod", "haddock", "mackerel", "sardine",
"sardines", "anchovy", "anchovies", "trout", "pollock", "hake",
"seabass", "sea bass", "bream", "prawn", "prawns", "shrimp",
"crab", "lobster", "mussel", "mussels", "clam", "clams",
"oyster", "oysters", "squid", "calamari", "octopus",
],
"soy": [
"soy", "soya", "soybean", "soybeans", "tofu", "tempeh", "edamame",
"miso", "natto", "tvp", "textured vegetable protein",
"soy mince", "soya mince", "soy chunks", "soya chunks",
"soy protein", "soya protein",
],
"legume": [
"bean", "beans", "baked beans", "kidney bean", "kidney beans",
"black bean", "black beans", "pinto", "haricot", "cannellini",
"borlotti", "butter bean", "butter beans", "broad bean", "broad beans",
"fava", "lentil", "lentils", "red lentil", "green lentil",
"puy", "chickpea", "chickpeas", "gram", "split pea", "split peas",
"pea", "peas", "yellow pea", "green pea", "garden peas",
"mung", "adzuki", "azuki", "urad",
"hummus", "houmous", "falafel",
],
"nuts_seeds": [
"peanut", "peanuts", "peanut butter",
"almond", "almonds", "cashew", "cashews", "walnut", "walnuts",
"hazelnut", "hazelnuts", "pecan", "pecans", "pistachio", "pistachios",
"macadamia", "brazil nut", "brazil nuts", "pine nut", "pine nuts",
"mixed nuts",
"seed", "seeds", "pumpkin seed", "pumpkin seeds", "sunflower seed",
"sunflower seeds", "sesame", "tahini", "linseed", "flax", "flaxseed",
"chia", "hemp", "poppy seed", "poppy seeds",
],
"pseudo_meat": [
"seitan", "mycoprotein", "quorn", "meat free", "meat-free",
"plant based", "plant-based", "vegan mince", "veg mince",
"meatless", "veggie burger", "vegetarian burger",
],
"cereal": [
"bread", "wholemeal bread", "wholewheat bread", "toastie", "toast",
"roll", "rolls", "bagel", "bagels", "bap", "baps", "bun", "buns",
"pitta", "pita", "wrap", "wraps", "naan", "flatbread", "crumpet",
"muffin", "english muffin", "teacake", "scone",
"flour", "wheat", "wholewheat", "wholemeal", "bran", "germ",
"semolina", "bulgur", "burghul", "freekeh", "couscous",
"rice", "brown rice", "white rice", "basmati", "jasmine rice",
"wild rice", "risotto rice", "arborio",
"oat", "oats", "oatmeal", "porridge", "muesli", "granola",
"barley", "pearl barley", "rye", "spelt", "farro", "emmer",
"einkorn", "millet", "sorghum", "maize", "corn", "polenta",
"quinoa", "buckwheat", "amaranth", "teff",
"cereal", "breakfast cereal", "flakes", "bran flakes", "cornflakes",
"weetabix", "shredded wheat", "rice krispies", "special k", "cheerios",
"pasta", "wholewheat pasta", "wholemeal pasta", "fresh pasta",
"dried pasta", "egg pasta", "noodle", "noodles", "ramen", "udon",
"soba", "vermicelli", "cappelletti", "tortellini", "ravioli",
"lasagne", "lasagna", "cannelloni", "gnocchi",
"spaghetti", "linguine", "fettuccine", "tagliatelle", "pappardelle",
"fusilli", "penne", "rigatoni", "macaroni", "farfalle", "conchiglie",
"shells", "orecchiette", "bucatini", "capellini", "angel hair",
"tagliolini", "casarecce", "cavatappi", "tortiglioni", "ditalini",
"orzo", "strozzapreti", "radiatori",
],
"mixed_or_ambiguous": [
"ready meal", "meal deal", "sandwich", "burger", "pizza", "pie",
"sausage roll", "pasty", "curry", "stew", "chilli", "chili",
"soup", "salad", "pasta bake", "lasagne al forno", "noodle pot",
],
}
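# Classify a food by its first keyword hit; dict order sets matching priority.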
def protein_quality_factor(name):
s = name.lower()
for cls, kws in protein_keywords.items():
for kw in kws:
if kw in s:
return protein_quality[cls]
return protein_quality["unknown"]
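# Item slugs with known-bad nutrition data, excluded below.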
mask_data_errors = {
"tesco/293283636",
"tesco/262750183"
}
with open("items.jsonl", "r") as f:
for line in f:
obj = json.loads(line.strip())
if obj["slug"] not in mask_data_errors:
if "protein" in obj["nutrition"]:
obj["nutrition"]["protein"] *= protein_quality_factor(obj["name"])
foods.append(obj)
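# Build the LP: minimise cost . x subject to, for each metric m,
# lb_m <= sum_i nutrition_i[m] * x_i <= ub_m, encoded as two <= rows
# (-n.x <= -lb and n.x <= ub); linprog's default bounds give x >= 0.
# Note: nutrition appears to be per 100 g/ml while unitPrice is per kg/l,
# so the objective is off by a constant factor; the optimal mix is unchanged.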
cost = np.zeros(len(foods))
constraints_mtx = np.zeros((len(foods), len(constraint_metrics) * 2))
for i, food in enumerate(foods):
cost[i] = food["cost"]
for j, metric in enumerate(constraint_metrics):
        # column j*2 encodes the >= lower bound (negated), j*2+1 the <= upper bound
constraints_mtx[i, j*2] = -food["nutrition"].get(metric, 0)
constraints_mtx[i, j*2+1] = food["nutrition"].get(metric, 0)
constraints_vec = np.zeros(len(constraint_metrics)*2)
for j, (lb, ub) in enumerate(constraint_metrics.values()):
constraints_vec[j*2] = -lb
constraints_vec[j*2+1] = ub
print(cost, constraints_mtx, constraints_vec)
res = scipy.optimize.linprog(cost, A_ub=constraints_mtx.T, b_ub=constraints_vec)
print(res)
for i, v in enumerate(res.x):
if v > 0:
print(foods[i], v)
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+7744
File diff suppressed because one or more lines are too long
+75
@@ -0,0 +1,75 @@
import re
import json
targets = (
"energy",
"fat",
"saturate",
"carbohydrate",
"sugar",
"protein",
"salt",
"fibre",
"starch"
)
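# Normalise number formats: strip thousands separators ("1,234" -> "1234"),
# then convert decimal commas to points ("1,5" -> "1.5").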
def fix_commas(x):
return re.sub(r"(\d+),(\d{1,2})", r"\1.\2", re.sub(r"(\d+),(\d{3})", r"\1\2", x))
def process():
with open("tesco.jsonl", "r") as f:
with open("items.jsonl", "a") as h:
for line in f:
obj = json.loads(line.strip())
cost = obj["price"]
values = {}
#print(obj["title"], obj["details"]["nutrition"])
nutrition = obj["details"]["nutrition"]
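                # Each row pairs a label with perComp, which appears to be the
                # per-100g/100ml column of the on-pack nutrition table.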
try:
for target in targets:
# TODO check perComp first line is reasonable
for row in nutrition:
label = row["name"]
value = row["perComp"]
if label and target.lower() in label.lower():
value = fix_commas(value.lower().removeprefix("(").removeprefix("nutritioninformation/").split("/")[0].split("(")[0].strip().split("kcal")[0])
if value.lower() == "trace" or value == "-" or value == "nil": value = "0"
                                is_kj = "kj" in value or "kj" in label.split("/")[0].strip().lower()
if target not in values:
if is_kj:
value = value.split("kj")[0].removeprefix("<").strip().replace(" ", "")
value = float(value) / 4.2 # kcal
elif value.endswith("%"):
value = value.removeprefix("less than").strip().removeprefix("<").removesuffix("%")
value = float(value) / 100 * 1000
else:
value = value.removeprefix("less than").replace(" ", "").removeprefix("<").removesuffix(")").removesuffix("*").removesuffix("g").removesuffix("calories").removeprefix("=")
value = float(value)
values[target] = value
break
                except Exception:
import traceback
traceback.print_exc()
if cost["unitOfMeasure"] not in {"ltr", "kg"}:
continue
cost = cost["unitPrice"]
if values:
json.dump({
"slug": "tesco/" + obj["id"],
"nutrition": values,
"cost": cost,
"name": obj["title"]
}, h)
h.write("\n")
process()
+383
@@ -0,0 +1,383 @@
import argparse
import asyncio
import json
import sys
from curl_cffi import AsyncSession
XAPI_URL = "https://xapi.tesco.com/"
# This key is shipped to browsers in Tesco's own groceries page config.
XAPI_KEY = "TvOSZJHlEk0pjniDGQFAc9Q59WGAR4dA"
HEADERS = {
"accept": "application/json",
"content-type": "application/json",
"origin": "https://www.tesco.com",
"referer": "https://www.tesco.com/groceries/en-GB/search?consumer=ghsapp-uk",
"x-apikey": XAPI_KEY,
}
DEFAULT_TERMS = [
"bread",
"butter",
"cheese",
"egg",
"flour",
"milk",
"oats",
"pasta",
"potato",
"rice",
"tomato",
"vegetable",
]
TAXONOMY_QUERY = """
query Taxonomy($includeInspirationEvents: Boolean = false, $configs: [ConfigArgType]) {
taxonomy(includeInspirationEvents: $includeInspirationEvents, configs: $configs) {
catId: id
name
label
parent
children {
catId: id
name
label
parent
children {
catId: id
name
label
parent
children {
catId: id
name
label
parent
}
}
}
}
}
"""
SEARCH_QUERY = """
query Search($query: String!, $page: Int, $count: Int, $sortBy: String) {
search(query: $query, page: $page, count: $count, sortBy: $sortBy) {
results {
node {
... on ProductInterface {
id
tpnb
tpnc
gtin
title
brandName
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
}
}
}
}
}
"""
CATEGORY_QUERY = """
query Category($facet: ID, $page: Int, $count: Int, $sortBy: String) {
category(facet: $facet, page: $page, count: $count, sortBy: $sortBy) {
info {
total
page
count
pageSize
offset
}
results {
node {
... on ProductInterface {
id
tpnb
tpnc
gtin
title
brandName
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
}
}
}
}
}
"""
PRODUCT_QUERY = """
query GetProduct($tpnc: String) {
product(tpnc: $tpnc) {
id
tpnb
tpnc
gtin
title
brandName
description
foodIcons
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
details {
ingredients
netContents
packSize {
value
units
}
nutrition {
name
perComp: value1
perServing: value2
referenceIntake: value3
referencePercentage: value4
}
guidelineDailyAmount {
title
dailyAmounts {
name
value
percent
rating
}
}
productMarketing
preparationAndUsage
storage
features
healthClaims
}
}
}
"""
async def graphql(session, query, variables):
while True:
response = await session.post(
XAPI_URL,
json={"query": query, "variables": variables},
)
        if response.status_code == 504:
            continue  # transient gateway timeout from the API; retry
        response.raise_for_status()
payload = response.json()
if payload.get("errors"):
print(json.dumps(payload["errors"], indent=2))
else:
return payload["data"]
async def search_page(session, term, page, count):
data = await graphql(
session,
SEARCH_QUERY,
{"query": term, "page": page, "count": count, "sortBy": "price-ascending"},
)
nodes = []
for result in data["search"]["results"]:
node = result.get("node")
if node and node.get("tpnc"):
nodes.append(node)
return nodes
async def category_page(session, cat_id, page, count):
data = await graphql(
session,
CATEGORY_QUERY,
{"facet": cat_id, "page": page, "count": count, "sortBy": "price-ascending"},
)
category = data["category"]
nodes = []
for result in category["results"]:
node = result.get("node")
if node and node.get("tpnc"):
nodes.append(node)
return category["info"], nodes
async def get_taxonomy(session):
data = await graphql(
session,
TAXONOMY_QUERY,
{"includeInspirationEvents": False, "configs": []},
)
return data["taxonomy"]
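# Depth-first flatten of the taxonomy tree into {catId, name, label, path}
# rows, with path rendered as "Department > Aisle > Shelf".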
def flatten_taxonomy(nodes, path=()):
for node in nodes:
current_path = (*path, node["name"])
yield {
"catId": node["catId"],
"name": node["name"],
"label": node["label"],
"path": " > ".join(current_path),
}
yield from flatten_taxonomy(node.get("children") or [], current_path)
def category_filter(categories, args):
selected = categories
if args.category_label:
selected = [c for c in selected if c["label"] in args.category_label]
if args.category_contains:
needles = [needle.lower() for needle in args.category_contains]
selected = [
c for c in selected
if any(needle in c["path"].lower() for needle in needles)
]
return selected
async def get_product(session, tpnc):
data = await graphql(session, PRODUCT_QUERY, {"tpnc": tpnc})
return data["product"]
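# Run fn over items with at most `limit` coroutines in flight at once.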
async def bounded_gather(limit, items, fn):
semaphore = asyncio.Semaphore(limit)
async def run(item):
async with semaphore:
return await fn(item)
return await asyncio.gather(*(run(item) for item in items))
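# Main driver: stream products out as JSONL, deduplicating on tpnc so a
# product reached via several categories or terms is written only once.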
async def scrape(args):
seen = set()
async with AsyncSession(headers=HEADERS, impersonate="chrome120", timeout=30) as session:
with open(args.output, "w") as output:
if args.categories:
taxonomy = await get_taxonomy(session)
categories = category_filter(list(flatten_taxonomy(taxonomy)), args)
print(f"{len(categories)} categories selected", file=sys.stderr)
for category in categories:
print(f"category {category['path']!r}", file=sys.stderr)
for page in range(1, args.pages + 1):
info, rows = await category_page(
session, category["catId"], page, args.count
)
if not rows:
break
rows = [row for row in rows if row["tpnc"] not in seen]
for row in rows:
seen.add(row["tpnc"])
products = await bounded_gather(
args.concurrency,
rows,
lambda row: get_product(session, row["tpnc"]),
)
for product in products:
product["matchedCategory"] = category
json.dump(product, output, ensure_ascii=False)
output.write("\n")
output.flush()
print(
f" page {page}: {len(products)} new products "
f"({info['total']} listed)",
file=sys.stderr,
)
if info["offset"] + info["count"] >= info["total"]:
break
return
terms = args.term or DEFAULT_TERMS
for term in terms:
print(f"search {term!r}", file=sys.stderr)
for page in range(1, args.pages + 1):
                    page_rows = await search_page(session, term, page, args.count)
                    if not page_rows:
                        break
                    rows = [row for row in page_rows if row["tpnc"] not in seen]
for row in rows:
seen.add(row["tpnc"])
products = await bounded_gather(
args.concurrency,
rows,
lambda row: get_product(session, row["tpnc"]),
)
for product in products:
product["matchedSearchTerm"] = term
json.dump(product, output, ensure_ascii=False)
output.write("\n")
output.flush()
print(
f" page {page}: {len(products)} new products",
file=sys.stderr,
)
                    # compare the unfiltered page length, so deduplication
                    # against `seen` can't end pagination early
                    if len(page_rows) < args.count:
                        break
def parse_args():
parser = argparse.ArgumentParser(
description="Scrape Tesco-owned price and nutrition data through Tesco XAPI."
)
parser.add_argument("term", nargs="*", help="Search terms to scrape")
parser.add_argument("-o", "--output", default="tesco.jsonl")
parser.add_argument(
"--categories",
action="store_true",
help="Scrape Tesco taxonomy categories instead of keyword search.",
)
parser.add_argument(
"--category-label",
action="append",
choices=["superDepartment", "department", "aisle", "shelf"],
help="Restrict --categories to a taxonomy level. Repeatable.",
)
parser.add_argument(
"--category-contains",
action="append",
help="Restrict --categories to paths containing this text. Repeatable.",
)
parser.add_argument("--pages", type=int, default=2)
parser.add_argument("--count", type=int, default=48)
parser.add_argument("--concurrency", type=int, default=8)
return parser.parse_args()
if __name__ == "__main__":
asyncio.run(scrape(parse_args()))