mirror of https://github.com/osmarks/random-stuff synced 2026-05-14 17:42:10 +00:00

stigler problem

osmarks
2026-04-13 15:44:44 +01:00
parent cea6c91903
commit e7a198e8b8
7 changed files with 15674 additions and 0 deletions
+5907
File diff suppressed because it is too large
+177
@@ -0,0 +1,177 @@
import scipy.optimize
import json
import numpy as np
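# Stigler-diet-style linear program: pick nonnegative amounts of each food to
# minimise total cost subject to lower/upper bounds on daily nutrient intake.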
foods = []
constraint_metrics = {
"energy": (2000, 3000),
"fat": (50, 90),
"saturate": (0, 25),
"carbohydrate": (0, 5000),
"sugar": (0, 5000),
"protein": (150, 500),
"salt": (0, 6),
"fibre": (30, 60),
"starch": (0, 5000)
}
# thanks, Codex
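# Rough protein-quality multipliers by food class (PDCAAS-like heuristics,
# not taken from a published table); labeled protein is discounted by these.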
protein_quality = {
"protein_supplement": 1.00,
"egg": 1.00,
"dairy": 0.98,
"meat_fish": 1.00,
"soy": 0.95,
"legume": 0.80,
"nuts_seeds": 0.70,
"pseudo_meat": 0.75,
"cereal": 0.55,
"mixed_or_ambiguous": 0.70,
"unknown": 0.65,
}
protein_keywords = {
"protein_supplement": [
"protein powder", "protein shake", "protein bar", "clear whey",
"mass gainer", "meal replacement", "rtd protein",
],
"egg": [
"egg", "eggs", "omelette", "omelet", "frittata", "quiche",
"egg mayo", "egg salad",
],
"dairy": [
"milk", "whole milk", "semi skimmed", "skimmed", "yoghurt", "yogurt",
"greek yoghurt", "greek yogurt", "skyr", "kefir", "buttermilk",
"cheese", "cheddar", "mozzarella", "parmesan", "grana padano",
"pecorino", "edam", "gouda", "brie", "camembert", "stilton",
"halloumi", "paneer", "cottage cheese", "cream cheese", "quark",
"ricotta", "mascarpone", "whey", "casein", "custard",
],
"meat_fish": [
"chicken", "turkey", "beef", "steak", "mince", "pork", "ham",
"bacon", "gammon", "sausage", "salami", "pepperoni", "chorizo",
"prosciutto", "lamb", "mutton", "duck", "venison", "veal",
"liver", "kidney", "black pudding", "pate", "paté",
"fish", "salmon", "tuna", "cod", "haddock", "mackerel", "sardine",
"sardines", "anchovy", "anchovies", "trout", "pollock", "hake",
"seabass", "sea bass", "bream", "prawn", "prawns", "shrimp",
"crab", "lobster", "mussel", "mussels", "clam", "clams",
"oyster", "oysters", "squid", "calamari", "octopus",
],
"soy": [
"soy", "soya", "soybean", "soybeans", "tofu", "tempeh", "edamame",
"miso", "natto", "tvp", "textured vegetable protein",
"soy mince", "soya mince", "soy chunks", "soya chunks",
"soy protein", "soya protein",
],
"legume": [
"bean", "beans", "baked beans", "kidney bean", "kidney beans",
"black bean", "black beans", "pinto", "haricot", "cannellini",
"borlotti", "butter bean", "butter beans", "broad bean", "broad beans",
"fava", "lentil", "lentils", "red lentil", "green lentil",
"puy", "chickpea", "chickpeas", "gram", "split pea", "split peas",
"pea", "peas", "yellow pea", "green pea", "garden peas",
"mung", "adzuki", "azuki", "urad",
"hummus", "houmous", "falafel",
],
"nuts_seeds": [
"peanut", "peanuts", "peanut butter",
"almond", "almonds", "cashew", "cashews", "walnut", "walnuts",
"hazelnut", "hazelnuts", "pecan", "pecans", "pistachio", "pistachios",
"macadamia", "brazil nut", "brazil nuts", "pine nut", "pine nuts",
"mixed nuts",
"seed", "seeds", "pumpkin seed", "pumpkin seeds", "sunflower seed",
"sunflower seeds", "sesame", "tahini", "linseed", "flax", "flaxseed",
"chia", "hemp", "poppy seed", "poppy seeds",
],
"pseudo_meat": [
"seitan", "mycoprotein", "quorn", "meat free", "meat-free",
"plant based", "plant-based", "vegan mince", "veg mince",
"meatless", "veggie burger", "vegetarian burger",
],
"cereal": [
"bread", "wholemeal bread", "wholewheat bread", "toastie", "toast",
"roll", "rolls", "bagel", "bagels", "bap", "baps", "bun", "buns",
"pitta", "pita", "wrap", "wraps", "naan", "flatbread", "crumpet",
"muffin", "english muffin", "teacake", "scone",
"flour", "wheat", "wholewheat", "wholemeal", "bran", "germ",
"semolina", "bulgur", "burghul", "freekeh", "couscous",
"rice", "brown rice", "white rice", "basmati", "jasmine rice",
"wild rice", "risotto rice", "arborio",
"oat", "oats", "oatmeal", "porridge", "muesli", "granola",
"barley", "pearl barley", "rye", "spelt", "farro", "emmer",
"einkorn", "millet", "sorghum", "maize", "corn", "polenta",
"quinoa", "buckwheat", "amaranth", "teff",
"cereal", "breakfast cereal", "flakes", "bran flakes", "cornflakes",
"weetabix", "shredded wheat", "rice krispies", "special k", "cheerios",
"pasta", "wholewheat pasta", "wholemeal pasta", "fresh pasta",
"dried pasta", "egg pasta", "noodle", "noodles", "ramen", "udon",
"soba", "vermicelli", "cappelletti", "tortellini", "ravioli",
"lasagne", "lasagna", "cannelloni", "gnocchi",
"spaghetti", "linguine", "fettuccine", "tagliatelle", "pappardelle",
"fusilli", "penne", "rigatoni", "macaroni", "farfalle", "conchiglie",
"shells", "orecchiette", "bucatini", "capellini", "angel hair",
"tagliolini", "casarecce", "cavatappi", "tortiglioni", "ditalini",
"orzo", "strozzapreti", "radiatori",
],
"mixed_or_ambiguous": [
"ready meal", "meal deal", "sandwich", "burger", "pizza", "pie",
"sausage roll", "pasty", "curry", "stew", "chilli", "chili",
"soup", "salad", "pasta bake", "lasagne al forno", "noodle pot",
],
}
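# Classify a food by its first keyword hit; dict order sets matching priority.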
def protein_quality_factor(name):
s = name.lower()
for cls, kws in protein_keywords.items():
for kw in kws:
if kw in s:
return protein_quality[cls]
return protein_quality["unknown"]
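# Item slugs with known-bad nutrition data, excluded below.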
mask_data_errors = {
"tesco/293283636",
"tesco/262750183"
}
with open("items.jsonl", "r") as f:
for line in f:
obj = json.loads(line.strip())
if obj["slug"] not in mask_data_errors:
if "protein" in obj["nutrition"]:
obj["nutrition"]["protein"] *= protein_quality_factor(obj["name"])
foods.append(obj)
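# Build the LP: minimise cost . x subject to, for each metric m,
# lb_m <= sum_i nutrition_i[m] * x_i <= ub_m, encoded as two <= rows
# (-n.x <= -lb and n.x <= ub); linprog's default bounds give x >= 0.
# Note: nutrition appears to be per 100 g/ml while unitPrice is per kg/l,
# so the objective is off by a constant factor; the optimal mix is unchanged.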
cost = np.zeros(len(foods))
constraints_mtx = np.zeros((len(foods), len(constraint_metrics) * 2))
for i, food in enumerate(foods):
cost[i] = food["cost"]
for j, metric in enumerate(constraint_metrics):
        # column j*2 encodes the >= lower bound (negated), j*2+1 the <= upper bound
constraints_mtx[i, j*2] = -food["nutrition"].get(metric, 0)
constraints_mtx[i, j*2+1] = food["nutrition"].get(metric, 0)
constraints_vec = np.zeros(len(constraint_metrics)*2)
for j, (lb, ub) in enumerate(constraint_metrics.values()):
constraints_vec[j*2] = -lb
constraints_vec[j*2+1] = ub
print(cost, constraints_mtx, constraints_vec)
res = scipy.optimize.linprog(cost, A_ub=constraints_mtx.T, b_ub=constraints_vec)
print(res)
for i, v in enumerate(res.x):
if v > 0:
print(foods[i], v)
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+7744
File diff suppressed because one or more lines are too long
+75
@@ -0,0 +1,75 @@
import re
import json
targets = (
"energy",
"fat",
"saturate",
"carbohydrate",
"sugar",
"protein",
"salt",
"fibre",
"starch"
)
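# Normalise number formats: strip thousands separators ("1,234" -> "1234"),
# then convert decimal commas to points ("1,5" -> "1.5").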
def fix_commas(x):
return re.sub(r"(\d+),(\d{1,2})", r"\1.\2", re.sub(r"(\d+),(\d{3})", r"\1\2", x))
def process():
with open("tesco.jsonl", "r") as f:
with open("items.jsonl", "a") as h:
for line in f:
obj = json.loads(line.strip())
cost = obj["price"]
values = {}
#print(obj["title"], obj["details"]["nutrition"])
nutrition = obj["details"]["nutrition"]
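                # Each row pairs a label with perComp, which appears to be the
                # per-100g/100ml column of the on-pack nutrition table.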
try:
for target in targets:
# TODO check perComp first line is reasonable
for row in nutrition:
label = row["name"]
value = row["perComp"]
if label and target.lower() in label.lower():
value = fix_commas(value.lower().removeprefix("(").removeprefix("nutritioninformation/").split("/")[0].split("(")[0].strip().split("kcal")[0])
if value.lower() == "trace" or value == "-" or value == "nil": value = "0"
                                is_kj = "kj" in value or "kj" in label.split("/")[0].strip().lower()
if target not in values:
if is_kj:
value = value.split("kj")[0].removeprefix("<").strip().replace(" ", "")
value = float(value) / 4.2 # kcal
elif value.endswith("%"):
value = value.removeprefix("less than").strip().removeprefix("<").removesuffix("%")
value = float(value) / 100 * 1000
else:
value = value.removeprefix("less than").replace(" ", "").removeprefix("<").removesuffix(")").removesuffix("*").removesuffix("g").removesuffix("calories").removeprefix("=")
value = float(value)
values[target] = value
break
                except Exception:
import traceback
traceback.print_exc()
if cost["unitOfMeasure"] not in {"ltr", "kg"}:
continue
cost = cost["unitPrice"]
if values:
json.dump({
"slug": "tesco/" + obj["id"],
"nutrition": values,
"cost": cost,
"name": obj["title"]
}, h)
h.write("\n")
process()
+383
@@ -0,0 +1,383 @@
import argparse
import asyncio
import json
import sys
from curl_cffi import AsyncSession
XAPI_URL = "https://xapi.tesco.com/"
# This key is shipped to browsers in Tesco's own groceries page config.
XAPI_KEY = "TvOSZJHlEk0pjniDGQFAc9Q59WGAR4dA"
HEADERS = {
"accept": "application/json",
"content-type": "application/json",
"origin": "https://www.tesco.com",
"referer": "https://www.tesco.com/groceries/en-GB/search?consumer=ghsapp-uk",
"x-apikey": XAPI_KEY,
}
DEFAULT_TERMS = [
"bread",
"butter",
"cheese",
"egg",
"flour",
"milk",
"oats",
"pasta",
"potato",
"rice",
"tomato",
"vegetable",
]
TAXONOMY_QUERY = """
query Taxonomy($includeInspirationEvents: Boolean = false, $configs: [ConfigArgType]) {
taxonomy(includeInspirationEvents: $includeInspirationEvents, configs: $configs) {
catId: id
name
label
parent
children {
catId: id
name
label
parent
children {
catId: id
name
label
parent
children {
catId: id
name
label
parent
}
}
}
}
}
"""
SEARCH_QUERY = """
query Search($query: String!, $page: Int, $count: Int, $sortBy: String) {
search(query: $query, page: $page, count: $count, sortBy: $sortBy) {
results {
node {
... on ProductInterface {
id
tpnb
tpnc
gtin
title
brandName
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
}
}
}
}
}
"""
CATEGORY_QUERY = """
query Category($facet: ID, $page: Int, $count: Int, $sortBy: String) {
category(facet: $facet, page: $page, count: $count, sortBy: $sortBy) {
info {
total
page
count
pageSize
offset
}
results {
node {
... on ProductInterface {
id
tpnb
tpnc
gtin
title
brandName
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
}
}
}
}
}
"""
PRODUCT_QUERY = """
query GetProduct($tpnc: String) {
product(tpnc: $tpnc) {
id
tpnb
tpnc
gtin
title
brandName
description
foodIcons
superDepartmentName
departmentName
aisleName
shelfName
price {
actual
unitPrice
unitOfMeasure
}
details {
ingredients
netContents
packSize {
value
units
}
nutrition {
name
perComp: value1
perServing: value2
referenceIntake: value3
referencePercentage: value4
}
guidelineDailyAmount {
title
dailyAmounts {
name
value
percent
rating
}
}
productMarketing
preparationAndUsage
storage
features
healthClaims
}
}
}
"""
async def graphql(session, query, variables):
while True:
response = await session.post(
XAPI_URL,
json={"query": query, "variables": variables},
)
        if response.status_code == 504:
            continue  # transient gateway timeout from the API; retry
        response.raise_for_status()
payload = response.json()
if payload.get("errors"):
print(json.dumps(payload["errors"], indent=2))
else:
return payload["data"]
async def search_page(session, term, page, count):
data = await graphql(
session,
SEARCH_QUERY,
{"query": term, "page": page, "count": count, "sortBy": "price-ascending"},
)
nodes = []
for result in data["search"]["results"]:
node = result.get("node")
if node and node.get("tpnc"):
nodes.append(node)
return nodes
async def category_page(session, cat_id, page, count):
data = await graphql(
session,
CATEGORY_QUERY,
{"facet": cat_id, "page": page, "count": count, "sortBy": "price-ascending"},
)
category = data["category"]
nodes = []
for result in category["results"]:
node = result.get("node")
if node and node.get("tpnc"):
nodes.append(node)
return category["info"], nodes
async def get_taxonomy(session):
data = await graphql(
session,
TAXONOMY_QUERY,
{"includeInspirationEvents": False, "configs": []},
)
return data["taxonomy"]
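# Depth-first flatten of the taxonomy tree into {catId, name, label, path}
# rows, with path rendered as "Department > Aisle > Shelf".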
def flatten_taxonomy(nodes, path=()):
for node in nodes:
current_path = (*path, node["name"])
yield {
"catId": node["catId"],
"name": node["name"],
"label": node["label"],
"path": " > ".join(current_path),
}
yield from flatten_taxonomy(node.get("children") or [], current_path)
def category_filter(categories, args):
selected = categories
if args.category_label:
selected = [c for c in selected if c["label"] in args.category_label]
if args.category_contains:
needles = [needle.lower() for needle in args.category_contains]
selected = [
c for c in selected
if any(needle in c["path"].lower() for needle in needles)
]
return selected
async def get_product(session, tpnc):
data = await graphql(session, PRODUCT_QUERY, {"tpnc": tpnc})
return data["product"]
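# Run fn over items with at most `limit` coroutines in flight at once.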
async def bounded_gather(limit, items, fn):
semaphore = asyncio.Semaphore(limit)
async def run(item):
async with semaphore:
return await fn(item)
return await asyncio.gather(*(run(item) for item in items))
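# Main driver: stream products out as JSONL, deduplicating on tpnc so a
# product reached via several categories or terms is written only once.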
async def scrape(args):
seen = set()
async with AsyncSession(headers=HEADERS, impersonate="chrome120", timeout=30) as session:
with open(args.output, "w") as output:
if args.categories:
taxonomy = await get_taxonomy(session)
categories = category_filter(list(flatten_taxonomy(taxonomy)), args)
print(f"{len(categories)} categories selected", file=sys.stderr)
for category in categories:
print(f"category {category['path']!r}", file=sys.stderr)
for page in range(1, args.pages + 1):
info, rows = await category_page(
session, category["catId"], page, args.count
)
if not rows:
break
rows = [row for row in rows if row["tpnc"] not in seen]
for row in rows:
seen.add(row["tpnc"])
products = await bounded_gather(
args.concurrency,
rows,
lambda row: get_product(session, row["tpnc"]),
)
for product in products:
product["matchedCategory"] = category
json.dump(product, output, ensure_ascii=False)
output.write("\n")
output.flush()
print(
f" page {page}: {len(products)} new products "
f"({info['total']} listed)",
file=sys.stderr,
)
if info["offset"] + info["count"] >= info["total"]:
break
return
terms = args.term or DEFAULT_TERMS
for term in terms:
print(f"search {term!r}", file=sys.stderr)
for page in range(1, args.pages + 1):
                    page_rows = await search_page(session, term, page, args.count)
                    if not page_rows:
                        break
                    rows = [row for row in page_rows if row["tpnc"] not in seen]
for row in rows:
seen.add(row["tpnc"])
products = await bounded_gather(
args.concurrency,
rows,
lambda row: get_product(session, row["tpnc"]),
)
for product in products:
product["matchedSearchTerm"] = term
json.dump(product, output, ensure_ascii=False)
output.write("\n")
output.flush()
print(
f" page {page}: {len(products)} new products",
file=sys.stderr,
)
                    # compare the unfiltered page length, so deduplication
                    # against `seen` can't end pagination early
                    if len(page_rows) < args.count:
                        break
def parse_args():
parser = argparse.ArgumentParser(
description="Scrape Tesco-owned price and nutrition data through Tesco XAPI."
)
parser.add_argument("term", nargs="*", help="Search terms to scrape")
parser.add_argument("-o", "--output", default="tesco.jsonl")
parser.add_argument(
"--categories",
action="store_true",
help="Scrape Tesco taxonomy categories instead of keyword search.",
)
parser.add_argument(
"--category-label",
action="append",
choices=["superDepartment", "department", "aisle", "shelf"],
help="Restrict --categories to a taxonomy level. Repeatable.",
)
parser.add_argument(
"--category-contains",
action="append",
help="Restrict --categories to paths containing this text. Repeatable.",
)
parser.add_argument("--pages", type=int, default=2)
parser.add_argument("--count", type=int, default=48)
parser.add_argument("--concurrency", type=int, default=8)
return parser.parse_args()
if __name__ == "__main__":
asyncio.run(scrape(parse_args()))