mirror of
https://github.com/osmarks/random-stuff
synced 2026-06-07 05:12:11 +00:00
stigler problem
This commit is contained in:
@@ -0,0 +1,383 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
|
||||
from curl_cffi import AsyncSession
|
||||
|
||||
|
||||
XAPI_URL = "https://xapi.tesco.com/"
|
||||
|
||||
# This key is shipped to browsers in Tesco's own groceries page config.
|
||||
XAPI_KEY = "TvOSZJHlEk0pjniDGQFAc9Q59WGAR4dA"
|
||||
|
||||
HEADERS = {
|
||||
"accept": "application/json",
|
||||
"content-type": "application/json",
|
||||
"origin": "https://www.tesco.com",
|
||||
"referer": "https://www.tesco.com/groceries/en-GB/search?consumer=ghsapp-uk",
|
||||
"x-apikey": XAPI_KEY,
|
||||
}
|
||||
|
||||
DEFAULT_TERMS = [
|
||||
"bread",
|
||||
"butter",
|
||||
"cheese",
|
||||
"egg",
|
||||
"flour",
|
||||
"milk",
|
||||
"oats",
|
||||
"pasta",
|
||||
"potato",
|
||||
"rice",
|
||||
"tomato",
|
||||
"vegetable",
|
||||
]
|
||||
|
||||
TAXONOMY_QUERY = """
|
||||
query Taxonomy($includeInspirationEvents: Boolean = false, $configs: [ConfigArgType]) {
|
||||
taxonomy(includeInspirationEvents: $includeInspirationEvents, configs: $configs) {
|
||||
catId: id
|
||||
name
|
||||
label
|
||||
parent
|
||||
children {
|
||||
catId: id
|
||||
name
|
||||
label
|
||||
parent
|
||||
children {
|
||||
catId: id
|
||||
name
|
||||
label
|
||||
parent
|
||||
children {
|
||||
catId: id
|
||||
name
|
||||
label
|
||||
parent
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
SEARCH_QUERY = """
|
||||
query Search($query: String!, $page: Int, $count: Int, $sortBy: String) {
|
||||
search(query: $query, page: $page, count: $count, sortBy: $sortBy) {
|
||||
results {
|
||||
node {
|
||||
... on ProductInterface {
|
||||
id
|
||||
tpnb
|
||||
tpnc
|
||||
gtin
|
||||
title
|
||||
brandName
|
||||
superDepartmentName
|
||||
departmentName
|
||||
aisleName
|
||||
shelfName
|
||||
price {
|
||||
actual
|
||||
unitPrice
|
||||
unitOfMeasure
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
CATEGORY_QUERY = """
|
||||
query Category($facet: ID, $page: Int, $count: Int, $sortBy: String) {
|
||||
category(facet: $facet, page: $page, count: $count, sortBy: $sortBy) {
|
||||
info {
|
||||
total
|
||||
page
|
||||
count
|
||||
pageSize
|
||||
offset
|
||||
}
|
||||
results {
|
||||
node {
|
||||
... on ProductInterface {
|
||||
id
|
||||
tpnb
|
||||
tpnc
|
||||
gtin
|
||||
title
|
||||
brandName
|
||||
superDepartmentName
|
||||
departmentName
|
||||
aisleName
|
||||
shelfName
|
||||
price {
|
||||
actual
|
||||
unitPrice
|
||||
unitOfMeasure
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
PRODUCT_QUERY = """
|
||||
query GetProduct($tpnc: String) {
|
||||
product(tpnc: $tpnc) {
|
||||
id
|
||||
tpnb
|
||||
tpnc
|
||||
gtin
|
||||
title
|
||||
brandName
|
||||
description
|
||||
foodIcons
|
||||
superDepartmentName
|
||||
departmentName
|
||||
aisleName
|
||||
shelfName
|
||||
price {
|
||||
actual
|
||||
unitPrice
|
||||
unitOfMeasure
|
||||
}
|
||||
details {
|
||||
ingredients
|
||||
netContents
|
||||
packSize {
|
||||
value
|
||||
units
|
||||
}
|
||||
nutrition {
|
||||
name
|
||||
perComp: value1
|
||||
perServing: value2
|
||||
referenceIntake: value3
|
||||
referencePercentage: value4
|
||||
}
|
||||
guidelineDailyAmount {
|
||||
title
|
||||
dailyAmounts {
|
||||
name
|
||||
value
|
||||
percent
|
||||
rating
|
||||
}
|
||||
}
|
||||
productMarketing
|
||||
preparationAndUsage
|
||||
storage
|
||||
features
|
||||
healthClaims
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
async def graphql(session, query, variables):
|
||||
while True:
|
||||
response = await session.post(
|
||||
XAPI_URL,
|
||||
json={"query": query, "variables": variables},
|
||||
)
|
||||
if response.status_code != 504:
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
if payload.get("errors"):
|
||||
print(json.dumps(payload["errors"], indent=2))
|
||||
else:
|
||||
return payload["data"]
|
||||
|
||||
|
||||
async def search_page(session, term, page, count):
|
||||
data = await graphql(
|
||||
session,
|
||||
SEARCH_QUERY,
|
||||
{"query": term, "page": page, "count": count, "sortBy": "price-ascending"},
|
||||
)
|
||||
nodes = []
|
||||
for result in data["search"]["results"]:
|
||||
node = result.get("node")
|
||||
if node and node.get("tpnc"):
|
||||
nodes.append(node)
|
||||
return nodes
|
||||
|
||||
|
||||
async def category_page(session, cat_id, page, count):
|
||||
data = await graphql(
|
||||
session,
|
||||
CATEGORY_QUERY,
|
||||
{"facet": cat_id, "page": page, "count": count, "sortBy": "price-ascending"},
|
||||
)
|
||||
category = data["category"]
|
||||
nodes = []
|
||||
for result in category["results"]:
|
||||
node = result.get("node")
|
||||
if node and node.get("tpnc"):
|
||||
nodes.append(node)
|
||||
return category["info"], nodes
|
||||
|
||||
|
||||
async def get_taxonomy(session):
|
||||
data = await graphql(
|
||||
session,
|
||||
TAXONOMY_QUERY,
|
||||
{"includeInspirationEvents": False, "configs": []},
|
||||
)
|
||||
return data["taxonomy"]
|
||||
|
||||
|
||||
def flatten_taxonomy(nodes, path=()):
|
||||
for node in nodes:
|
||||
current_path = (*path, node["name"])
|
||||
yield {
|
||||
"catId": node["catId"],
|
||||
"name": node["name"],
|
||||
"label": node["label"],
|
||||
"path": " > ".join(current_path),
|
||||
}
|
||||
yield from flatten_taxonomy(node.get("children") or [], current_path)
|
||||
|
||||
|
||||
def category_filter(categories, args):
|
||||
selected = categories
|
||||
if args.category_label:
|
||||
selected = [c for c in selected if c["label"] in args.category_label]
|
||||
if args.category_contains:
|
||||
needles = [needle.lower() for needle in args.category_contains]
|
||||
selected = [
|
||||
c for c in selected
|
||||
if any(needle in c["path"].lower() for needle in needles)
|
||||
]
|
||||
return selected
|
||||
|
||||
|
||||
async def get_product(session, tpnc):
|
||||
data = await graphql(session, PRODUCT_QUERY, {"tpnc": tpnc})
|
||||
return data["product"]
|
||||
|
||||
|
||||
async def bounded_gather(limit, items, fn):
|
||||
semaphore = asyncio.Semaphore(limit)
|
||||
|
||||
async def run(item):
|
||||
async with semaphore:
|
||||
return await fn(item)
|
||||
|
||||
return await asyncio.gather(*(run(item) for item in items))
|
||||
|
||||
|
||||
async def scrape(args):
|
||||
seen = set()
|
||||
|
||||
async with AsyncSession(headers=HEADERS, impersonate="chrome120", timeout=30) as session:
|
||||
with open(args.output, "w") as output:
|
||||
if args.categories:
|
||||
taxonomy = await get_taxonomy(session)
|
||||
categories = category_filter(list(flatten_taxonomy(taxonomy)), args)
|
||||
print(f"{len(categories)} categories selected", file=sys.stderr)
|
||||
|
||||
for category in categories:
|
||||
print(f"category {category['path']!r}", file=sys.stderr)
|
||||
for page in range(1, args.pages + 1):
|
||||
info, rows = await category_page(
|
||||
session, category["catId"], page, args.count
|
||||
)
|
||||
if not rows:
|
||||
break
|
||||
|
||||
rows = [row for row in rows if row["tpnc"] not in seen]
|
||||
for row in rows:
|
||||
seen.add(row["tpnc"])
|
||||
|
||||
products = await bounded_gather(
|
||||
args.concurrency,
|
||||
rows,
|
||||
lambda row: get_product(session, row["tpnc"]),
|
||||
)
|
||||
|
||||
for product in products:
|
||||
product["matchedCategory"] = category
|
||||
json.dump(product, output, ensure_ascii=False)
|
||||
output.write("\n")
|
||||
|
||||
output.flush()
|
||||
print(
|
||||
f" page {page}: {len(products)} new products "
|
||||
f"({info['total']} listed)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
if info["offset"] + info["count"] >= info["total"]:
|
||||
break
|
||||
return
|
||||
|
||||
terms = args.term or DEFAULT_TERMS
|
||||
for term in terms:
|
||||
print(f"search {term!r}", file=sys.stderr)
|
||||
for page in range(1, args.pages + 1):
|
||||
rows = await search_page(session, term, page, args.count)
|
||||
if not rows:
|
||||
break
|
||||
|
||||
rows = [row for row in rows if row["tpnc"] not in seen]
|
||||
for row in rows:
|
||||
seen.add(row["tpnc"])
|
||||
|
||||
products = await bounded_gather(
|
||||
args.concurrency,
|
||||
rows,
|
||||
lambda row: get_product(session, row["tpnc"]),
|
||||
)
|
||||
|
||||
for product in products:
|
||||
product["matchedSearchTerm"] = term
|
||||
json.dump(product, output, ensure_ascii=False)
|
||||
output.write("\n")
|
||||
|
||||
output.flush()
|
||||
print(
|
||||
f" page {page}: {len(products)} new products",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
if len(rows) < args.count:
|
||||
break
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Scrape Tesco-owned price and nutrition data through Tesco XAPI."
|
||||
)
|
||||
parser.add_argument("term", nargs="*", help="Search terms to scrape")
|
||||
parser.add_argument("-o", "--output", default="tesco.jsonl")
|
||||
parser.add_argument(
|
||||
"--categories",
|
||||
action="store_true",
|
||||
help="Scrape Tesco taxonomy categories instead of keyword search.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--category-label",
|
||||
action="append",
|
||||
choices=["superDepartment", "department", "aisle", "shelf"],
|
||||
help="Restrict --categories to a taxonomy level. Repeatable.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--category-contains",
|
||||
action="append",
|
||||
help="Restrict --categories to paths containing this text. Repeatable.",
|
||||
)
|
||||
parser.add_argument("--pages", type=int, default=2)
|
||||
parser.add_argument("--count", type=int, default=48)
|
||||
parser.add_argument("--concurrency", type=int, default=8)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(scrape(parse_args()))
|
||||
Reference in New Issue
Block a user