import argparse import asyncio import json import sys from curl_cffi import AsyncSession XAPI_URL = "https://xapi.tesco.com/" # This key is shipped to browsers in Tesco's own groceries page config. XAPI_KEY = "TvOSZJHlEk0pjniDGQFAc9Q59WGAR4dA" HEADERS = { "accept": "application/json", "content-type": "application/json", "origin": "https://www.tesco.com", "referer": "https://www.tesco.com/groceries/en-GB/search?consumer=ghsapp-uk", "x-apikey": XAPI_KEY, } DEFAULT_TERMS = [ "bread", "butter", "cheese", "egg", "flour", "milk", "oats", "pasta", "potato", "rice", "tomato", "vegetable", ] TAXONOMY_QUERY = """ query Taxonomy($includeInspirationEvents: Boolean = false, $configs: [ConfigArgType]) { taxonomy(includeInspirationEvents: $includeInspirationEvents, configs: $configs) { catId: id name label parent children { catId: id name label parent children { catId: id name label parent children { catId: id name label parent } } } } } """ SEARCH_QUERY = """ query Search($query: String!, $page: Int, $count: Int, $sortBy: String) { search(query: $query, page: $page, count: $count, sortBy: $sortBy) { results { node { ... on ProductInterface { id tpnb tpnc gtin title brandName superDepartmentName departmentName aisleName shelfName price { actual unitPrice unitOfMeasure } } } } } } """ CATEGORY_QUERY = """ query Category($facet: ID, $page: Int, $count: Int, $sortBy: String) { category(facet: $facet, page: $page, count: $count, sortBy: $sortBy) { info { total page count pageSize offset } results { node { ... on ProductInterface { id tpnb tpnc gtin title brandName superDepartmentName departmentName aisleName shelfName price { actual unitPrice unitOfMeasure } } } } } } """ PRODUCT_QUERY = """ query GetProduct($tpnc: String) { product(tpnc: $tpnc) { id tpnb tpnc gtin title brandName description foodIcons superDepartmentName departmentName aisleName shelfName price { actual unitPrice unitOfMeasure } details { ingredients netContents packSize { value units } nutrition { name perComp: value1 perServing: value2 referenceIntake: value3 referencePercentage: value4 } guidelineDailyAmount { title dailyAmounts { name value percent rating } } productMarketing preparationAndUsage storage features healthClaims } } } """ async def graphql(session, query, variables): while True: response = await session.post( XAPI_URL, json={"query": query, "variables": variables}, ) if response.status_code != 504: response.raise_for_status() payload = response.json() if payload.get("errors"): print(json.dumps(payload["errors"], indent=2)) else: return payload["data"] async def search_page(session, term, page, count): data = await graphql( session, SEARCH_QUERY, {"query": term, "page": page, "count": count, "sortBy": "price-ascending"}, ) nodes = [] for result in data["search"]["results"]: node = result.get("node") if node and node.get("tpnc"): nodes.append(node) return nodes async def category_page(session, cat_id, page, count): data = await graphql( session, CATEGORY_QUERY, {"facet": cat_id, "page": page, "count": count, "sortBy": "price-ascending"}, ) category = data["category"] nodes = [] for result in category["results"]: node = result.get("node") if node and node.get("tpnc"): nodes.append(node) return category["info"], nodes async def get_taxonomy(session): data = await graphql( session, TAXONOMY_QUERY, {"includeInspirationEvents": False, "configs": []}, ) return data["taxonomy"] def flatten_taxonomy(nodes, path=()): for node in nodes: current_path = (*path, node["name"]) yield { "catId": node["catId"], "name": node["name"], "label": node["label"], "path": " > ".join(current_path), } yield from flatten_taxonomy(node.get("children") or [], current_path) def category_filter(categories, args): selected = categories if args.category_label: selected = [c for c in selected if c["label"] in args.category_label] if args.category_contains: needles = [needle.lower() for needle in args.category_contains] selected = [ c for c in selected if any(needle in c["path"].lower() for needle in needles) ] return selected async def get_product(session, tpnc): data = await graphql(session, PRODUCT_QUERY, {"tpnc": tpnc}) return data["product"] async def bounded_gather(limit, items, fn): semaphore = asyncio.Semaphore(limit) async def run(item): async with semaphore: return await fn(item) return await asyncio.gather(*(run(item) for item in items)) async def scrape(args): seen = set() async with AsyncSession(headers=HEADERS, impersonate="chrome120", timeout=30) as session: with open(args.output, "w") as output: if args.categories: taxonomy = await get_taxonomy(session) categories = category_filter(list(flatten_taxonomy(taxonomy)), args) print(f"{len(categories)} categories selected", file=sys.stderr) for category in categories: print(f"category {category['path']!r}", file=sys.stderr) for page in range(1, args.pages + 1): info, rows = await category_page( session, category["catId"], page, args.count ) if not rows: break rows = [row for row in rows if row["tpnc"] not in seen] for row in rows: seen.add(row["tpnc"]) products = await bounded_gather( args.concurrency, rows, lambda row: get_product(session, row["tpnc"]), ) for product in products: product["matchedCategory"] = category json.dump(product, output, ensure_ascii=False) output.write("\n") output.flush() print( f" page {page}: {len(products)} new products " f"({info['total']} listed)", file=sys.stderr, ) if info["offset"] + info["count"] >= info["total"]: break return terms = args.term or DEFAULT_TERMS for term in terms: print(f"search {term!r}", file=sys.stderr) for page in range(1, args.pages + 1): rows = await search_page(session, term, page, args.count) if not rows: break rows = [row for row in rows if row["tpnc"] not in seen] for row in rows: seen.add(row["tpnc"]) products = await bounded_gather( args.concurrency, rows, lambda row: get_product(session, row["tpnc"]), ) for product in products: product["matchedSearchTerm"] = term json.dump(product, output, ensure_ascii=False) output.write("\n") output.flush() print( f" page {page}: {len(products)} new products", file=sys.stderr, ) if len(rows) < args.count: break def parse_args(): parser = argparse.ArgumentParser( description="Scrape Tesco-owned price and nutrition data through Tesco XAPI." ) parser.add_argument("term", nargs="*", help="Search terms to scrape") parser.add_argument("-o", "--output", default="tesco.jsonl") parser.add_argument( "--categories", action="store_true", help="Scrape Tesco taxonomy categories instead of keyword search.", ) parser.add_argument( "--category-label", action="append", choices=["superDepartment", "department", "aisle", "shelf"], help="Restrict --categories to a taxonomy level. Repeatable.", ) parser.add_argument( "--category-contains", action="append", help="Restrict --categories to paths containing this text. Repeatable.", ) parser.add_argument("--pages", type=int, default=2) parser.add_argument("--count", type=int, default=48) parser.add_argument("--concurrency", type=int, default=8) return parser.parse_args() if __name__ == "__main__": asyncio.run(scrape(parse_args()))