# phase4-sites.yaml — ENG-698 Phase 4 real-site coverage list
# =============================================================================
# Curated set of real third-party sites the Phase 4 statistical runner loads
# under a WFP-filtered Chrome to measure tracker-blocking recall/precision
# and host-URL resolution accuracy.
#
# This is consumed by Adblock.ReleaseTests' BlockingRecorder.RunRealSitesAsync
# via the Worker's `/phase4-sites` route (this YAML, parsed to JSON). It is NOT
# a hosted test page — the runner navigates Chrome to each `url` directly.
#
# -----------------------------------------------------------------------------
# FULL LIST (version 2) — ~215 sites.
# - Top US-traffic ad-supported sites (excluding adult content), bucketed
#   across the five categories.
# - Top 20 news sites in UK, CA, AU, FR, DE, IT (all bucketed as `news`;
#   the regional grouping is comment-organisation only — the schema has no
#   country field, and downstream aggregation is purely per-category).
#
# Curation methodology:
#   1. Seed from public top-traffic rankings (Tranco research-grade list +
#      country-specific SimilarWeb buckets); ad-supported sites only.
#   2. Exclude: adult content; sites with no third-party ad inventory
#      (banks, .gov/.edu, sites fully behind a paywall or login wall); and
#      streaming services that require auth to surface ads.
#   3. Cross-reference ad-heavy domains named in support tickets.
#   4. Add 5-10 deliberate worst-offender sentinels with a high
#      `expected_min_tracker_count` so a silent classifier/proxy regression
#      trips an obvious floor. Currently nine: CNN, Fox, Forbes, Business
#      Insider, BuzzFeed, NY Post, Daily Mail, Bild, Corriere.
#
# Fields:
#   url                        (required) absolute https URL.
#   category                   (required) one of: news | ecommerce | video |
#                              social | forums.
#   expected_min_tracker_count (optional) quality floor. A reachable site that
#                              loads fewer classifier-identified trackers than
#                              this is flagged `suspect` (still aggregated).
#
# Operational notes:
# - Per-site flakiness is expected on a real-site list (5xx, CAPTCHAs,
#   geo-blocks). RealSiteMetrics.ComputeRun excludes unreachable sites from
#   the aggregation; sites that consistently flake should be replaced
#   rather than special-cased.
# - The runner is in Germany (Hetzner CI). Some sites geo-redirect or
#   geo-skin their ad inventory; the host-URL accuracy result is unaffected
#   (the resolution algorithm is geo-agnostic) but tracker counts may
#   differ from a US-VPS run.
#
# Owner: @jcarlucci. Review cadence: quarterly.
# =============================================================================

# List revision; matches the "FULL LIST (version 2)" note in the header.
version: 2

# One mapping per site. `url` and `category` are required;
# `expected_min_tracker_count` is optional (see "Fields" in the header).
sites:

  # ===========================================================================
  # US — top-traffic ad-supported sites (~95 entries across 5 categories)
  # ===========================================================================

  # --- US news (35) ----------------------------------------------------------
  # Sentinels: CNN, Fox, Business Insider, Forbes, BuzzFeed, NY Post.
  # Each of the six sentinels sets expected_min_tracker_count: 15; the other
  # 29 entries set no floor.
  - url: https://www.cnn.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.foxnews.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.nytimes.com/
    category: news
  - url: https://www.washingtonpost.com/
    category: news
  - url: https://www.usatoday.com/
    category: news
  - url: https://www.wsj.com/
    category: news
  - url: https://www.nbcnews.com/
    category: news
  - url: https://abcnews.go.com/
    category: news
  - url: https://www.cbsnews.com/
    category: news
  - url: https://www.npr.org/
    category: news
  - url: https://www.reuters.com/
    category: news
  - url: https://apnews.com/
    category: news
  - url: https://www.bloomberg.com/
    category: news
  - url: https://www.businessinsider.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.forbes.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.cnbc.com/
    category: news
  - url: https://www.huffpost.com/
    category: news
  - url: https://www.theverge.com/
    category: news
  - url: https://techcrunch.com/
    category: news
  - url: https://mashable.com/
    category: news
  - url: https://www.buzzfeed.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.vice.com/
    category: news
  - url: https://gizmodo.com/
    category: news
  - url: https://arstechnica.com/
    category: news
  - url: https://www.engadget.com/
    category: news
  - url: https://www.vox.com/
    category: news
  - url: https://www.axios.com/
    category: news
  - url: https://www.politico.com/
    category: news
  - url: https://thehill.com/
    category: news
  - url: https://nypost.com/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.dailywire.com/
    category: news
  - url: https://www.breitbart.com/
    category: news
  - url: https://www.theatlantic.com/
    category: news
  - url: https://www.mediaite.com/
    category: news
  - url: https://www.newsweek.com/
    category: news

  # --- US ecommerce (20) -----------------------------------------------------
  # No sentinel floors in this bucket: every entry sets only `url` and
  # `category`.
  - url: https://www.amazon.com/
    category: ecommerce
  - url: https://www.ebay.com/
    category: ecommerce
  - url: https://www.etsy.com/
    category: ecommerce
  - url: https://www.walmart.com/
    category: ecommerce
  - url: https://www.target.com/
    category: ecommerce
  - url: https://www.bestbuy.com/
    category: ecommerce
  - url: https://www.costco.com/
    category: ecommerce
  - url: https://www.homedepot.com/
    category: ecommerce
  - url: https://www.lowes.com/
    category: ecommerce
  - url: https://www.macys.com/
    category: ecommerce
  - url: https://www.kohls.com/
    category: ecommerce
  - url: https://www.nordstrom.com/
    category: ecommerce
  - url: https://www.wayfair.com/
    category: ecommerce
  - url: https://www.samsclub.com/
    category: ecommerce
  - url: https://www.overstock.com/
    category: ecommerce
  - url: https://www.newegg.com/
    category: ecommerce
  - url: https://www.aliexpress.com/
    category: ecommerce
  - url: https://www.shein.com/
    category: ecommerce
  - url: https://www.temu.com/
    category: ecommerce
  - url: https://www.wish.com/
    category: ecommerce

  # --- US video (10) ---------------------------------------------------------
  # Streaming services that require auth to surface ads (Netflix, Disney+ etc.)
  # are deliberately excluded — they fail-fast on the marketing page or hit a
  # paywall that has no useful tracker inventory.
  # No sentinel floors in this bucket.
  - url: https://www.youtube.com/
    category: video
  - url: https://www.twitch.tv/
    category: video
  - url: https://www.dailymotion.com/
    category: video
  - url: https://vimeo.com/
    category: video
  - url: https://www.tubitv.com/
    category: video
  - url: https://pluto.tv/
    category: video
  - url: https://rumble.com/
    category: video
  - url: https://www.crunchyroll.com/
    category: video
  # NOTE(review): Peacock and Hulu look subscription-gated, which would
  # conflict with the auth exclusion stated above — confirm they surface ad
  # inventory to a logged-out visitor, or replace them.
  - url: https://www.peacocktv.com/
    category: video
  - url: https://www.hulu.com/
    category: video

  # --- US social (15) --------------------------------------------------------
  # No sentinel floors in this bucket.
  # NOTE(review): mastodon.social and bsky.app may carry no third-party ad
  # inventory, and vk.com does not look like a US site — confirm they belong
  # in a bucket the header describes as US ad-supported sites.
  - url: https://www.facebook.com/
    category: social
  - url: https://www.instagram.com/
    category: social
  - url: https://x.com/
    category: social
  - url: https://www.linkedin.com/
    category: social
  - url: https://www.tiktok.com/
    category: social
  - url: https://www.snapchat.com/
    category: social
  - url: https://www.threads.net/
    category: social
  - url: https://www.pinterest.com/
    category: social
  - url: https://www.tumblr.com/
    category: social
  - url: https://nextdoor.com/
    category: social
  - url: https://medium.com/
    category: social
  - url: https://www.flickr.com/
    category: social
  - url: https://mastodon.social/
    category: social
  - url: https://bsky.app/
    category: social
  - url: https://vk.com/
    category: social

  # --- US forums (15) --------------------------------------------------------
  # Tech-news sites with prominent forum sections (Tom's Hardware, Neowin,
  # BleepingComputer) live here rather than `news` because their inventory
  # shape matches forum-style sites more closely.
  # NOTE(review): Tom's Guide is bucketed here too but is not named above —
  # presumably the same rationale applies; confirm.
  # No sentinel floors in this bucket.
  - url: https://www.reddit.com/
    category: forums
  - url: https://www.quora.com/
    category: forums
  - url: https://stackoverflow.com/
    category: forums
  - url: https://stackexchange.com/
    category: forums
  - url: https://www.4chan.org/
    category: forums
  - url: https://slashdot.org/
    category: forums
  - url: https://www.fark.com/
    category: forums
  - url: https://forums.somethingawful.com/
    category: forums
  - url: https://www.xda-developers.com/
    category: forums
  - url: https://www.resetera.com/
    category: forums
  - url: https://www.neowin.net/
    category: forums
  - url: https://www.tomshardware.com/
    category: forums
  - url: https://www.tomsguide.com/
    category: forums
  - url: https://www.bleepingcomputer.com/
    category: forums
  - url: https://news.ycombinator.com/
    category: forums

  # ===========================================================================
  # Per-country news — top 20 each in UK / CA / AU / FR / DE / IT (~120 entries)
  # All bucketed `news`; the regional grouping is comment-organisation only.
  # ===========================================================================

  # --- UK news (20) ----------------------------------------------------------
  # Daily Mail is a deliberate sentinel — tabloid-style, very tracker-heavy.
  # Its floor is 20, above the 15 used for the US sentinels.
  # itv.com and channel4.com are entered at their /news path rather than the
  # site root.
  - url: https://www.bbc.co.uk/
    category: news
  - url: https://www.theguardian.com/
    category: news
  - url: https://www.dailymail.co.uk/
    category: news
    expected_min_tracker_count: 20
  - url: https://www.thetimes.co.uk/
    category: news
  - url: https://www.telegraph.co.uk/
    category: news
  - url: https://www.independent.co.uk/
    category: news
  - url: https://www.mirror.co.uk/
    category: news
  - url: https://www.thesun.co.uk/
    category: news
  - url: https://www.ft.com/
    category: news
  - url: https://www.economist.com/
    category: news
  - url: https://news.sky.com/
    category: news
  - url: https://metro.co.uk/
    category: news
  - url: https://www.express.co.uk/
    category: news
  - url: https://www.itv.com/news
    category: news
  - url: https://www.channel4.com/news
    category: news
  - url: https://www.standard.co.uk/
    category: news
  - url: https://www.dailyrecord.co.uk/
    category: news
  - url: https://www.heraldscotland.com/
    category: news
  - url: https://www.spectator.co.uk/
    category: news
  - url: https://www.newstatesman.com/
    category: news

  # --- CA news (20) ----------------------------------------------------------
  # No sentinel in this group: every entry sets only `url` and `category`.
  - url: https://www.cbc.ca/
    category: news
  - url: https://www.theglobeandmail.com/
    category: news
  - url: https://nationalpost.com/
    category: news
  - url: https://www.thestar.com/
    category: news
  - url: https://www.ctvnews.ca/
    category: news
  - url: https://globalnews.ca/
    category: news
  - url: https://www.lapresse.ca/
    category: news
  - url: https://www.ledevoir.com/
    category: news
  - url: https://www.journaldemontreal.com/
    category: news
  - url: https://www.tvanouvelles.ca/
    category: news
  - url: https://ici.radio-canada.ca/
    category: news
  - url: https://www.macleans.ca/
    category: news
  - url: https://www.cp24.com/
    category: news
  - url: https://toronto.citynews.ca/
    category: news
  - url: https://www.winnipegfreepress.com/
    category: news
  - url: https://ottawacitizen.com/
    category: news
  - url: https://calgaryherald.com/
    category: news
  - url: https://vancouversun.com/
    category: news
  - url: https://theprovince.com/
    category: news
  - url: https://www.timescolonist.com/
    category: news

  # --- AU news (20) ----------------------------------------------------------
  # No sentinel in this group: every entry sets only `url` and `category`.
  # abc.net.au, theconversation.com and sbs.com.au are entered at a section
  # path (/news, /au) rather than the site root.
  - url: https://www.abc.net.au/news
    category: news
  - url: https://www.smh.com.au/
    category: news
  - url: https://www.theaustralian.com.au/
    category: news
  - url: https://www.news.com.au/
    category: news
  - url: https://www.9news.com.au/
    category: news
  - url: https://7news.com.au/
    category: news
  - url: https://www.heraldsun.com.au/
    category: news
  - url: https://www.dailytelegraph.com.au/
    category: news
  - url: https://www.theage.com.au/
    category: news
  - url: https://www.couriermail.com.au/
    category: news
  - url: https://www.afr.com/
    category: news
  - url: https://www.perthnow.com.au/
    category: news
  - url: https://www.crikey.com.au/
    category: news
  - url: https://theconversation.com/au
    category: news
  - url: https://www.sbs.com.au/news
    category: news
  - url: https://www.brisbanetimes.com.au/
    category: news
  - url: https://www.canberratimes.com.au/
    category: news
  - url: https://www.adelaidenow.com.au/
    category: news
  - url: https://thewest.com.au/
    category: news
  - url: https://www.skynews.com.au/
    category: news

  # --- FR news (20) ----------------------------------------------------------
  # No sentinel in this group: every entry sets only `url` and `category`.
  # rfi.fr and france24.com are entered at their /fr/ path rather than the
  # site root — presumably to pin the French-language edition; confirm.
  - url: https://www.lemonde.fr/
    category: news
  - url: https://www.lefigaro.fr/
    category: news
  - url: https://www.liberation.fr/
    category: news
  - url: https://www.leparisien.fr/
    category: news
  - url: https://www.20minutes.fr/
    category: news
  - url: https://www.lesechos.fr/
    category: news
  - url: https://www.lepoint.fr/
    category: news
  - url: https://www.lexpress.fr/
    category: news
  - url: https://www.nouvelobs.com/
    category: news
  - url: https://www.mediapart.fr/
    category: news
  - url: https://www.francetvinfo.fr/
    category: news
  - url: https://www.bfmtv.com/
    category: news
  - url: https://www.tf1info.fr/
    category: news
  - url: https://www.rfi.fr/fr/
    category: news
  - url: https://www.france24.com/fr/
    category: news
  - url: https://www.ouest-france.fr/
    category: news
  - url: https://www.la-croix.com/
    category: news
  - url: https://www.humanite.fr/
    category: news
  - url: https://www.valeursactuelles.com/
    category: news
  - url: https://www.leprogres.fr/
    category: news

  # --- DE news (20) ----------------------------------------------------------
  # Bild is a deliberate sentinel — German tabloid, very tracker-heavy.
  # Its floor (25) is the highest expected_min_tracker_count in this file.
  # NOTE(review): the header says the runner is in Germany; if bild.de shows
  # a consent wall to that runner, confirm 25 classifier-identified trackers
  # are still reached, otherwise this sentinel will always read `suspect`.
  - url: https://www.spiegel.de/
    category: news
  - url: https://www.bild.de/
    category: news
    expected_min_tracker_count: 25
  - url: https://www.zeit.de/
    category: news
  - url: https://www.faz.net/
    category: news
  - url: https://www.welt.de/
    category: news
  - url: https://www.sueddeutsche.de/
    category: news
  - url: https://www.stern.de/
    category: news
  - url: https://www.focus.de/
    category: news
  - url: https://www.n-tv.de/
    category: news
  - url: https://www.tagesschau.de/
    category: news
  - url: https://www.tagesspiegel.de/
    category: news
  - url: https://www.handelsblatt.com/
    category: news
  - url: https://www.wiwo.de/
    category: news
  - url: https://taz.de/
    category: news
  - url: https://www.heise.de/
    category: news
  - url: https://www.golem.de/
    category: news
  - url: https://www.t-online.de/
    category: news
  - url: https://www.rtl.de/
    category: news
  - url: https://www1.wdr.de/
    category: news
  - url: https://www.ndr.de/
    category: news

  # --- IT news (20) ----------------------------------------------------------
  # Corriere is a deliberate sentinel (named in the header's sentinel list);
  # its floor is 15, the same value used for the US sentinels.
  - url: https://www.corriere.it/
    category: news
    expected_min_tracker_count: 15
  - url: https://www.repubblica.it/
    category: news
  - url: https://www.lastampa.it/
    category: news
  - url: https://www.ilsole24ore.com/
    category: news
  - url: https://www.ilfattoquotidiano.it/
    category: news
  - url: https://www.gazzetta.it/
    category: news
  - url: https://www.ansa.it/
    category: news
  - url: https://www.tgcom24.mediaset.it/
    category: news
  - url: https://www.ilmessaggero.it/
    category: news
  - url: https://www.ilgiornale.it/
    category: news
  - url: https://www.ilmattino.it/
    category: news
  - url: https://www.liberoquotidiano.it/
    category: news
  - url: https://www.corrieredellosport.it/
    category: news
  - url: https://www.rainews.it/
    category: news
  - url: https://www.avvenire.it/
    category: news
  - url: https://www.open.online/
    category: news
  - url: https://www.ilpost.it/
    category: news
  - url: https://www.fanpage.it/
    category: news
  - url: https://tg24.sky.it/
    category: news
  - url: https://www.huffingtonpost.it/
    category: news
