diff --git a/domain_classifier/classifier.py b/domain_classifier/classifier.py index 5ea9a11..4d03c3b 100644 --- a/domain_classifier/classifier.py +++ b/domain_classifier/classifier.py @@ -7,10 +7,9 @@ import time from typing import Optional -from urllib.parse import urlparse - from domain_classifier.config import settings from domain_classifier.models.domain import DomainDetails, DomainResult +from domain_classifier.parking import is_parking_redirect from domain_classifier.pipeline.content import analyze_content from domain_classifier.pipeline.dns_check import check_dns from domain_classifier.pipeline.fetcher import fetch @@ -116,47 +115,6 @@ def _is_government_domain(domain: str) -> bool: return False -# Domains that serve parking / for-sale landers — redirecting to these means -# the original domain is parked, not simply forwarded to live content. -_PARKING_REDIRECT_HOSTS = { - "forsale.godaddy.com", "godaddy.com", - "sedo.com", "sedoparking.com", - "dan.com", "afternic.com", - "hugedomains.com", "domainmarket.com", - "parkingcrew.net", "bodis.com", - "above.com", "epik.com", - "undeveloped.com", "efty.com", - "buydomains.com", "domcop.com", - "squadhelp.com", - # Content farm / affiliate parking networks - "searchhounds.com", "dot-software.org", "dot-film.org", "dot-loans.org", - # Affiliate monetization — domains redirected here are monetized abandoned/expired - "gocomper.com", - # Expired domain marketplaces - "expireddomains.com", - # Traffic arbitrage / search redirect networks - "resultlookup.com", "onlineresultfinder.com", - "trkflow.xyz", - # Registrar parking (Network Solutions) - "networksolutions.com", -} - - -def _is_parking_redirect(final_url: str) -> bool: - """Return True if the redirect destination is a known parking/for-sale provider.""" - parsed = urlparse(final_url) - host = parsed.netloc.lower().removeprefix("www.") - if host in _PARKING_REDIRECT_HOSTS or any(host.endswith("." 
+ p) for p in _PARKING_REDIRECT_HOSTS): - return True - # Content farm affiliate network signature: ?psystem=XX&domain=&oref=... - # This URL pattern is used by multiple dot-* domains and partner networks to - # redirect expired/parked domains through SEO article landers. - q = parsed.query - if "psystem=" in q and "domain=" in q and "oref=" in q: - return True - return False - - async def classify_domain(domain: str) -> DomainResult: """Run the full classification pipeline for a single domain.""" domain = domain.strip().lower().removeprefix("https://").removeprefix("http://").split("/")[0] @@ -239,7 +197,7 @@ async def classify_domain(domain: str) -> DomainResult: if fetch_result.redirected_to_other_domain: details.redirects_to = fetch_result.final_url - if _is_parking_redirect(fetch_result.final_url or ""): + if is_parking_redirect(fetch_result.final_url or ""): result.classification = "parked" result.classification_score = 1.0 result.grade = assign_grade("parked", 1.0) diff --git a/domain_classifier/parking.py b/domain_classifier/parking.py new file mode 100644 index 0000000..dfa023b --- /dev/null +++ b/domain_classifier/parking.py @@ -0,0 +1,226 @@ +""" +Parking domain detection — reference data and redirect classification. + +PARKING_REDIRECT_HOSTS is the canonical set of domains that serve parking or +for-sale landers. If a domain's HTTP fetch resolves to one of these hosts +(exact match or subdomain), the source domain is classified as parked rather +than a live redirect. + +Organised by category so additions are easy to place correctly. +""" +from __future__ import annotations + +from urllib.parse import urlparse + + +# --------------------------------------------------------------------------- +# Reference data — parking / for-sale redirect destinations +# --------------------------------------------------------------------------- + +# Domain for-sale marketplaces — broker or auction sites where expired/listed +# domains land when put up for sale. 
+_FOR_SALE_MARKETPLACES = { + "dan.com", # Formerly undeveloped.com auction arm; now part of GoDaddy + "afternic.com", # GoDaddy premium marketplace + "sedo.com", # Major international marketplace + "hugedomains.com", # Large portfolio seller + "domainmarket.com", + "efty.com", # Marketplace platform + "buydomains.com", + "domcop.com", # Expired domain marketplace + "squadhelp.com", # Brandable name marketplace + "expireddomains.com", + "flippa.com", # Domain + website marketplace + "saw.com", # Sedo Afternic Wholesale + "sav.com", # Domain marketplace / parking hybrid + "odys.co", # Aged domain marketplace (was undeveloped.com) + "brandpa.com", # Brandable domain marketplace + "greatdomains.com", # Premium domain sales + "atom.com", # Short domain marketplace + "epik.com", # Registrar with active for-sale marketplace + "undeveloped.com", # Legacy URL; kept for older redirects + "perfectdomain.com", # Domain for-sale marketplace + "uniregistry.com", # Frank Schilling marketplace; acquired by GoDaddy + "brandbucket.com", # Brandable domain marketplace + "aftermarket.pl", # Polish domain marketplace + "domainmarkt.de", # German domain marketplace + "domainname.ru", # Russian domain marketplace + "com.com", # Domains redirected here are listed for sale + "sawbrokers.com", # Domain broker + "venture.com", # Domain portfolio / parking + "domainworld.org", # Domain marketplace + "directdomains.com", # Domain broker + "dropcatch.com", # Expired domain auction / catch service + "dynadot.com", # Registrar with active parking/for-sale lander + "uk.com", # UK domain reseller; parked domains land here +} + +# Registrar-operated parking pages — the registrar's own placeholder/for-sale +# lander shown when a domain resolves but has no real content. 
+_REGISTRAR_PARKING = { + "forsale.godaddy.com", # GoDaddy for-sale lander (subdomain) + "godaddy.com", # GoDaddy generic parking fallback + "secureserver.net", # GoDaddy / Wild West Domains hosting network; parked lander + "myregisteredsite.com", # GoDaddy "coming soon" / placeholder page + "sedoparking.com", # Sedo parking CDN + "parkingcrew.net", # GoDaddy/Afternic parking network + "networksolutions.com", # Web.com / Network Solutions registrar parking + "parked.com", # Generic parking provider + "parking.com", # Generic parking provider + "domainparking.com", # Generic parking provider + "reg.ru", # Russian registrar; parked domains redirect here + "expiredwixdomain.com", # Wix expired/cancelled domain lander + "homestead.com", # Expired Homestead website builder domains land here + "hostgator.com", # Suspended HostGator accounts redirect here + "freenom.com", # Free domain registrar; expired domains return here + "affordablewebhosting.com", # Hosting placeholder / parking + "goneo.de", # German hosting with parked domain landers + "website-start.de", # German hosting placeholder page + "netsons.com", # Italian hosting with parking landers + "home.pl", # Polish registrar; parked domains redirect here +} + +# Parking monetisation networks — ad-revenue sharing platforms that serve PPC +# landers on parked domains. 
+_PARKING_MONETISATION = { + "bodis.com", # Domain monetisation network + "above.com", # Domain monetisation network + "domainsponsor.com", # Established PPC parking network + "cashparking.com", # PPC parking + "trellian.com", # Domain advertising / parking + "smartname.com", # Parking monetisation + "parklogic.com", # Parking yield optimiser + "namedrive.com", # Domain parking provider + "whypark.com", # Self-serve parking platform + "parkingpanther.com", # Parking provider + "dpml.com", # Donuts Premium Monetisation Layer + "domainzaar.com", # Domain monetisation + "simcast.com", # Domain parking service + "verifymywhois.com", # WHOIS verification / parking placeholder +} + +# Traffic arbitrage and search redirect networks — domains redirected here are +# monetised through click-through ad revenue, not parked in the traditional +# sense but indistinguishable from a user perspective. +_TRAFFIC_ARBITRAGE = { + "resultlookup.com", + "onlineresultfinder.com", + "findresultsonline.com", + "findresultsquick.com", + "bestofbrowsing.com", + "trkflow.xyz", + "trafficvance.com", + "hitfarm.com", # Click-farm traffic arbitrage + "trafficz.com", + "searchhounds.com", + "citygrid.com", # Local business PPC / search redirect + "himado.com", # Search redirect / traffic arbitrage + "perfdrive.com", # Traffic arbitrage + "quotes.com", # Content farm / PPC lander + "googleblog.com", # Domain squatter; redirected domains land on PPC lander +} + +# Content farm affiliate networks — expired domains redirected through these +# networks serve SEO article landers monetised via affiliate commissions. +# Many share the same URL signature: ?psystem=XX&domain=&oref=... 
+_CONTENT_FARM_AFFILIATES = { + "cdn-fileserver.com", + "gocomper.com", + "dot-software.org", + "dot-film.org", + "dot-loans.org", + "funtimefarms.com", # Local-interest content farm lander + "importantlocalbusinesses.com",# Local business directory content farm + "dealercarsearch.com", # Auto dealer content farm / affiliate lander + "offcarrot.com", # Affiliate / content farm lander + "survey-smiles.com", # Survey affiliate lander + "supportcharity.com", # Charity affiliate lander + "registrar-transfers.com", # Domain transfer placeholder / affiliate + # Low-signal / obfuscated parking network nodes + "xcfss.xyz", + "dnfs24.com", + "haxbyq.com", + "static.uni5.net", + "shbzek.com", + "szqxvo.com", + "asry4eyw2lqk6.com", + "evo-media.eu", + "realtydao.com", + "daaz.com", + "contrib.com", +} + +# --------------------------------------------------------------------------- +# URL-pattern parking rules +# +# Some major brands serve pages that look like acquisitions when a domain +# redirects there, but are actually parking indicators when the URL matches +# a specific pattern. Format: (host, url_substring_that_must_match). +# +# Used by is_parking_redirect() — both the host AND the substring must match. +# Add new entries below; keep them sorted by host for readability. +# --------------------------------------------------------------------------- + +URL_PARKING_PATTERNS: list[tuple[str, str]] = [ + # Google Workspace / G Suite — when a domain's account expires or is + # deprovisioned, HTTP requests are bounced to the Google sign-in page + # with gws_rd=ssl. This is not an acquisition; the domain is dead. + ("accounts.google.com", "gws_rd=ssl"), + + # Servers Australia "help.com.au" — hosting-provider parking page shown + # for expired/unclaimed customer domains. Landing URL includes ?d=. 
+    # Without this pattern the redirect looks like an acquisition signal
+    # (help.com.au is a live hosting co with its own ultimate), so the ?d=
+    # marker is the surgical indicator that the source is parked.
+    ("help.com.au", "?d="),
+
+    # Wix — explicitly deactivated or parked Wix sites. Generic Wix redirects
+    # are caught as "platform" in redirect_detector; these specific URL patterns
+    # indicate the domain itself is dead, warranting grade C.
+    ("wix.com", "parking?"),
+    ("wix.com", "deactivate-domain"),
+]
+
+
+# Canonical union — this is the set used for redirect classification.
+PARKING_REDIRECT_HOSTS: frozenset[str] = frozenset(
+    _FOR_SALE_MARKETPLACES
+    | _REGISTRAR_PARKING
+    | _PARKING_MONETISATION
+    | _TRAFFIC_ARBITRAGE
+    | _CONTENT_FARM_AFFILIATES
+)
+
+
+# ---------------------------------------------------------------------------
+# Detection
+# ---------------------------------------------------------------------------
+
+def is_parking_redirect(final_url: str) -> bool:
+    """Return True if the redirect destination is a known parking or for-sale provider.
+
+    Matches the URL's hostname (port/userinfo ignored, "www." stripped) exactly
+    or as a subdomain of a listed host; also matches the content-farm query
+    signature (?psystem=...&domain=...&oref=...) and the URL_PARKING_PATTERNS rules.
+    """
+    parsed = urlparse(final_url)
+    host = (parsed.hostname or "").rstrip(".").removeprefix("www.")  # no port/userinfo
+
+    if host in PARKING_REDIRECT_HOSTS:
+        return True
+    if any(host.endswith("." + p) for p in PARKING_REDIRECT_HOSTS):
+        return True
+
+    # Content farm affiliate signature shared across dot-* domains and partners:
+    # ?psystem=XX&domain=&oref=...
+    q = parsed.query
+    if "psystem=" in q and "domain=" in q and "oref=" in q:
+        return True
+
+    # URL-pattern rules: major-brand hosts that serve parking/dead-account pages
+    # at specific URLs. Both the host and the substring must match.
+    for pattern_host, pattern_substr in URL_PARKING_PATTERNS:
+        if host == pattern_host or host.endswith("." + pattern_host):
+            if pattern_substr in final_url:
+                return True
+    return False
diff --git a/domain_classifier/platforms.py b/domain_classifier/platforms.py
new file mode 100644
index 0000000..1df8873
--- /dev/null
+++ b/domain_classifier/platforms.py
@@ -0,0 +1,329 @@
+"""
+Hosting / SaaS platform redirect detection — reference data and helpers.
+
+These lists describe redirect destinations that indicate the source domain is
+hosting on (or has expired from) a platform, NOT that it has been acquired.
+
+Consumed by downstream redirect classifiers — e.g. the Domain Intelligence
+pipeline uses these to distinguish "platform" redirects from genuine
+acquisition / brand-consolidation signals before emitting change events.
+
+Relationship to parking.py:
+  - A host in PARKING_REDIRECT_HOSTS indicates the source domain is dead or
+    for-sale (grade C). That signal is stronger than "platform".
+  - Hosts that are primarily platforms but also see occasional parking traffic
+    live only in this file; hosts that are primarily parking landers live
+    only in parking.py. Membership is mutually exclusive by convention —
+    parking wins on overlap.
+"""
+from __future__ import annotations
+
+import re
+from typing import Optional
+from urllib.parse import unquote, urlparse
+
+
+# ---------------------------------------------------------------------------
+# Platform URL patterns
+#
+# Some domains that ARE legitimate acquirers (Apple, Google, Microsoft, ...) also
+# operate content channels and infrastructure subdomains. A redirect to one of
+# those specific subdomains is NOT an acquisition signal — it means the source
+# domain is a content property of, or was hosted on, that platform.
+#
+# Format: (exact_host, url_substring_or_None)
+#   - exact_host    : full subdomain to match (e.g.
"tv.apple.com") +# - url_substring : if set, treated as a regex (re.search) against the +# URL-decoded URL, case-insensitive; None = match all +# +# Add new entries below; sort by apex domain then subdomain. +# --------------------------------------------------------------------------- + +URL_PLATFORM_PATTERNS: list[tuple[str, Optional[str]]] = [ + # Apple — content distribution channels. Apple News and Apple TV are + # platforms, not acquisition indicators. + ("news.apple.com", None), + ("tv.apple.com", None), + + # Google — user-content platforms. A redirect to sites.google.com, or to + # Docs/Drive/Forms/Groups, means the source domain is using a Google + # consumer product as its website. Not an acquisition by Google. + ("sites.google.com", None), + ("docs.google.com", None), + ("drive.google.com", None), + ("forms.google.com", None), + ("groups.google.com", None), + + # Microsoft — authentication / identity infrastructure. Redirects to the + # M365 / Azure AD login page mean the domain's subscription lapsed; + # they are not acquisitions by Microsoft. + ("login.microsoftonline.com", None), + + # Rackspace — redirects to the Rackspace apps portal indicate the domain + # was using Rackspace for email / cloud services. + ("apps.rackspace.com", None), + + # Carrot (real-estate website builder) — inactive.carrot.com is the + # specific subdomain served when a Carrot customer's account is cancelled. + ("inactive.carrot.com", None), + + # Social media profiles — redirects to user/company profile pages are + # not acquisitions. Uses regex to match specific path segments only so + # that a genuine acquisition announcement (e.g. linkedin.com/pulse/...) is + # not suppressed. + ("linkedin.com", r"/(in|admin|company|pub)/"), + ("facebook.com", None), + + # E-commerce & marketplace seller/store pages. 
+ ("ebay.com", r"/(usr|str|sch)/"), + ("etsy.com", r"/(shop|people)/"), + ("amazon.com", r"/(gp|dp|stores|e|s)/|marketplaceID|merchant="), + + # Real-estate franchise pages — agent profiles and listings. + ("century21.com", r"agent|homesforsale|century-21|real-estate"), + ("remax.com", r"real-estate-agents|offices|real-estate"), + + # Business directories & ratings platforms. + ("angi.com", "companylist"), + ("yelp.com", r"/biz/"), + + # Hospitality chains — individual hotel / property pages. + ("hilton.com", "hotels"), + ("marriott.com", "hotels"), + + # Insurance carrier agent-finder / office-locator pages. + ("agents.farmers.com", None), + ("northwesternmutual.com", "financialadvisor"), + ("erieinsurance.com", "find-an-insurance-agent"), + ("hubinternational.com", "offices"), + + # Media & entertainment — channel / show pages. + ("youtube.com", "channel"), + ("iheart.com", None), + + # Messaging platforms — click-to-chat and API links. + ("chat.whatsapp.com", None), + ("api.whatsapp.com", None), + + # Photography portfolio platforms. + ("smugmug.com", None), + + # Reference / encyclopaedia. + ("wikipedia.org", r"/wiki/"), + + # Veterinary / pet-care chain location pages. + ("thrivepetcare.com", "locations"), + + # Telecom business portals. + ("verizon.com", "business"), + + # Automotive parts recycler directory. + ("car-part.com", "recycler"), + + # Private club networks. + ("invitedclubs.com", "clubs"), + + # Restaurant / food-service platforms — disabled account pages. + ("slicelife.com", "display_disabled"), + ("slicelife.com", "restaurants"), + ("telefloristonline.com", "notactive"), + + # Property / apartment platforms — listing pages, not acquisitions. + ("rentcafe.com", "apartments"), + + # Business directories — local / trade listing pages. + ("yellowpages.ca", "bus"), + + # Cloud / hosting providers — placeholder or lock pages for inactive accounts. 
+ ("aliyun.com", "hosting"), + ("qcloud.com", "weblock"), + + # Blog / social platforms — redirects to generic explore/discover pages + # indicate the custom-domain blog was deleted, not an acquisition. + ("tumblr.com", "explore"), + + # ISP / legacy hosting — placeholder pages for expired hosting accounts. + ("earthlink.net", "internetbusiness-internet"), + + # Brand / manufacturer redirects — product or dealer pages on an OEM site + # are brand consolidations, not acquisitions. + ("volvocars.com", "cars"), + ("volvocars.com", "dealers"), + + # E-commerce platforms — 404 / missing-store pages. + ("alibaba.com", "error404"), +] + + +# --------------------------------------------------------------------------- +# Platform redirect hosts — apex-level +# +# Redirecting to these domains does NOT indicate an acquisition or brand +# consolidation. They are hosting platforms, SaaS builders, registrar +# portals, or ISP default pages — a redirect here means the source domain is +# using (or has expired from) a platform, not that it was bought by one. +# +# Hosts that are primarily parking/for-sale indicators live in parking.py +# (PARKING_REDIRECT_HOSTS); parking wins on overlap. +# --------------------------------------------------------------------------- + +PLATFORM_REDIRECT_HOSTS: frozenset[str] = frozenset({ + # Website / blog builders — expired or suspended accounts land on these. + # (homestead.com is in parking._REGISTRAR_PARKING — parking wins.) 
+ "wordpress.com", + "wix.com", + "squarespace.com", + "weebly.com", + "webflow.com", + "placester.com", # Real-estate agent site builder + "appmaster.io", # No-code app builder + + # Funnel / landing-page platforms + "clickfunnels.com", + + # Link-in-bio / short link services — not acquisition signals + "heylink.me", + "bitly.com", + "linktree.com", + "linktr.ee", # Linktree's short-domain variant + + # Hosting providers — suspended / expired account landing pages + "lolipop.jp", # Japanese managed hosting + "timeweb.ru", # Russian hosting / cloud + "webgo24.de", # German shared hosting + "mybluehost.me", # Bluehost managed WordPress + "gabia.io", # Korean hosting / domain registrar + + # Domain broker / for-sale landers that aren't in parking.py because + # they also front live registrar content. + "domainname.de", + "domain-akquise.de", + + # Large consumer portals — too broad to be acquisition signals + "yahoo.com", + + # Hosting providers — suspended accounts redirect to provider domain + "hawkhost.com", + "ovhcloud.com", + "orange.fr", # French ISP / telecom default page + + # Domain registries / admin pages — not commercial acquirers + "denic.de", # German .de registry + + # B2B / trade directory platforms + "tradeindia.com", + + # Consumer security / VPN landing — surfaces as a false destination for + # dozens of unrelated domains (Cloudflare-fronted, Alibaba-hosted, etc.); + # almost certainly an httpfetch mis-capture, never a real acquisition signal. + "pango-cloud.com", +}) + + +# --------------------------------------------------------------------------- +# Platform hostname suffixes +# +# Matched against the raw final-URL hostname (not the apex). A redirect +# ending on one of these is almost always the source's hosting platform or +# a CDN / edge endpoint — NOT an acquisition. 
Checked in addition to the
+# apex-based PLATFORM_REDIRECT_HOSTS because the PSL treats several of
+# these as registrable suffixes, so apex extraction strips below them
+# (e.g. "mybucket.s3.amazonaws.com" has apex "mybucket.s3.amazonaws.com",
+# not "amazonaws.com"). Leading dot required so that partial-word matches
+# are prevented.
+# ---------------------------------------------------------------------------
+
+PLATFORM_HOSTNAME_SUFFIXES: frozenset[str] = frozenset({
+    # AWS
+    ".cloudfront.net",
+    ".amazonaws.com",
+    ".elasticbeanstalk.com",
+    ".awsapps.com",
+    # Google Cloud / Firebase
+    ".appspot.com",
+    ".web.app",
+    ".firebaseapp.com",
+    ".googleusercontent.com",
+    ".run.app",  # Cloud Run
+    ".cloudfunctions.net",
+    # Microsoft Azure
+    ".azurewebsites.net",
+    ".cloudapp.net",
+    ".cloudapp.azure.com",
+    ".windows.net",
+    ".azureedge.net",
+    # Heroku / Salesforce platform
+    ".herokuapp.com",
+    ".herokudns.com",
+    # Vercel / Netlify / Cloudflare Pages / DO
+    ".vercel.app",
+    ".netlify.app",
+    ".netlify.com",
+    ".pages.dev",
+    ".workers.dev",
+    ".ondigitalocean.app",
+    # Static-site hosts
+    ".github.io",
+    ".gitlab.io",
+    ".bitbucket.io",
+    # CDN / edge
+    ".fastly.net",
+    ".akamaihd.net",
+    ".akamaized.net",
+    ".edgesuite.net",
+    ".edgekey.net",
+    # Managed WordPress
+    ".wpengine.com",
+    ".wpenginepowered.com",
+    ".kinsta.cloud",
+    ".flywheelsites.com",
+    # Shopify storefront domains
+    ".myshopify.com",
+})
+
+
+# ---------------------------------------------------------------------------
+# Detection helpers
+# ---------------------------------------------------------------------------
+
+def is_platform_url(url: str) -> bool:
+    """Return True if url matches a known platform subdomain/URL pattern.
+
+    Matching rules:
+      - hostname (no port/userinfo/www.) equals pattern_host or is a subdomain
+      - pattern is None → any URL on that host matches
+      - pattern is a string → treated as a regex (re.search) against the
+        URL-decoded URL, case-insensitive. Simple literals (e.g. "hotels")
+        work unchanged; OR patterns use standard regex syntax (e.g.
+        "agent|homesforsale").
+      - URL is decoded before regex matching so %2F-encoded paths compare
+        cleanly against unencoded patterns.
+    """
+    if not url:
+        return False
+    parsed = urlparse(url)
+    host = (parsed.hostname or "").rstrip(".").removeprefix("www.")  # no port/userinfo
+    url_decoded = unquote(url)
+    for pattern_host, pattern in URL_PLATFORM_PATTERNS:
+        if host == pattern_host or host.endswith("." + pattern_host):
+            # NOTE(review): the regex sees the whole URL, host included, so a literal
+            # such as "cars" always matches on volvocars.com — confirm intended.
+            if pattern is None or re.search(pattern, url_decoded, re.IGNORECASE):
+                return True
+    return False
+
+
+def is_platform_host(hostname: str) -> bool:
+    """Return True if hostname ends with a known platform / CDN suffix.
+
+    Suffixes include the leading dot; a bare apex that matches the suffix
+    (minus the dot) also counts — e.g. hostname='cloudfront.net' matches
+    '.cloudfront.net' via the second branch.
+    """
+    if not hostname:
+        return False
+    h = hostname.lower().rstrip(".").removeprefix("www.")  # tolerate FQDN trailing dot
+    for suf in PLATFORM_HOSTNAME_SUFFIXES:
+        if h.endswith(suf) or h == suf[1:]:
+            return True
+    return False
diff --git a/tests/test_parking.py b/tests/test_parking.py
new file mode 100644
index 0000000..06f2afa
--- /dev/null
+++ b/tests/test_parking.py
@@ -0,0 +1,67 @@
+"""Tests for parking/for-sale redirect detection."""
+from __future__ import annotations
+
+import pytest
+
+from domain_classifier.parking import (
+    PARKING_REDIRECT_HOSTS,
+    URL_PARKING_PATTERNS,
+    is_parking_redirect,
+)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://godaddy.com/",
+        "https://www.godaddy.com/lander",
+        "https://forsale.godaddy.com/foo",
+        "https://sedo.com/search/?domain=x.com",
+        "https://sedoparking.com/x.com",
+        "https://parkingcrew.net/somepath",
+        "https://expired.example.com/?psystem=xx&domain=x.com&oref=1",
+    ],
+)
+def test_parking_host_matches(url: str) -> None:
+    assert is_parking_redirect(url)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+
"https://help.com.au/?d=websitedesigntoorak.com.au", + "https://www.help.com.au/?d=foo.com", + "https://accounts.google.com/v3/signin?continue=x&gws_rd=ssl", + "https://wix.com/deactivate-domain", + "https://www.wix.com/parking?domain=x", + ], +) +def test_url_pattern_matches(url: str) -> None: + assert is_parking_redirect(url) + + +@pytest.mark.parametrize( + "url", + [ + "https://help.com.au/", # bare host w/o ?d= is not parking + "https://accounts.google.com/", # no gws_rd=ssl + "https://wix.com/about", # active Wix page + "https://example.com/", # unrelated + "", + ], +) +def test_non_parking(url: str) -> None: + assert not is_parking_redirect(url) + + +def test_homestead_stays_in_parking() -> None: + # Sanity: homestead.com was deduped out of platforms.py and must still + # be considered parking. See PR that introduced the parking/platforms + # split. + assert "homestead.com" in PARKING_REDIRECT_HOSTS + + +def test_url_patterns_non_empty() -> None: + assert URL_PARKING_PATTERNS + hosts = {host for host, _ in URL_PARKING_PATTERNS} + assert "help.com.au" in hosts diff --git a/tests/test_platforms.py b/tests/test_platforms.py new file mode 100644 index 0000000..6f1b7f2 --- /dev/null +++ b/tests/test_platforms.py @@ -0,0 +1,80 @@ +"""Tests for hosting/SaaS platform redirect detection.""" +from __future__ import annotations + +import pytest + +from domain_classifier.platforms import ( + PLATFORM_HOSTNAME_SUFFIXES, + PLATFORM_REDIRECT_HOSTS, + URL_PLATFORM_PATTERNS, + is_platform_host, + is_platform_url, +) + + +@pytest.mark.parametrize( + "url", + [ + "https://sites.google.com/site/foo", + "https://tv.apple.com/us/show/xyz", + "https://linkedin.com/in/someone", + "https://www.linkedin.com/company/acme", + "https://ebay.com/usr/seller123", + "https://amazon.com/dp/B0001", + "https://century21.com/agent/jane", + "https://login.microsoftonline.com/common/oauth2/authorize", + ], +) +def test_platform_url_matches(url: str) -> None: + assert is_platform_url(url) + + 
+@pytest.mark.parametrize( + "url", + [ + "https://linkedin.com/pulse/some-acquisition-news", # not a profile path + "https://ebay.com/itm/12345", # item, not usr/str/sch + "https://example.com/", + "", + ], +) +def test_non_platform_url(url: str) -> None: + assert not is_platform_url(url) + + +@pytest.mark.parametrize( + "host", + [ + "foo.cloudfront.net", + "cloudfront.net", + "abc.herokuapp.com", + "store.myshopify.com", + "x.y.z.amazonaws.com", + ], +) +def test_platform_host_matches(host: str) -> None: + assert is_platform_host(host) + + +@pytest.mark.parametrize( + "host", + [ + "example.com", + "", + "cloudfrontnet.com", # partial-word must not match + ], +) +def test_non_platform_host(host: str) -> None: + assert not is_platform_host(host) + + +def test_homestead_not_in_platforms() -> None: + # homestead.com is deliberately in parking.py instead (parking wins on + # overlap). Guard against accidental re-add. + assert "homestead.com" not in PLATFORM_REDIRECT_HOSTS + + +def test_structures_non_empty() -> None: + assert PLATFORM_REDIRECT_HOSTS + assert PLATFORM_HOSTNAME_SUFFIXES + assert URL_PLATFORM_PATTERNS