Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 2 additions & 44 deletions domain_classifier/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
import time
from typing import Optional

from urllib.parse import urlparse

from domain_classifier.config import settings
from domain_classifier.models.domain import DomainDetails, DomainResult
from domain_classifier.parking import is_parking_redirect
from domain_classifier.pipeline.content import analyze_content
from domain_classifier.pipeline.dns_check import check_dns
from domain_classifier.pipeline.fetcher import fetch
Expand Down Expand Up @@ -116,47 +115,6 @@ def _is_government_domain(domain: str) -> bool:
return False


# Redirect destinations that indicate a parked domain. Landing on one of these
# hosts means the original domain shows a parking / for-sale lander instead of
# being forwarded to live content.
_PARKING_REDIRECT_HOSTS = {
    # Marketplaces, registrars and parking providers
    "above.com",
    "afternic.com",
    "bodis.com",
    "buydomains.com",
    "dan.com",
    "domainmarket.com",
    "domcop.com",
    "efty.com",
    "epik.com",
    "forsale.godaddy.com",
    "godaddy.com",
    "hugedomains.com",
    "parkingcrew.net",
    "sedo.com",
    "sedoparking.com",
    "squadhelp.com",
    "undeveloped.com",
    # Content farm / affiliate parking networks
    "dot-film.org",
    "dot-loans.org",
    "dot-software.org",
    "searchhounds.com",
    # Affiliate monetisation of abandoned / expired domains
    "gocomper.com",
    # Expired domain marketplaces
    "expireddomains.com",
    # Traffic arbitrage / search redirect networks
    "onlineresultfinder.com",
    "resultlookup.com",
    "trkflow.xyz",
    # Registrar parking (Network Solutions)
    "networksolutions.com",
}


def _is_parking_redirect(final_url: str) -> bool:
    """Return True if the redirect destination is a known parking/for-sale provider.

    The destination host is compared against ``_PARKING_REDIRECT_HOSTS`` both
    exactly (after dropping a leading ``www.``) and as a subdomain of any
    listed host. Independently of the host, a query string carrying the
    ``psystem=``, ``domain=`` and ``oref=`` markers is treated as the content
    farm affiliate signature.

    Args:
        final_url: Absolute URL the fetch ended on. An empty or malformed
            value yields False.

    Returns:
        True when the URL points at a parking destination, otherwise False.
    """
    try:
        parsed = urlparse(final_url)
        # hostname (unlike netloc) excludes userinfo and port and is already
        # lower-cased, so "https://sedo.com:443/" still matches "sedo.com".
        raw_host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
        return False

    # A trailing dot is the fully-qualified spelling of the same host.
    host = raw_host.rstrip(".").removeprefix("www.")

    # Check the host and each of its parent domains with set lookups:
    # a.b.sedo.com -> b.sedo.com -> sedo.com -> com.
    labels = host.split(".")
    if any(
        ".".join(labels[start:]) in _PARKING_REDIRECT_HOSTS
        for start in range(len(labels))
    ):
        return True

    # Content farm affiliate network signature: ?psystem=XX&domain=<original>&oref=...
    # Used by multiple dot-* domains and partner networks to route
    # expired/parked domains through SEO article landers.
    query = parsed.query
    return "psystem=" in query and "domain=" in query and "oref=" in query


async def classify_domain(domain: str) -> DomainResult:
"""Run the full classification pipeline for a single domain."""
domain = domain.strip().lower().removeprefix("https://").removeprefix("http://").split("/")[0]
Expand Down Expand Up @@ -239,7 +197,7 @@ async def classify_domain(domain: str) -> DomainResult:

if fetch_result.redirected_to_other_domain:
details.redirects_to = fetch_result.final_url
if _is_parking_redirect(fetch_result.final_url or ""):
if is_parking_redirect(fetch_result.final_url or ""):
result.classification = "parked"
result.classification_score = 1.0
result.grade = assign_grade("parked", 1.0)
Expand Down
226 changes: 226 additions & 0 deletions domain_classifier/parking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
"""
Parking domain detection — reference data and redirect classification.

PARKING_REDIRECT_HOSTS is the canonical set of domains that serve parking or
for-sale landers. If a domain's HTTP fetch resolves to one of these hosts
(exact match or subdomain), the source domain is classified as parked rather
than a live redirect.

Organised by category so additions are easy to place correctly.
"""
from __future__ import annotations

from urllib.parse import urlparse


# ---------------------------------------------------------------------------
# Reference data — parking / for-sale redirect destinations
# ---------------------------------------------------------------------------

# Marketplaces, brokers and auction sites for domain names. Expired or listed
# domains land on these hosts once they are put up for sale.
# Entries are alphabetical so duplicates are easy to spot.
_FOR_SALE_MARKETPLACES = {
    "aftermarket.pl",     # Polish domain marketplace
    "afternic.com",       # GoDaddy premium marketplace
    "atom.com",           # Short domain marketplace
    "brandbucket.com",    # Brandable domain marketplace
    "brandpa.com",        # Brandable domain marketplace
    "buydomains.com",
    "com.com",            # Domains redirected here are listed for sale
    "dan.com",            # Successor to the undeveloped.com auction arm; now part of GoDaddy
    "directdomains.com",  # Domain broker
    "domainmarket.com",
    "domainmarkt.de",     # German domain marketplace
    "domainname.ru",      # Russian domain marketplace
    "domainworld.org",    # Domain marketplace
    "domcop.com",         # Expired domain marketplace
    "dropcatch.com",      # Expired domain auction / catch service
    "dynadot.com",        # Registrar with active parking/for-sale lander
    "efty.com",           # Marketplace platform
    "epik.com",           # Registrar with active for-sale marketplace
    "expireddomains.com",
    "flippa.com",         # Domain + website marketplace
    "greatdomains.com",   # Premium domain sales
    "hugedomains.com",    # Large portfolio seller
    "odys.co",            # Aged domain marketplace
    "perfectdomain.com",  # Domain for-sale marketplace
    "sav.com",            # Domain marketplace / parking hybrid
    "saw.com",            # Domain brokerage — NOTE(review): confirm operator
    "sawbrokers.com",     # Domain broker
    "sedo.com",           # Major international marketplace
    "squadhelp.com",      # Brandable name marketplace
    # NOTE(review): uk.com also appears to act as a public suffix for
    # third-party *.uk.com sites — confirm that subdomains of it should not be
    # treated as parking destinations.
    "uk.com",             # UK domain reseller; parked domains land here
    "undeveloped.com",    # Legacy URL; kept for older redirects
    "uniregistry.com",    # Frank Schilling marketplace; acquired by GoDaddy
    "venture.com",        # Domain portfolio / parking
}

# Placeholder and for-sale landers run by registrars, hosting companies and
# generic parking providers — what a visitor sees when a domain still resolves
# but has no real content behind it. Entries are alphabetical.
_REGISTRAR_PARKING = {
    "affordablewebhosting.com",  # Hosting placeholder / parking
    "domainparking.com",         # Generic parking provider
    "expiredwixdomain.com",      # Wix expired/cancelled domain lander
    "forsale.godaddy.com",       # GoDaddy for-sale lander (subdomain of godaddy.com)
    "freenom.com",               # Free domain registrar; expired domains return here
    "godaddy.com",               # GoDaddy generic parking fallback
    "goneo.de",                  # German hosting with parked domain landers
    "home.pl",                   # Polish registrar; parked domains redirect here
    "homestead.com",             # Expired Homestead website builder domains land here
    "hostgator.com",             # Suspended HostGator accounts redirect here
    "myregisteredsite.com",      # GoDaddy "coming soon" / placeholder page
    "netsons.com",               # Italian hosting with parking landers
    "networksolutions.com",      # Web.com / Network Solutions registrar parking
    "parked.com",                # Generic parking provider
    "parking.com",               # Generic parking provider
    "parkingcrew.net",           # Parking network — NOTE(review): operator attribution unverified
    "reg.ru",                    # Russian registrar; parked domains redirect here
    "secureserver.net",          # GoDaddy / Wild West Domains hosting network; parked lander
    "sedoparking.com",           # Sedo parking landers
    "website-start.de",          # German hosting placeholder page
}

# Ad-revenue-sharing platforms that put pay-per-click landers on parked
# domains. Entries are alphabetical.
_PARKING_MONETISATION = {
    "above.com",           # Domain monetisation network
    "bodis.com",           # Domain monetisation network
    "cashparking.com",     # PPC parking
    "domainsponsor.com",   # Established PPC parking network
    "domainzaar.com",      # Domain monetisation
    # NOTE(review): "DPML" is usually expanded differently in the Donuts
    # context — confirm what this host actually serves.
    "dpml.com",            # Donuts-related monetisation host
    "namedrive.com",       # Domain parking provider
    "parkingpanther.com",  # Parking provider
    "parklogic.com",       # Parking yield optimiser
    "simcast.com",         # Domain parking service
    "smartname.com",       # Parking monetisation
    "trellian.com",        # Domain advertising / parking
    "verifymywhois.com",   # WHOIS verification / parking placeholder
    "whypark.com",         # Self-serve parking platform
}

# Search-redirect and traffic-arbitrage networks. Domains pointed here earn
# click-through ad revenue; strictly speaking they are not parked, but a
# visitor cannot tell the difference. Entries are alphabetical.
_TRAFFIC_ARBITRAGE = {
    "bestofbrowsing.com",
    "citygrid.com",            # Local business PPC / search redirect
    "findresultsonline.com",
    "findresultsquick.com",
    # NOTE(review): googleblog.com looks like Google's own blog host rather
    # than a squatter — confirm this entry is not a false positive.
    "googleblog.com",          # Redirected domains reportedly land on a PPC lander
    "himado.com",              # Search redirect / traffic arbitrage
    "hitfarm.com",             # Click-farm traffic arbitrage
    "onlineresultfinder.com",
    "perfdrive.com",           # Traffic arbitrage
    "quotes.com",              # Content farm / PPC lander
    "resultlookup.com",
    "searchhounds.com",
    "trafficvance.com",
    "trafficz.com",
    "trkflow.xyz",
}

# Content farm affiliate networks. Expired domains routed through them serve
# SEO article landers that earn affiliate commissions. Many of these hosts
# share one URL signature: ?psystem=XX&domain=<orig>&oref=...
# Each sub-group below is alphabetical.
_CONTENT_FARM_AFFILIATES = {
    # Named landers and affiliate networks
    "cdn-fileserver.com",
    "dealercarsearch.com",           # Auto dealer content farm / affiliate lander
    "dot-film.org",
    "dot-loans.org",
    "dot-software.org",
    "funtimefarms.com",              # Local-interest content farm lander
    "gocomper.com",
    "importantlocalbusinesses.com",  # Local business directory content farm
    "offcarrot.com",                 # Affiliate / content farm lander
    "registrar-transfers.com",       # Domain transfer placeholder / affiliate
    "supportcharity.com",            # Charity affiliate lander
    "survey-smiles.com",             # Survey affiliate lander
    # Low-signal / obfuscated parking network nodes
    "asry4eyw2lqk6.com",
    "contrib.com",
    "daaz.com",
    "dnfs24.com",
    "evo-media.eu",
    "haxbyq.com",
    "realtydao.com",
    "shbzek.com",
    "static.uni5.net",
    "szqxvo.com",
    "xcfss.xyz",
}

# ---------------------------------------------------------------------------
# URL-pattern parking rules
#
# Some major brands serve pages that look like acquisitions when a domain
# redirects there, but are actually parking indicators when the URL matches
# a specific pattern. Format: (host, url_substring_that_must_match).
#
# Used by is_parking_redirect() — both the host AND the substring must match.
# Add new entries below; keep them sorted by host for readability.
# ---------------------------------------------------------------------------

URL_PARKING_PATTERNS: list[tuple[str, str]] = [
    # Google Workspace / G Suite: once a domain's account has expired or been
    # deprovisioned, requests bounce to the Google sign-in page carrying
    # gws_rd=ssl. The domain is dead; this is not an acquisition.
    ("accounts.google.com", "gws_rd=ssl"),

    # Servers Australia "help.com.au": parking page a hosting provider shows
    # for expired/unclaimed customer domains, with the source passed as
    # ?d=<source>. help.com.au is itself a live hosting company, so the bare
    # redirect would read as an acquisition signal; the ?d= marker is what
    # identifies the source domain as parked.
    ("help.com.au", "?d="),

    # Wix: sites that were explicitly deactivated or parked. Ordinary Wix
    # redirects are handled as "platform" in redirect_detector; these URL
    # fragments mean the domain itself is dead and warrants grade C.
    ("wix.com", "parking?"),
    ("wix.com", "deactivate-domain"),
]


# The lookup table used for redirect classification: all category sets merged
# into a single immutable set.
PARKING_REDIRECT_HOSTS: frozenset[str] = frozenset().union(
    _FOR_SALE_MARKETPLACES,
    _REGISTRAR_PARKING,
    _PARKING_MONETISATION,
    _TRAFFIC_ARBITRAGE,
    _CONTENT_FARM_AFFILIATES,
)


# ---------------------------------------------------------------------------
# Detection
# ---------------------------------------------------------------------------

# Hosts that count as a parking destination only on an exact match (after
# stripping "www."). uk.com doubles as a public suffix under which unrelated
# third parties run their own sites (example.uk.com), so treating every
# subdomain as parking would misclassify redirects to live content.
_EXACT_MATCH_ONLY_HOSTS = frozenset({"uk.com"})


def is_parking_redirect(final_url: str) -> bool:
    """Return True if the redirect destination is a known parking or for-sale provider.

    Three independent signals are checked, in order:

    1. The destination host (userinfo, port, trailing dot and a leading
       ``www.`` ignored) is listed in ``PARKING_REDIRECT_HOSTS`` or is a
       subdomain of a listed host. Hosts in ``_EXACT_MATCH_ONLY_HOSTS`` are
       matched exactly only.
    2. The query string carries the content farm affiliate signature
       (``psystem=`` ... ``domain=`` ... ``oref=``), whatever the host.
    3. The host matches an entry of ``URL_PARKING_PATTERNS`` (exactly or as a
       subdomain) and the entry's substring occurs in the URL.

    Args:
        final_url: Absolute URL the fetch ended on. An empty or malformed
            value yields False.

    Returns:
        True when any signal marks the URL as a parking destination.
    """
    try:
        parsed = urlparse(final_url)
        # hostname (unlike netloc) excludes userinfo and port and is already
        # lower-cased, so "https://sedo.com:443/" still matches "sedo.com".
        raw_host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets);
        # a URL that cannot be parsed cannot be identified as parking.
        return False

    # A trailing dot is the fully-qualified spelling of the same host.
    host = raw_host.rstrip(".").removeprefix("www.")

    if host in PARKING_REDIRECT_HOSTS:
        return True

    # Subdomain match via set lookups on each parent domain:
    # a.b.sedo.com -> b.sedo.com -> sedo.com -> com.
    labels = host.split(".")
    for start in range(1, len(labels)):
        parent = ".".join(labels[start:])
        if parent in PARKING_REDIRECT_HOSTS and parent not in _EXACT_MATCH_ONLY_HOSTS:
            return True

    # Content farm affiliate signature shared across dot-* domains and partners:
    # ?psystem=XX&domain=<original>&oref=...
    query = parsed.query
    if "psystem=" in query and "domain=" in query and "oref=" in query:
        return True

    # URL-pattern rules: major-brand hosts that serve parking/dead-account pages
    # at specific URLs. Both the host and the substring must match.
    return any(
        pattern_substr in final_url
        and (host == pattern_host or host.endswith("." + pattern_host))
        for pattern_host, pattern_substr in URL_PARKING_PATTERNS
    )
Loading