From 9c5f7694552ba1338040371dc65e1706686afc3b Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 31 Jan 2026 18:08:40 +0100 Subject: [PATCH] Added playwright-stealth dependency and refactored link fetching logic in scout_logic.py to enhance bot detection evasion and implement HTTP/2 fallback handling. --- requirements.txt | 1 + src/scout_logic.py | 70 +++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/requirements.txt b/requirements.txt index 67eaf9f..f37adee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ beautifulsoup4>=4.12.0 httpx>=0.26.0 python-dotenv>=1.0.0 pydantic>=2.5.0 +playwright-stealth diff --git a/src/scout_logic.py b/src/scout_logic.py index eab8aef..aa1af26 100644 --- a/src/scout_logic.py +++ b/src/scout_logic.py @@ -1,5 +1,6 @@ """ Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren. +Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection. """ import json import re @@ -8,6 +9,7 @@ from urllib.parse import urlparse import httpx from playwright.async_api import async_playwright +from playwright_stealth import stealth_async # OpenRouter Base-URL und Modell OPENROUTER_BASE = "https://openrouter.ai/api/v1" @@ -50,57 +52,71 @@ def _url_without_www(domain: str) -> str: return f"https://{host}" -async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]: +async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict[str, str]]: """ - Lädt die Startseite der Domain mit Playwright (headless), - mit Browser-Kontext gegen Bot-Detection, und extrahiert alle -Tags. - Bei Fehler: Fallback ohne www bzw. mit domcontentloaded. + Interne Implementierung: Browser starten, Stealth anwenden, Zielseite laden, Links extrahieren. + Bei disable_http2=True wird Chromium mit args=['--disable-http2'] gestartet (Fallback bei ERR_HTTP2_PROTOCOL_ERROR). """ - # Immer einheitliches URL-Format: https://www.{domain} url_primary = _url_with_www(domain) + url_insights = f"{url_primary.rstrip('/')}/featured-insights" # Direkt zur Insights-Seite (z. B. McKinsey) url_fallback = _url_without_www(domain) links: list[dict[str, str]] = [] + launch_options: dict[str, Any] = {"headless": True} + if disable_http2: + launch_options["args"] = ["--disable-http2"] + async with async_playwright() as p: - browser = await p.chromium.launch(headless=True) + browser = await p.chromium.launch(**launch_options) try: - # Kontext mit echtem User-Agent, Headers und Viewport (Bot-Detection umgehen) + # Kontext: aktueller User-Agent, JavaScript an, Headers, Viewport (Bot-Detection umgehen) context = await browser.new_context( user_agent=USER_AGENT, extra_http_headers=EXTRA_HEADERS, viewport=VIEWPORT, + java_script_enabled=True, ) page = await context.new_page() + # Stealth – entscheidend gegen extreme Bot-Detection + await stealth_async(page) - # Versuch 1: https://www.{domain} mit networkidle (60s) + # Versuch 1: Direkt zur Zielseite (z. B. https://www.mckinsey.com/featured-insights) try: await page.goto( - url_primary, + url_insights, wait_until="networkidle", timeout=GOTO_TIMEOUT, ) except Exception: - # Versuch 2: ohne www + # Versuch 2: Startseite mit www try: await page.goto( - url_fallback, + url_primary, wait_until="networkidle", timeout=GOTO_TIMEOUT, ) except Exception: - # Versuch 3: mit domcontentloaded (schneller, robuster bei instabiler Verbindung) + # Versuch 3: ohne www try: await page.goto( - url_primary, - wait_until="domcontentloaded", - timeout=30_000, + url_fallback, + wait_until="networkidle", + timeout=GOTO_TIMEOUT, ) except Exception: - await page.goto( - url_fallback, - wait_until="domcontentloaded", - timeout=30_000, - ) + # Versuch 4: domcontentloaded als letzter Fallback + try: + await page.goto( + url_primary, + wait_until="domcontentloaded", + timeout=30_000, + ) + except Exception: + await page.goto( + url_fallback, + wait_until="domcontentloaded", + timeout=30_000, + ) # Alle auslesen links = await page.evaluate( @@ -119,6 +135,20 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]: return links +async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]: + """ + Lädt die Domain mit Playwright (Stealth, Kontext gegen Bot-Detection), extrahiert -Tags. + Bei ERR_HTTP2_PROTOCOL_ERROR: erneuter Versuch mit --disable-http2. + """ + try: + return await _fetch_links_impl(domain, disable_http2=False) + except Exception as e: + err_msg = str(e) + if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg: + return await _fetch_links_impl(domain, disable_http2=True) + raise + + def _make_absolute(href: str, base_url: str) -> str: """Macht relative URLs absolut (einfache Heuristik).""" if not href or href.startswith("#"):