diff --git a/requirements.txt b/requirements.txt
index 67eaf9f..f37adee 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ beautifulsoup4>=4.12.0
httpx>=0.26.0
python-dotenv>=1.0.0
pydantic>=2.5.0
+playwright-stealth>=1.0.6,<2
diff --git a/src/scout_logic.py b/src/scout_logic.py
index eab8aef..aa1af26 100644
--- a/src/scout_logic.py
+++ b/src/scout_logic.py
@@ -1,5 +1,7 @@
"""
 Scout logic: scan a domain, extract its links, and identify the publication URL via OpenRouter.
+With stealth patches and an HTTP/2 fallback against aggressive bot detection.
"""
import json
 import re
+from typing import Any
@@ -8,6 +10,7 @@ from urllib.parse import urlparse
import httpx
from playwright.async_api import async_playwright
+from playwright_stealth import stealth_async
 # OpenRouter base URL and model
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
@@ -50,57 +53,74 @@ def _url_without_www(domain: str) -> str:
return f"https://{host}"
-async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
+async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict[str, str]]:
"""
-    Loads the domain's start page with Playwright (headless),
-    with a browser context to evade bot detection, and extracts all <a> tags.
-    On failure: falls back to the non-www URL or to domcontentloaded.
+    Internal implementation: launch the browser, apply stealth, load the target page, extract links.
+    With disable_http2=True, Chromium is launched with args=['--disable-http2'] (fallback for ERR_HTTP2_PROTOCOL_ERROR).
"""
-    # Always use a uniform URL format: https://www.{domain}
url_primary = _url_with_www(domain)
+    url_insights = f"{url_primary.rstrip('/')}/featured-insights"  # Straight to the insights page (e.g. McKinsey)
url_fallback = _url_without_www(domain)
links: list[dict[str, str]] = []
+ launch_options: dict[str, Any] = {"headless": True}
+ if disable_http2:
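+        # --disable-http2 makes Chromium negotiate HTTP/1.1 instead, avoiding servers that reset HTTP/2 streams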
+ launch_options["args"] = ["--disable-http2"]
+
async with async_playwright() as p:
- browser = await p.chromium.launch(headless=True)
+ browser = await p.chromium.launch(**launch_options)
try:
-            # Context with a real user agent, headers and viewport (to evade bot detection)
+            # Context: current user agent, JavaScript on, headers, viewport (to evade bot detection)
context = await browser.new_context(
user_agent=USER_AGENT,
extra_http_headers=EXTRA_HEADERS,
viewport=VIEWPORT,
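+                # Playwright contexts enable JavaScript by default; made explicit here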
+ java_script_enabled=True,
)
page = await context.new_page()
+            # Apply stealth patches: crucial against aggressive bot detection
+ await stealth_async(page)
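+            # Four goto attempts, from the most specific URL to the most forgiving wait strategy: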
-            # Attempt 1: https://www.{domain} with networkidle (60s)
+            # Attempt 1: go straight to the target page (e.g. https://www.mckinsey.com/featured-insights)
try:
await page.goto(
- url_primary,
+ url_insights,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
-                # Attempt 2: without www
+                # Attempt 2: home page with www
try:
await page.goto(
- url_fallback,
+ url_primary,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
-                    # Attempt 3: with domcontentloaded (faster, more robust on unstable connections)
+                    # Attempt 3: without www
try:
await page.goto(
- url_primary,
- wait_until="domcontentloaded",
- timeout=30_000,
+ url_fallback,
+ wait_until="networkidle",
+ timeout=GOTO_TIMEOUT,
)
except Exception:
- await page.goto(
- url_fallback,
- wait_until="domcontentloaded",
- timeout=30_000,
- )
+                        # Attempt 4: domcontentloaded as the last fallback
+ try:
+ await page.goto(
+ url_primary,
+ wait_until="domcontentloaded",
+ timeout=30_000,
+ )
+ except Exception:
+ await page.goto(
+ url_fallback,
+ wait_until="domcontentloaded",
+ timeout=30_000,
+ )
             # Extract all <a> tags
links = await page.evaluate(
@@ -119,6 +139,21 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
return links
+async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
+ """
+    Loads the domain with Playwright (stealth, context against bot detection) and extracts <a> tags.
+    On Chromium network errors (e.g. ERR_HTTP2_PROTOCOL_ERROR): retries once with --disable-http2.
+ """
+ try:
+ return await _fetch_links_impl(domain, disable_http2=False)
+ except Exception as e:
+ err_msg = str(e)
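+        # "net::ERR_" matches any Chromium network-layer error; retry once with HTTP/2 disabled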
+ if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
+ return await _fetch_links_impl(domain, disable_http2=True)
+ raise
+
+
def _make_absolute(href: str, base_url: str) -> str:
"""Macht relative URLs absolut (einfache Heuristik)."""
if not href or href.startswith("#"):