Added playwright-stealth dependency and refactored link fetching logic in scout_logic.py to enhance bot detection evasion and implement HTTP/2 fallback handling.
This commit is contained in:
parent
afee46933f
commit
9c5f769455
|
|
@ -6,3 +6,4 @@ beautifulsoup4>=4.12.0
|
||||||
httpx>=0.26.0
|
httpx>=0.26.0
|
||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
pydantic>=2.5.0
|
pydantic>=2.5.0
|
||||||
|
playwright-stealth
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
"""
|
"""
|
||||||
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
|
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
|
||||||
|
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
@ -8,6 +9,7 @@ from urllib.parse import urlparse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
from playwright_stealth import stealth_async
|
||||||
|
|
||||||
# OpenRouter Base-URL und Modell
|
# OpenRouter Base-URL und Modell
|
||||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||||
|
|
@ -50,29 +52,43 @@ def _url_without_www(domain: str) -> str:
|
||||||
return f"https://{host}"
|
return f"https://{host}"
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict[str, str]]:
|
||||||
"""
|
"""
|
||||||
Lädt die Startseite der Domain mit Playwright (headless),
|
Interne Implementierung: Browser starten, Stealth anwenden, Zielseite laden, Links extrahieren.
|
||||||
mit Browser-Kontext gegen Bot-Detection, und extrahiert alle <a>-Tags.
|
Bei disable_http2=True wird Chromium mit args=['--disable-http2'] gestartet (Fallback bei ERR_HTTP2_PROTOCOL_ERROR).
|
||||||
Bei Fehler: Fallback ohne www bzw. mit domcontentloaded.
|
|
||||||
"""
|
"""
|
||||||
# Immer einheitliches URL-Format: https://www.{domain}
|
|
||||||
url_primary = _url_with_www(domain)
|
url_primary = _url_with_www(domain)
|
||||||
|
url_insights = f"{url_primary.rstrip('/')}/featured-insights" # Direkt zur Insights-Seite (z. B. McKinsey)
|
||||||
url_fallback = _url_without_www(domain)
|
url_fallback = _url_without_www(domain)
|
||||||
links: list[dict[str, str]] = []
|
links: list[dict[str, str]] = []
|
||||||
|
|
||||||
|
launch_options: dict[str, Any] = {"headless": True}
|
||||||
|
if disable_http2:
|
||||||
|
launch_options["args"] = ["--disable-http2"]
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=True)
|
browser = await p.chromium.launch(**launch_options)
|
||||||
try:
|
try:
|
||||||
# Kontext mit echtem User-Agent, Headers und Viewport (Bot-Detection umgehen)
|
# Kontext: aktueller User-Agent, JavaScript an, Headers, Viewport (Bot-Detection umgehen)
|
||||||
context = await browser.new_context(
|
context = await browser.new_context(
|
||||||
user_agent=USER_AGENT,
|
user_agent=USER_AGENT,
|
||||||
extra_http_headers=EXTRA_HEADERS,
|
extra_http_headers=EXTRA_HEADERS,
|
||||||
viewport=VIEWPORT,
|
viewport=VIEWPORT,
|
||||||
|
java_script_enabled=True,
|
||||||
)
|
)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
# Stealth – entscheidend gegen extreme Bot-Detection
|
||||||
|
await stealth_async(page)
|
||||||
|
|
||||||
# Versuch 1: https://www.{domain} mit networkidle (60s)
|
# Versuch 1: Direkt zur Zielseite (z. B. https://www.mckinsey.com/featured-insights)
|
||||||
|
try:
|
||||||
|
await page.goto(
|
||||||
|
url_insights,
|
||||||
|
wait_until="networkidle",
|
||||||
|
timeout=GOTO_TIMEOUT,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Versuch 2: Startseite mit www
|
||||||
try:
|
try:
|
||||||
await page.goto(
|
await page.goto(
|
||||||
url_primary,
|
url_primary,
|
||||||
|
|
@ -80,7 +96,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
timeout=GOTO_TIMEOUT,
|
timeout=GOTO_TIMEOUT,
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Versuch 2: ohne www
|
# Versuch 3: ohne www
|
||||||
try:
|
try:
|
||||||
await page.goto(
|
await page.goto(
|
||||||
url_fallback,
|
url_fallback,
|
||||||
|
|
@ -88,7 +104,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
timeout=GOTO_TIMEOUT,
|
timeout=GOTO_TIMEOUT,
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
# Versuch 3: mit domcontentloaded (schneller, robuster bei instabiler Verbindung)
|
# Versuch 4: domcontentloaded als letzter Fallback
|
||||||
try:
|
try:
|
||||||
await page.goto(
|
await page.goto(
|
||||||
url_primary,
|
url_primary,
|
||||||
|
|
@ -119,6 +135,20 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
    """
    Load the domain with Playwright and return the links extracted from it.

    A first attempt delegates to ``_fetch_links_impl`` with HTTP/2 left
    enabled. If that attempt raises and the error text contains either
    ``ERR_HTTP2_PROTOCOL_ERROR`` or the generic ``net::ERR_`` prefix, the
    fetch is repeated once with ``disable_http2=True``. Any other exception
    is re-raised unchanged; an exception from the retry propagates as well.
    """
    # Substrings of the error message that trigger the HTTP/2-disabled retry.
    retry_markers = ("ERR_HTTP2_PROTOCOL_ERROR", "net::ERR_")

    try:
        return await _fetch_links_impl(domain, disable_http2=False)
    except Exception as exc:
        message = str(exc)
        if not any(marker in message for marker in retry_markers):
            raise
        # Second and final attempt, this time with HTTP/2 switched off.
        return await _fetch_links_impl(domain, disable_http2=True)
|
||||||
|
|
||||||
|
|
||||||
def _make_absolute(href: str, base_url: str) -> str:
|
def _make_absolute(href: str, base_url: str) -> str:
|
||||||
"""Macht relative URLs absolut (einfache Heuristik)."""
|
"""Macht relative URLs absolut (einfache Heuristik)."""
|
||||||
if not href or href.startswith("#"):
|
if not href or href.startswith("#"):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user