"""
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
"""
import json
import os
import re
from typing import Any
from urllib.parse import urlparse
import httpx
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
# OpenRouter base URL and default model
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
# Browser configuration to work around bot detection (e.g. McKinsey) and HTTP/2 errors
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
)
EXTRA_HEADERS = {
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
"Referer": "https://www.google.com/",
}
VIEWPORT = {"width": 1920, "height": 1080}
GOTO_TIMEOUT = 60_000 # ms
def _normalize_domain(domain: str) -> str:
    """Extract the bare domain (host) without scheme, "www." prefix, or path."""
    s = domain.strip()
    if s.startswith("http://"):
        s = s[7:]
    elif s.startswith("https://"):
        s = s[8:]
    if s.startswith("www."):
        s = s[4:]  # Avoid "www.www." when _url_with_www() re-adds the prefix
    if "/" in s:
        s = s.split("/", 1)[0]
    return s.lower() or domain
def _url_with_www(domain: str) -> str:
"""Immer https://www.{domain} für einheitlichen Aufruf."""
host = _normalize_domain(domain)
return f"https://www.{host}"
def _url_without_www(domain: str) -> str:
"""https://{domain} für Fallback ohne www."""
host = _normalize_domain(domain)
return f"https://{host}"
async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict[str, str]]:
"""
Interne Implementierung: Browser starten, Stealth anwenden, Zielseite laden, Links extrahieren.
Bei disable_http2=True wird Chromium mit args=['--disable-http2'] gestartet (Fallback bei ERR_HTTP2_PROTOCOL_ERROR).
"""
url_primary = _url_with_www(domain)
    url_insights = f"{url_primary.rstrip('/')}/featured-insights"  # Straight to the insights page (e.g. McKinsey)
url_fallback = _url_without_www(domain)
links: list[dict[str, str]] = []
launch_options: dict[str, Any] = {"headless": True}
if disable_http2:
launch_options["args"] = ["--disable-http2"]
async with async_playwright() as p:
browser = await p.chromium.launch(**launch_options)
try:
            # Context: realistic user agent, JavaScript on, headers, viewport (to evade bot detection)
context = await browser.new_context(
user_agent=USER_AGENT,
extra_http_headers=EXTRA_HEADERS,
viewport=VIEWPORT,
java_script_enabled=True,
)
            # Apply stealth to the context (API 2.x: apply_stealth_async(context))
await Stealth().apply_stealth_async(context)
page = await context.new_page()
            # Attempt 1: go straight to the target page (e.g. https://www.mckinsey.com/featured-insights)
try:
await page.goto(
url_insights,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
                # Attempt 2: home page with www
try:
await page.goto(
url_primary,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
                    # Attempt 3: without www
try:
await page.goto(
url_fallback,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
                        # Attempt 4: domcontentloaded as the last resort
try:
await page.goto(
url_primary,
wait_until="domcontentloaded",
timeout=30_000,
)
except Exception:
await page.goto(
url_fallback,
wait_until="domcontentloaded",
timeout=30_000,
)
            # Read out all <a> tags with their text and href
links = await page.evaluate(
"""() => {
const anchors = document.querySelectorAll('a[href]');
return Array.from(anchors).map(a => ({
text: (a.textContent || '').trim().slice(0, 200),
href: a.getAttribute('href') || ''
})).filter(x => x.href);
}"""
)
await context.close()
finally:
await browser.close()
return links
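# Shape of the extracted links (illustrative example):
#   [{"text": "Featured Insights", "href": "/featured-insights"}, ...]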
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
"""
Lädt die Domain mit Playwright (Stealth, Kontext gegen Bot-Detection), extrahiert -Tags.
Bei ERR_HTTP2_PROTOCOL_ERROR: erneuter Versuch mit --disable-http2.
"""
try:
return await _fetch_links_impl(domain, disable_http2=False)
except Exception as e:
err_msg = str(e)
if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
return await _fetch_links_impl(domain, disable_http2=True)
raise
def _make_absolute(href: str, base_url: str) -> str:
"""Macht relative URLs absolut (einfache Heuristik)."""
if not href or href.startswith("#"):
return ""
if href.startswith("http://") or href.startswith("https://"):
return href
base = base_url.rstrip("/")
if href.startswith("/"):
parsed = urlparse(base)
return f"{parsed.scheme}://{parsed.netloc}{href}"
return f"{base}/{href}"
async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None:
"""
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
Erwartet Antwort im Format: {"url": "..."} – unverändert.
"""
base_url = _url_with_www(domain)
    prompt = (
        "Analyze these links from a consulting firm's website. "
        "Which link leads to the page with reports, insights, or expert articles? "
        'Answer ONLY with the absolute URL in JSON format: {"url": "..."}'
    )
links_text = "\n".join(
f"- {l.get('text', '')} -> {_make_absolute(l.get('href', ''), base_url)}"
        for l in links[:80]  # Cap the list to stay within the token budget
)
user_content = f"Links von {domain}:\n{links_text}"
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{OPENROUTER_BASE}/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"HTTP-Referer": base_url,
},
json={
"model": DEFAULT_MODEL,
"messages": [
{"role": "system", "content": prompt},
{"role": "user", "content": user_content},
],
"max_tokens": 256,
},
)
resp.raise_for_status()
data = resp.json()
choice = (data.get("choices") or [None])[0]
if not choice:
return None
content = (choice.get("message") or {}).get("content") or ""
if not content.strip():
return None
        # Extract the JSON from the answer (in case it is wrapped in Markdown)
content = content.strip()
json_match = re.search(r"\{[^{}]*\"url\"[^{}]*\}", content)
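        # Example: for content '```json\n{"url": "https://..."}\n```' the regex
        # matches the inner {"url": ...} object despite the Markdown fence.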
if json_match:
try:
obj = json.loads(json_match.group())
return (obj.get("url") or "").strip() or None
except json.JSONDecodeError:
pass
try:
obj = json.loads(content)
return (obj.get("url") or "").strip() or None
except json.JSONDecodeError:
return None
async def get_publication_url(domain: str, *, api_key: str | None = None) -> dict[str, Any]:
"""
Hauptfunktion: Domain scannen, Links an OpenRouter senden,
gefundene Publikations-URL zurückgeben.
"""
    key = api_key or os.getenv("OPENROUTER_API_KEY")
    if not key:
        return {"url": None, "error": "OPENROUTER_API_KEY is not set"}
try:
links = await _fetch_links_with_playwright(domain)
    except Exception as e:
        return {"url": None, "error": f"Playwright/scrape error: {e!s}"}
    if not links:
        return {"url": None, "error": "No links found on the page"}
try:
url = await _ask_openrouter(key, links, domain)
return {"url": url, "error": None}
    except Exception as e:
        return {"url": None, "error": f"OpenRouter error: {e!s}"}