Enhanced scout_logic.py with improved browser configuration to bypass bot detection, added URL normalization functions, and implemented robust error handling for fetching links.
This commit is contained in:
parent
6e813daf69
commit
afee46933f
|
|
@ -13,21 +13,95 @@ from playwright.async_api import async_playwright
|
|||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
|
||||
|
||||
# Browser-Konfiguration gegen Bot-Detection (z. B. McKinsey) und HTTP/2-Fehler
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
|
||||
)
|
||||
EXTRA_HEADERS = {
|
||||
"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
|
||||
"Referer": "https://www.google.com/",
|
||||
}
|
||||
VIEWPORT = {"width": 1920, "height": 1080}
|
||||
GOTO_TIMEOUT = 60_000 # ms
|
||||
|
||||
|
||||
def _normalize_domain(domain: str) -> str:
|
||||
"""Extrahiert die reine Domain (Host) ohne Schema und Pfad."""
|
||||
s = domain.strip()
|
||||
if s.startswith("http://"):
|
||||
s = s[7:]
|
||||
elif s.startswith("https://"):
|
||||
s = s[8:]
|
||||
if "/" in s:
|
||||
s = s.split("/", 1)[0]
|
||||
return s.lower() or domain
|
||||
|
||||
|
||||
def _url_with_www(domain: str) -> str:
    """Build the canonical ``https://www.{host}`` URL for uniform fetching.

    Fix: strip an already-present ``www.`` prefix from the normalized
    host first, so input like ``"www.mckinsey.com"`` no longer yields
    the broken ``https://www.www.mckinsey.com``. Behavior for hosts
    without a ``www.`` prefix is unchanged.
    """
    host = _normalize_domain(domain).removeprefix("www.")
    return f"https://www.{host}"
|
||||
|
||||
|
||||
def _url_without_www(domain: str) -> str:
    """Build the ``https://{host}`` fallback URL without a ``www`` prefix.

    Fix: an existing ``www.`` prefix is now removed from the normalized
    host. Previously, passing ``"www.example.com"`` returned a URL that
    still contained ``www.``, making this "without www" fallback
    identical in effect to the primary www URL and therefore useless.
    """
    host = _normalize_domain(domain).removeprefix("www.")
    return f"https://{host}"
|
||||
|
||||
|
||||
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||
"""
|
||||
Lädt die Startseite der Domain mit Playwright (headless) und
|
||||
extrahiert alle <a>-Tags (Text und Href).
|
||||
Lädt die Startseite der Domain mit Playwright (headless),
|
||||
mit Browser-Kontext gegen Bot-Detection, und extrahiert alle <a>-Tags.
|
||||
Bei Fehler: Fallback ohne www bzw. mit domcontentloaded.
|
||||
"""
|
||||
# Domain mit Schema normalisieren
|
||||
url = domain if domain.startswith("http") else f"https://{domain}"
|
||||
# Immer einheitliches URL-Format: https://www.{domain}
|
||||
url_primary = _url_with_www(domain)
|
||||
url_fallback = _url_without_www(domain)
|
||||
links: list[dict[str, str]] = []
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
try:
|
||||
page = await browser.new_page()
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=15000)
|
||||
# Kontext mit echtem User-Agent, Headers und Viewport (Bot-Detection umgehen)
|
||||
context = await browser.new_context(
|
||||
user_agent=USER_AGENT,
|
||||
extra_http_headers=EXTRA_HEADERS,
|
||||
viewport=VIEWPORT,
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
# Versuch 1: https://www.{domain} mit networkidle (60s)
|
||||
try:
|
||||
await page.goto(
|
||||
url_primary,
|
||||
wait_until="networkidle",
|
||||
timeout=GOTO_TIMEOUT,
|
||||
)
|
||||
except Exception:
|
||||
# Versuch 2: ohne www
|
||||
try:
|
||||
await page.goto(
|
||||
url_fallback,
|
||||
wait_until="networkidle",
|
||||
timeout=GOTO_TIMEOUT,
|
||||
)
|
||||
except Exception:
|
||||
# Versuch 3: mit domcontentloaded (schneller, robuster bei instabiler Verbindung)
|
||||
try:
|
||||
await page.goto(
|
||||
url_primary,
|
||||
wait_until="domcontentloaded",
|
||||
timeout=30_000,
|
||||
)
|
||||
except Exception:
|
||||
await page.goto(
|
||||
url_fallback,
|
||||
wait_until="domcontentloaded",
|
||||
timeout=30_000,
|
||||
)
|
||||
|
||||
# Alle <a href="..."> auslesen
|
||||
links = await page.evaluate(
|
||||
"""() => {
|
||||
|
|
@ -38,6 +112,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
|||
})).filter(x => x.href);
|
||||
}"""
|
||||
)
|
||||
await context.close()
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
|
|
@ -60,9 +135,9 @@ def _make_absolute(href: str, base_url: str) -> str:
|
|||
async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None:
|
||||
"""
|
||||
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
|
||||
Erwartet Antwort im Format: {"url": "..."}
|
||||
Erwartet Antwort im Format: {"url": "..."} – unverändert.
|
||||
"""
|
||||
base_url = domain if domain.startswith("http") else f"https://{domain}"
|
||||
base_url = _url_with_www(domain)
|
||||
prompt = (
|
||||
"Analysiere diese Links einer Unternehmensberatung. "
|
||||
"Welcher Link führt zur Seite mit Reports, Insights oder Fachartikeln? "
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user