Enhanced scout_logic.py with improved browser configuration to bypass bot detection, added URL normalization functions, and implemented robust error handling for fetching links.
This commit is contained in:
parent
6e813daf69
commit
afee46933f
|
|
@ -13,21 +13,95 @@ from playwright.async_api import async_playwright
|
||||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"

# Browser configuration to bypass bot detection (e.g. McKinsey) and HTTP/2 errors.
# A realistic desktop Chrome fingerprint: UA string, common Accept-Language /
# Referer headers, and a full-HD viewport.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
)
EXTRA_HEADERS = {
    "Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://www.google.com/",
}
VIEWPORT = {"width": 1920, "height": 1080}
GOTO_TIMEOUT = 60_000  # ms — generous first-attempt page.goto timeout
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_domain(domain: str) -> str:
|
||||||
|
"""Extrahiert die reine Domain (Host) ohne Schema und Pfad."""
|
||||||
|
s = domain.strip()
|
||||||
|
if s.startswith("http://"):
|
||||||
|
s = s[7:]
|
||||||
|
elif s.startswith("https://"):
|
||||||
|
s = s[8:]
|
||||||
|
if "/" in s:
|
||||||
|
s = s.split("/", 1)[0]
|
||||||
|
return s.lower() or domain
|
||||||
|
|
||||||
|
|
||||||
|
def _url_with_www(domain: str) -> str:
    """Always return ``https://www.{host}`` for a uniform primary URL.

    Guards against double prefixes: the original blindly prepended
    ``www.`` and produced ``https://www.www.example.com`` when the
    caller already supplied a www host.
    """
    host = _normalize_domain(domain)
    if not host.startswith("www."):
        host = f"www.{host}"
    return f"https://{host}"
|
||||||
|
|
||||||
|
|
||||||
|
def _url_without_www(domain: str) -> str:
    """Return ``https://{host}`` without a ``www.`` prefix as fallback.

    The original kept an existing ``www.`` prefix, so the "without www"
    fallback could retry the exact same host that just failed; strip it
    explicitly (str.removeprefix, Python 3.9+ — the file already uses
    3.9+ ``list[dict[...]]`` generics).
    """
    host = _normalize_domain(domain).removeprefix("www.")
    return f"https://{host}"
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
"""
|
"""
|
||||||
Lädt die Startseite der Domain mit Playwright (headless) und
|
Lädt die Startseite der Domain mit Playwright (headless),
|
||||||
extrahiert alle <a>-Tags (Text und Href).
|
mit Browser-Kontext gegen Bot-Detection, und extrahiert alle <a>-Tags.
|
||||||
|
Bei Fehler: Fallback ohne www bzw. mit domcontentloaded.
|
||||||
"""
|
"""
|
||||||
# Domain mit Schema normalisieren
|
# Immer einheitliches URL-Format: https://www.{domain}
|
||||||
url = domain if domain.startswith("http") else f"https://{domain}"
|
url_primary = _url_with_www(domain)
|
||||||
|
url_fallback = _url_without_www(domain)
|
||||||
links: list[dict[str, str]] = []
|
links: list[dict[str, str]] = []
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=True)
|
browser = await p.chromium.launch(headless=True)
|
||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
# Kontext mit echtem User-Agent, Headers und Viewport (Bot-Detection umgehen)
|
||||||
await page.goto(url, wait_until="domcontentloaded", timeout=15000)
|
context = await browser.new_context(
|
||||||
|
user_agent=USER_AGENT,
|
||||||
|
extra_http_headers=EXTRA_HEADERS,
|
||||||
|
viewport=VIEWPORT,
|
||||||
|
)
|
||||||
|
page = await context.new_page()
|
||||||
|
|
||||||
|
# Versuch 1: https://www.{domain} mit networkidle (60s)
|
||||||
|
try:
|
||||||
|
await page.goto(
|
||||||
|
url_primary,
|
||||||
|
wait_until="networkidle",
|
||||||
|
timeout=GOTO_TIMEOUT,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Versuch 2: ohne www
|
||||||
|
try:
|
||||||
|
await page.goto(
|
||||||
|
url_fallback,
|
||||||
|
wait_until="networkidle",
|
||||||
|
timeout=GOTO_TIMEOUT,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Versuch 3: mit domcontentloaded (schneller, robuster bei instabiler Verbindung)
|
||||||
|
try:
|
||||||
|
await page.goto(
|
||||||
|
url_primary,
|
||||||
|
wait_until="domcontentloaded",
|
||||||
|
timeout=30_000,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
await page.goto(
|
||||||
|
url_fallback,
|
||||||
|
wait_until="domcontentloaded",
|
||||||
|
timeout=30_000,
|
||||||
|
)
|
||||||
|
|
||||||
# Alle <a href="..."> auslesen
|
# Alle <a href="..."> auslesen
|
||||||
links = await page.evaluate(
|
links = await page.evaluate(
|
||||||
"""() => {
|
"""() => {
|
||||||
|
|
@ -38,6 +112,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
})).filter(x => x.href);
|
})).filter(x => x.href);
|
||||||
}"""
|
}"""
|
||||||
)
|
)
|
||||||
|
await context.close()
|
||||||
finally:
|
finally:
|
||||||
await browser.close()
|
await browser.close()
|
||||||
|
|
||||||
|
|
@ -60,9 +135,9 @@ def _make_absolute(href: str, base_url: str) -> str:
|
||||||
async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None:
|
async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None:
|
||||||
"""
|
"""
|
||||||
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
|
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
|
||||||
Erwartet Antwort im Format: {"url": "..."}
|
Erwartet Antwort im Format: {"url": "..."} – unverändert.
|
||||||
"""
|
"""
|
||||||
base_url = domain if domain.startswith("http") else f"https://{domain}"
|
base_url = _url_with_www(domain)
|
||||||
prompt = (
|
prompt = (
|
||||||
"Analysiere diese Links einer Unternehmensberatung. "
|
"Analysiere diese Links einer Unternehmensberatung. "
|
||||||
"Welcher Link führt zur Seite mit Reports, Insights oder Fachartikeln? "
|
"Welcher Link führt zur Seite mit Reports, Insights oder Fachartikeln? "
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user