diff --git a/src/scout_logic.py b/src/scout_logic.py
index c260f05..af0fd70 100644
--- a/src/scout_logic.py
+++ b/src/scout_logic.py
@@ -12,9 +12,10 @@ from urllib.parse import urlparse
 import httpx
 from playwright.async_api import async_playwright
-logger = logging.getLogger(__name__)
 from playwright_stealth import Stealth
+logger = logging.getLogger(__name__)
+
 # OpenRouter Base-URL und Modell
 OPENROUTER_BASE = "https://openrouter.ai/api/v1"
 DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
@@ -29,10 +30,11 @@ EXTRA_HEADERS = {
     "Referer": "https://www.google.com/",
 }
 VIEWPORT = {"width": 1920, "height": 1080}
-# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
-GOTO_TIMEOUT = 25_000  # ms
-# Harter Gesamt-Timeout für Playwright (falls goto trotzdem hängt, z. B. bei McKinsey)
-PLAYWRIGHT_TOTAL_TIMEOUT = 80.0  # Sekunden (reicht für ~3 URLs × 25s)
+# Timeouts: commit = Antwort empfangen (schnell), domcontentloaded = DOM bereit (kann bei McKinsey hängen)
+GOTO_TIMEOUT_COMMIT = 15_000  # ms – erstes Versuchen mit wait_until="commit"
+GOTO_TIMEOUT_DOM = 25_000  # ms – Fallback domcontentloaded
+# Harter Gesamt-Timeout für Playwright
+PLAYWRIGHT_TOTAL_TIMEOUT = 80.0  # Sekunden


 def _normalize_domain(domain: str) -> str:
@@ -87,24 +89,36 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict]:
         await Stealth().apply_stealth_async(context)
         page = await context.new_page()

-        # domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
-        async def _goto(url: str) -> bool:
+        # Zuerst "commit" (Antwort empfangen) – oft schneller, reicht für HTML mit Links (McKinsey blockiert oft domcontentloaded)
+        async def _goto(url: str, wait: str = "commit", timeout_ms: int = GOTO_TIMEOUT_COMMIT) -> bool:
             try:
-                logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
-                await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
+                logger.info("Playwright: trying %s (wait=%s, timeout=%ds)", url, wait, timeout_ms // 1000)
+                await page.goto(url, wait_until=wait, timeout=timeout_ms)
                 return True
             except Exception as e:
-                logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
+                err_msg = str(e).strip()[:120] if str(e) else type(e).__name__
+                logger.warning("Playwright: failed %s: %s", url, err_msg)
                 return False

+        loaded = False
         if await _goto(url_insights):
-            logger.info("Playwright: loaded %s", url_insights)
-        elif await _goto(url_primary):
-            logger.info("Playwright: loaded %s", url_primary)
-        elif await _goto(url_fallback):
-            logger.info("Playwright: loaded %s", url_fallback)
-        else:
-            logger.warning("Playwright: all URLs failed, last try with 35s")
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_insights)
+        if not loaded and await _goto(url_primary):
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_primary)
+        if not loaded and await _goto(url_fallback):
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_fallback)
+        # Fallback: domcontentloaded mit längerem Timeout (falls commit zu wenig HTML liefert)
+        if not loaded:
+            logger.info("Playwright: fallback to domcontentloaded (25s)")
+            if await _goto(url_primary, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
+                loaded = True
+            elif await _goto(url_fallback, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
+                loaded = True
+        if not loaded:
+            logger.warning("Playwright: all URLs failed, last try 35s")
             try:
                 await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
             except Exception: