From beb80e9eaf86cd408815c321c3606f554657941e Mon Sep 17 00:00:00 2001
From: Lars
Date: Sat, 31 Jan 2026 18:28:26 +0100
Subject: [PATCH] Refactored timeout handling in scout_logic.py to improve URL fetching reliability, added detailed logging for error tracking, and implemented a total timeout for Playwright operations to prevent indefinite hangs.

---
 src/scout_logic.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/scout_logic.py b/src/scout_logic.py
index f89a90c..c260f05 100644
--- a/src/scout_logic.py
+++ b/src/scout_logic.py
@@ -2,6 +2,7 @@
 Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
 Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
 """
+import asyncio
 import json
 import logging
 import re
@@ -9,9 +10,9 @@ from typing import Any
 from urllib.parse import urlparse
 
 import httpx
+from playwright.async_api import async_playwright
 
 logger = logging.getLogger(__name__)
-from playwright.async_api import async_playwright
 from playwright_stealth import Stealth
 
 # OpenRouter Base-URL und Modell
@@ -30,6 +31,8 @@ EXTRA_HEADERS = {
 VIEWPORT = {"width": 1920, "height": 1080}
 # Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
 GOTO_TIMEOUT = 25_000 # ms
+# Harter Gesamt-Timeout für Playwright (falls goto trotzdem hängt, z. B. bei McKinsey)
+PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden (reicht für ~3 URLs × 25s)
 
 
 def _normalize_domain(domain: str) -> str:
@@ -84,15 +87,16 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
             await Stealth().apply_stealth_async(context)
             page = await context.new_page()
 
-            # domcontentloaded statt networkidle: reicht für Links, hängt nicht auf schweren Seiten (Analytics/Ads)
+            # domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
             async def _goto(url: str) -> bool:
                 try:
+                    logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
                     await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
                     return True
-                except Exception:
+                except Exception as e:
+                    logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
                     return False
 
-            logger.info("Playwright: loading page for domain=%s", domain)
             if await _goto(url_insights):
                 logger.info("Playwright: loaded %s", url_insights)
             elif await _goto(url_primary):
@@ -100,7 +104,7 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
             elif await _goto(url_fallback):
                 logger.info("Playwright: loaded %s", url_fallback)
             else:
-                # Letzter Versuch mit etwas mehr Zeit
+                logger.warning("Playwright: all URLs failed, last try with 35s")
                 try:
                     await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
                 except Exception:
@@ -134,6 +138,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
     except Exception as e:
         err_msg = str(e)
         if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
+            logger.info("Playwright: retry with --disable-http2 for domain=%s (error: %s)", domain, type(e).__name__)
             return await _fetch_links_impl(domain, disable_http2=True)
         raise
 
@@ -223,7 +228,13 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
         return {"url": None, "error": "OPENROUTER_API_KEY nicht gesetzt"}
 
     try:
-        links = await _fetch_links_with_playwright(domain)
+        links = await asyncio.wait_for(
+            _fetch_links_with_playwright(domain),
+            timeout=PLAYWRIGHT_TOTAL_TIMEOUT,
+        )
+    except asyncio.TimeoutError:
+        logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain)
+        return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) – Seite antwortet nicht"}
     except Exception as e:
         return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
 