Refined timeout strategy in scout_logic.py for URL fetching, introducing separate timeouts for 'commit' and 'domcontentloaded' states, and enhanced logging for better error visibility during page loading attempts.

This commit is contained in:
Lars 2026-01-31 18:33:44 +01:00
parent beb80e9eaf
commit f7b328b7f2

View File

@@ -12,9 +12,10 @@ from urllib.parse import urlparse
 import httpx
 from playwright.async_api import async_playwright
-logger = logging.getLogger(__name__)
 from playwright_stealth import Stealth
+
+logger = logging.getLogger(__name__)
 # OpenRouter Base-URL und Modell
 OPENROUTER_BASE = "https://openrouter.ai/api/v1"
 DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
@@ -29,10 +30,11 @@ EXTRA_HEADERS = {
     "Referer": "https://www.google.com/",
 }
 VIEWPORT = {"width": 1920, "height": 1080}
-# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
-GOTO_TIMEOUT = 25_000  # ms
-# Harter Gesamt-Timeout für Playwright (falls goto trotzdem hängt, z. B. bei McKinsey)
-PLAYWRIGHT_TOTAL_TIMEOUT = 80.0  # Sekunden (reicht für ~3 URLs × 25s)
+# Timeouts: commit = Antwort empfangen (schnell), domcontentloaded = DOM bereit (kann bei McKinsey hängen)
+GOTO_TIMEOUT_COMMIT = 15_000  # ms erstes Versuchen mit wait_until="commit"
+GOTO_TIMEOUT_DOM = 25_000  # ms Fallback domcontentloaded
+# Harter Gesamt-Timeout für Playwright
+PLAYWRIGHT_TOTAL_TIMEOUT = 80.0  # Sekunden
def _normalize_domain(domain: str) -> str: def _normalize_domain(domain: str) -> str:
@@ -87,24 +89,36 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[dict]:
         await Stealth().apply_stealth_async(context)
         page = await context.new_page()
-        # domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
-        async def _goto(url: str) -> bool:
+        # Zuerst "commit" (Antwort empfangen) oft schneller, reicht für HTML mit Links (McKinsey blockiert oft domcontentloaded)
+        async def _goto(url: str, wait: str = "commit", timeout_ms: int = GOTO_TIMEOUT_COMMIT) -> bool:
             try:
-                logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
-                await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
+                logger.info("Playwright: trying %s (wait=%s, timeout=%ds)", url, wait, timeout_ms // 1000)
+                await page.goto(url, wait_until=wait, timeout=timeout_ms)
                 return True
             except Exception as e:
-                logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
+                err_msg = str(e).strip()[:120] if str(e) else type(e).__name__
+                logger.warning("Playwright: failed %s: %s", url, err_msg)
                 return False
+        loaded = False
         if await _goto(url_insights):
-            logger.info("Playwright: loaded %s", url_insights)
-        elif await _goto(url_primary):
-            logger.info("Playwright: loaded %s", url_primary)
-        elif await _goto(url_fallback):
-            logger.info("Playwright: loaded %s", url_fallback)
-        else:
-            logger.warning("Playwright: all URLs failed, last try with 35s")
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_insights)
+        if not loaded and await _goto(url_primary):
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_primary)
+        if not loaded and await _goto(url_fallback):
+            loaded = True
+            logger.info("Playwright: loaded (commit) %s", url_fallback)
+        # Fallback: domcontentloaded mit längerem Timeout (falls commit zu wenig HTML liefert)
+        if not loaded:
+            logger.info("Playwright: fallback to domcontentloaded (25s)")
+            if await _goto(url_primary, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
+                loaded = True
+            elif await _goto(url_fallback, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
+                loaded = True
+        if not loaded:
+            logger.warning("Playwright: all URLs failed, last try 35s")
             try:
                 await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
             except Exception: