From 652261b7744eae08299cce4c3bfea7b7f17ce964 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 31 Jan 2026 18:42:42 +0100 Subject: [PATCH] Added support for known hub URLs in scout_logic.py to handle Playwright timeouts and errors more effectively. Updated fetching logic to prioritize known URLs when encountering issues, enhancing reliability in link extraction. --- src/scout_logic.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/scout_logic.py b/src/scout_logic.py index 97a1123..54b8851 100644 --- a/src/scout_logic.py +++ b/src/scout_logic.py @@ -41,6 +41,11 @@ GOTO_TIMEOUT_DOM = 25_000 # ms – Fallback domcontentloaded # Harter Gesamt-Timeout für Playwright PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden +# Bekannte Hub-URLs für Domains, die Playwright stark blockieren (Fallback bei Timeout/Fehler) +KNOWN_HUB_URLS: dict[str, str] = { + "mckinsey.com": "https://www.mckinsey.com/mgi/our-research/all-research", +} + def _normalize_domain(domain: str) -> str: """Extrahiert die reine Domain (Host) ohne Schema und Pfad.""" @@ -150,15 +155,16 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]: """ Lädt die Domain mit Playwright (Stealth, Kontext gegen Bot-Detection), extrahiert <a>-Tags. - Bei ERR_HTTP2_PROTOCOL_ERROR: erneuter Versuch mit --disable-http2. + McKinsey u. a. liefern ERR_HTTP2 ohne --disable-http2; daher zuerst mit --disable-http2 starten. 
""" + # Zuerst mit --disable-http2 (vermeidet verschwendeten ersten Lauf bei ERR_HTTP2) try: - return await _fetch_links_impl(domain, disable_http2=False) + return await _fetch_links_impl(domain, disable_http2=True) except Exception as e: err_msg = str(e) if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg: - logger.info("Playwright: retry with --disable-http2 for domain=%s (error: %s)", domain, type(e).__name__) - return await _fetch_links_impl(domain, disable_http2=True) + logger.info("Playwright: retry without --disable-http2 for domain=%s", domain) + return await _fetch_links_impl(domain, disable_http2=False) raise @@ -257,11 +263,23 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic ) except asyncio.TimeoutError: logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain) + known = KNOWN_HUB_URLS.get(_normalize_domain(domain)) + if known: + logger.info("Scout: using known hub URL for %s: %s", domain, known) + return {"url": known, "error": None} return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) – Seite antwortet nicht"} except Exception as e: + known = KNOWN_HUB_URLS.get(_normalize_domain(domain)) + if known: + logger.info("Scout: Playwright failed for %s, using known hub URL: %s", domain, known) + return {"url": known, "error": None} return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"} if not links: + known = KNOWN_HUB_URLS.get(_normalize_domain(domain)) + if known: + logger.info("Scout: no links for %s, using known hub URL: %s", domain, known) + return {"url": known, "error": None} return {"url": None, "error": "Keine Links auf der Seite gefunden"} logger.info("Scout: calling OpenRouter for domain=%s", domain)