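Added support for known hub URLs (KNOWN_HUB_URLS) in scout_logic.py to handle Playwright timeouts and errors more effectively. The fetching logic now falls back to the known URL for a domain when Playwright times out, raises an error, or returns no links, and Playwright is started with --disable-http2 first (retrying without it on net::ERR_ errors), enhancing reliability in link extraction.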

This commit is contained in:
Lars 2026-01-31 18:42:42 +01:00
parent 8000642eae
commit 652261b774

View File

@ -41,6 +41,11 @@ GOTO_TIMEOUT_DOM = 25_000 # ms Fallback domcontentloaded
# Hard overall timeout for Playwright
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # seconds
# Known hub URLs for domains that heavily block Playwright (fallback on timeout/error).
# Looked up with the result of _normalize_domain(domain); the value is returned
# as the publication URL when Playwright yields nothing usable.
KNOWN_HUB_URLS: dict[str, str] = {
"mckinsey.com": "https://www.mckinsey.com/mgi/our-research/all-research",
}
def _normalize_domain(domain: str) -> str:
"""Extrahiert die reine Domain (Host) ohne Schema und Pfad."""
@ -150,15 +155,16 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
    """
    Load the domain with Playwright (stealth, context against bot detection) and extract <a> tags.

    McKinsey and others answer with ERR_HTTP2 unless --disable-http2 is set, so
    the first attempt already runs with ``disable_http2=True``. If that attempt
    fails with a Chromium network error, one retry is made with HTTP/2 enabled.

    Args:
        domain: Domain to load; passed unchanged to ``_fetch_links_impl``.

    Returns:
        The list of link dicts produced by ``_fetch_links_impl``.

    Raises:
        Exception: A first-attempt error whose message contains neither
            "ERR_HTTP2_PROTOCOL_ERROR" nor "net::ERR_" is re-raised unchanged;
            any error raised by the retry propagates as well.
    """
    # Start with --disable-http2 to avoid a wasted first run on ERR_HTTP2.
    try:
        return await _fetch_links_impl(domain, disable_http2=True)
    except Exception as e:
        err_msg = str(e)
        # Only network-level Chromium errors justify a second attempt.
        if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
            logger.info("Playwright: retry without --disable-http2 for domain=%s", domain)
            return await _fetch_links_impl(domain, disable_http2=False)
        raise
@ -257,11 +263,23 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
)
except asyncio.TimeoutError:
logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain)
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
if known:
logger.info("Scout: using known hub URL for %s: %s", domain, known)
return {"url": known, "error": None}
return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) Seite antwortet nicht"}
except Exception as e:
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
if known:
logger.info("Scout: Playwright failed for %s, using known hub URL: %s", domain, known)
return {"url": known, "error": None}
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
if not links:
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
if known:
logger.info("Scout: no links for %s, using known hub URL: %s", domain, known)
return {"url": known, "error": None}
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
logger.info("Scout: calling OpenRouter for domain=%s", domain)