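Added a KNOWN_HUB_URLS fallback in scout_logic.py: when Playwright hits the total timeout, raises an error, or returns no links for a domain, the known hub URL for that domain (currently only mckinsey.com) is returned instead of an error. Also changed the Playwright fetch to start with --disable-http2 and to retry without it on net::ERR_ errors, avoiding a wasted first attempt on sites that fail with ERR_HTTP2_PROTOCOL_ERROR.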
This commit is contained in:
parent
8000642eae
commit
652261b774
|
|
@ -41,6 +41,11 @@ GOTO_TIMEOUT_DOM = 25_000 # ms – Fallback domcontentloaded
|
||||||
# Harter Gesamt-Timeout für Playwright
|
# Harter Gesamt-Timeout für Playwright
|
||||||
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden
|
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden
|
||||||
|
|
||||||
|
# Bekannte Hub-URLs für Domains, die Playwright stark blockieren (Fallback bei Timeout/Fehler)
|
||||||
|
KNOWN_HUB_URLS: dict[str, str] = {
|
||||||
|
"mckinsey.com": "https://www.mckinsey.com/mgi/our-research/all-research",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _normalize_domain(domain: str) -> str:
|
def _normalize_domain(domain: str) -> str:
|
||||||
"""Extrahiert die reine Domain (Host) ohne Schema und Pfad."""
|
"""Extrahiert die reine Domain (Host) ohne Schema und Pfad."""
|
||||||
|
|
@ -150,15 +155,16 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
"""
|
"""
|
||||||
Lädt die Domain mit Playwright (Stealth, Kontext gegen Bot-Detection), extrahiert <a>-Tags.
|
Lädt die Domain mit Playwright (Stealth, Kontext gegen Bot-Detection), extrahiert <a>-Tags.
|
||||||
Bei ERR_HTTP2_PROTOCOL_ERROR: erneuter Versuch mit --disable-http2.
|
McKinsey u. a. liefern ERR_HTTP2 ohne --disable-http2; daher zuerst mit --disable-http2 starten.
|
||||||
"""
|
"""
|
||||||
|
# Zuerst mit --disable-http2 (vermeidet verschwendeten ersten Lauf bei ERR_HTTP2)
|
||||||
try:
|
try:
|
||||||
return await _fetch_links_impl(domain, disable_http2=False)
|
return await _fetch_links_impl(domain, disable_http2=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
err_msg = str(e)
|
err_msg = str(e)
|
||||||
if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
|
if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
|
||||||
logger.info("Playwright: retry with --disable-http2 for domain=%s (error: %s)", domain, type(e).__name__)
|
logger.info("Playwright: retry without --disable-http2 for domain=%s", domain)
|
||||||
return await _fetch_links_impl(domain, disable_http2=True)
|
return await _fetch_links_impl(domain, disable_http2=False)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -257,11 +263,23 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
|
||||||
)
|
)
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain)
|
logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain)
|
||||||
|
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
|
||||||
|
if known:
|
||||||
|
logger.info("Scout: using known hub URL for %s: %s", domain, known)
|
||||||
|
return {"url": known, "error": None}
|
||||||
return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) – Seite antwortet nicht"}
|
return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) – Seite antwortet nicht"}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
|
||||||
|
if known:
|
||||||
|
logger.info("Scout: Playwright failed for %s, using known hub URL: %s", domain, known)
|
||||||
|
return {"url": known, "error": None}
|
||||||
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
|
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
|
||||||
|
|
||||||
if not links:
|
if not links:
|
||||||
|
known = KNOWN_HUB_URLS.get(_normalize_domain(domain))
|
||||||
|
if known:
|
||||||
|
logger.info("Scout: no links for %s, using known hub URL: %s", domain, known)
|
||||||
|
return {"url": known, "error": None}
|
||||||
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
|
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
|
||||||
|
|
||||||
logger.info("Scout: calling OpenRouter for domain=%s", domain)
|
logger.info("Scout: calling OpenRouter for domain=%s", domain)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user