Refactored timeout handling in scout_logic.py to improve URL fetching reliability, added detailed logging for error tracking, and implemented a total timeout for Playwright operations to prevent indefinite hangs.
This commit is contained in:
parent
b3e9a6455b
commit
beb80e9eaf
|
|
@ -2,6 +2,7 @@
|
||||||
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
|
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
|
||||||
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
|
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
|
||||||
"""
|
"""
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
@ -9,9 +10,9 @@ from typing import Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
from playwright.async_api import async_playwright
|
|
||||||
from playwright_stealth import Stealth
|
from playwright_stealth import Stealth
|
||||||
|
|
||||||
# OpenRouter Base-URL und Modell
|
# OpenRouter Base-URL und Modell
|
||||||
|
|
@ -30,6 +31,8 @@ EXTRA_HEADERS = {
|
||||||
VIEWPORT = {"width": 1920, "height": 1080}
|
VIEWPORT = {"width": 1920, "height": 1080}
|
||||||
# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
|
# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
|
||||||
GOTO_TIMEOUT = 25_000 # ms
|
GOTO_TIMEOUT = 25_000 # ms
|
||||||
|
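# Hard overall timeout for Playwright (in case goto still hangs, e.g. on McKinsey). Note: three 25s attempts plus the final 35s attempt can exceed this limit, so the last attempt may be cut off.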
|
||||||
|
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden (reicht für ~3 URLs × 25s)
|
||||||
|
|
||||||
|
|
||||||
def _normalize_domain(domain: str) -> str:
|
def _normalize_domain(domain: str) -> str:
|
||||||
|
|
@ -84,15 +87,16 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
await Stealth().apply_stealth_async(context)
|
await Stealth().apply_stealth_async(context)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
|
||||||
# domcontentloaded statt networkidle: reicht für Links, hängt nicht auf schweren Seiten (Analytics/Ads)
|
# domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
|
||||||
async def _goto(url: str) -> bool:
|
async def _goto(url: str) -> bool:
|
||||||
try:
|
try:
|
||||||
|
logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
|
||||||
await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
|
await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
|
||||||
return True
|
return True
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
logger.info("Playwright: loading page for domain=%s", domain)
|
|
||||||
if await _goto(url_insights):
|
if await _goto(url_insights):
|
||||||
logger.info("Playwright: loaded %s", url_insights)
|
logger.info("Playwright: loaded %s", url_insights)
|
||||||
elif await _goto(url_primary):
|
elif await _goto(url_primary):
|
||||||
|
|
@ -100,7 +104,7 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
elif await _goto(url_fallback):
|
elif await _goto(url_fallback):
|
||||||
logger.info("Playwright: loaded %s", url_fallback)
|
logger.info("Playwright: loaded %s", url_fallback)
|
||||||
else:
|
else:
|
||||||
# Letzter Versuch mit etwas mehr Zeit
|
logger.warning("Playwright: all URLs failed, last try with 35s")
|
||||||
try:
|
try:
|
||||||
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
@ -134,6 +138,7 @@ async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
err_msg = str(e)
|
err_msg = str(e)
|
||||||
if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
|
if "ERR_HTTP2_PROTOCOL_ERROR" in err_msg or "net::ERR_" in err_msg:
|
||||||
|
logger.info("Playwright: retry with --disable-http2 for domain=%s (error: %s)", domain, type(e).__name__)
|
||||||
return await _fetch_links_impl(domain, disable_http2=True)
|
return await _fetch_links_impl(domain, disable_http2=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
@ -223,7 +228,13 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
|
||||||
return {"url": None, "error": "OPENROUTER_API_KEY nicht gesetzt"}
|
return {"url": None, "error": "OPENROUTER_API_KEY nicht gesetzt"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
links = await _fetch_links_with_playwright(domain)
|
links = await asyncio.wait_for(
|
||||||
|
_fetch_links_with_playwright(domain),
|
||||||
|
timeout=PLAYWRIGHT_TOTAL_TIMEOUT,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("Playwright: total timeout (%.0fs) for domain=%s", PLAYWRIGHT_TOTAL_TIMEOUT, domain)
|
||||||
|
return {"url": None, "error": f"Playwright-Timeout ({PLAYWRIGHT_TOTAL_TIMEOUT:.0f}s) – Seite antwortet nicht"}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
|
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user