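Refined the timeout strategy for URL fetching in scout_logic.py: page loads are now attempted first with wait_until="commit" (15 s timeout) and fall back to wait_until="domcontentloaded" (25 s timeout), each with its own timeout constant. Logging is also improved for better error visibility during page-loading attempts: each attempt logs the wait mode and timeout, and failures log the exception message (truncated to 120 characters) instead of only the exception class name.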
This commit is contained in:
parent
beb80e9eaf
commit
f7b328b7f2
|
|
@ -12,9 +12,10 @@ from urllib.parse import urlparse
|
|||
import httpx
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from playwright_stealth import Stealth
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# OpenRouter Base-URL und Modell
|
||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
|
||||
|
|
@ -29,10 +30,11 @@ EXTRA_HEADERS = {
|
|||
"Referer": "https://www.google.com/",
|
||||
}
|
||||
VIEWPORT = {"width": 1920, "height": 1080}
|
||||
# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
|
||||
GOTO_TIMEOUT = 25_000 # ms
|
||||
# Harter Gesamt-Timeout für Playwright (falls goto trotzdem hängt, z. B. bei McKinsey)
|
||||
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden (reicht für ~3 URLs × 25s)
|
||||
# Timeouts: commit = Antwort empfangen (schnell), domcontentloaded = DOM bereit (kann bei McKinsey hängen)
|
||||
GOTO_TIMEOUT_COMMIT = 15_000 # ms – erstes Versuchen mit wait_until="commit"
|
||||
GOTO_TIMEOUT_DOM = 25_000 # ms – Fallback domcontentloaded
|
||||
# Harter Gesamt-Timeout für Playwright
|
||||
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden
|
||||
|
||||
|
||||
def _normalize_domain(domain: str) -> str:
|
||||
|
|
@ -87,24 +89,36 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
|||
await Stealth().apply_stealth_async(context)
|
||||
page = await context.new_page()
|
||||
|
||||
# domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
|
||||
async def _goto(url: str) -> bool:
|
||||
# Zuerst "commit" (Antwort empfangen) – oft schneller, reicht für HTML mit Links (McKinsey blockiert oft domcontentloaded)
|
||||
async def _goto(url: str, wait: str = "commit", timeout_ms: int = GOTO_TIMEOUT_COMMIT) -> bool:
|
||||
try:
|
||||
logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
|
||||
logger.info("Playwright: trying %s (wait=%s, timeout=%ds)", url, wait, timeout_ms // 1000)
|
||||
await page.goto(url, wait_until=wait, timeout=timeout_ms)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
|
||||
err_msg = str(e).strip()[:120] if str(e) else type(e).__name__
|
||||
logger.warning("Playwright: failed %s: %s", url, err_msg)
|
||||
return False
|
||||
|
||||
loaded = False
|
||||
if await _goto(url_insights):
|
||||
logger.info("Playwright: loaded %s", url_insights)
|
||||
elif await _goto(url_primary):
|
||||
logger.info("Playwright: loaded %s", url_primary)
|
||||
elif await _goto(url_fallback):
|
||||
logger.info("Playwright: loaded %s", url_fallback)
|
||||
else:
|
||||
logger.warning("Playwright: all URLs failed, last try with 35s")
|
||||
loaded = True
|
||||
logger.info("Playwright: loaded (commit) %s", url_insights)
|
||||
if not loaded and await _goto(url_primary):
|
||||
loaded = True
|
||||
logger.info("Playwright: loaded (commit) %s", url_primary)
|
||||
if not loaded and await _goto(url_fallback):
|
||||
loaded = True
|
||||
logger.info("Playwright: loaded (commit) %s", url_fallback)
|
||||
# Fallback: domcontentloaded mit längerem Timeout (falls commit zu wenig HTML liefert)
|
||||
if not loaded:
|
||||
logger.info("Playwright: fallback to domcontentloaded (25s)")
|
||||
if await _goto(url_primary, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
|
||||
loaded = True
|
||||
elif await _goto(url_fallback, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
|
||||
loaded = True
|
||||
if not loaded:
|
||||
logger.warning("Playwright: all URLs failed, last try 35s")
|
||||
try:
|
||||
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
||||
except Exception:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user