Refined the timeout strategy for URL fetching in scout_logic.py: introduced separate timeouts for the 'commit' and 'domcontentloaded' wait states, and enhanced logging so that errors during page-load attempts are more visible.
This commit is contained in:
parent
beb80e9eaf
commit
f7b328b7f2
|
|
@ -12,9 +12,10 @@ from urllib.parse import urlparse
|
||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
from playwright_stealth import Stealth
|
from playwright_stealth import Stealth
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# OpenRouter Base-URL und Modell
|
# OpenRouter Base-URL und Modell
|
||||||
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||||
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
|
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
|
||||||
|
|
@ -29,10 +30,11 @@ EXTRA_HEADERS = {
|
||||||
"Referer": "https://www.google.com/",
|
"Referer": "https://www.google.com/",
|
||||||
}
|
}
|
||||||
VIEWPORT = {"width": 1920, "height": 1080}
|
VIEWPORT = {"width": 1920, "height": 1080}
|
||||||
# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
|
# Timeouts: commit = Antwort empfangen (schnell), domcontentloaded = DOM bereit (kann bei McKinsey hängen)
|
||||||
GOTO_TIMEOUT = 25_000 # ms
|
GOTO_TIMEOUT_COMMIT = 15_000 # ms – erstes Versuchen mit wait_until="commit"
|
||||||
# Harter Gesamt-Timeout für Playwright (falls goto trotzdem hängt, z. B. bei McKinsey)
|
GOTO_TIMEOUT_DOM = 25_000 # ms – Fallback domcontentloaded
|
||||||
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden (reicht für ~3 URLs × 25s)
|
# Harter Gesamt-Timeout für Playwright
|
||||||
|
PLAYWRIGHT_TOTAL_TIMEOUT = 80.0 # Sekunden
|
||||||
|
|
||||||
|
|
||||||
def _normalize_domain(domain: str) -> str:
|
def _normalize_domain(domain: str) -> str:
|
||||||
|
|
@ -87,24 +89,36 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
await Stealth().apply_stealth_async(context)
|
await Stealth().apply_stealth_async(context)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
|
||||||
# domcontentloaded; pro URL Timeout, damit wir nicht ewig auf schweren Seiten (McKinsey) hängen
|
# Zuerst "commit" (Antwort empfangen) – oft schneller, reicht für HTML mit Links (McKinsey blockiert oft domcontentloaded)
|
||||||
async def _goto(url: str) -> bool:
|
async def _goto(url: str, wait: str = "commit", timeout_ms: int = GOTO_TIMEOUT_COMMIT) -> bool:
|
||||||
try:
|
try:
|
||||||
logger.info("Playwright: trying %s (timeout %ds)", url, GOTO_TIMEOUT // 1000)
|
logger.info("Playwright: trying %s (wait=%s, timeout=%ds)", url, wait, timeout_ms // 1000)
|
||||||
await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
|
await page.goto(url, wait_until=wait, timeout=timeout_ms)
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Playwright: failed %s: %s", url, type(e).__name__)
|
err_msg = str(e).strip()[:120] if str(e) else type(e).__name__
|
||||||
|
logger.warning("Playwright: failed %s: %s", url, err_msg)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
loaded = False
|
||||||
if await _goto(url_insights):
|
if await _goto(url_insights):
|
||||||
logger.info("Playwright: loaded %s", url_insights)
|
loaded = True
|
||||||
elif await _goto(url_primary):
|
logger.info("Playwright: loaded (commit) %s", url_insights)
|
||||||
logger.info("Playwright: loaded %s", url_primary)
|
if not loaded and await _goto(url_primary):
|
||||||
elif await _goto(url_fallback):
|
loaded = True
|
||||||
logger.info("Playwright: loaded %s", url_fallback)
|
logger.info("Playwright: loaded (commit) %s", url_primary)
|
||||||
else:
|
if not loaded and await _goto(url_fallback):
|
||||||
logger.warning("Playwright: all URLs failed, last try with 35s")
|
loaded = True
|
||||||
|
logger.info("Playwright: loaded (commit) %s", url_fallback)
|
||||||
|
# Fallback: domcontentloaded mit längerem Timeout (falls commit zu wenig HTML liefert)
|
||||||
|
if not loaded:
|
||||||
|
logger.info("Playwright: fallback to domcontentloaded (25s)")
|
||||||
|
if await _goto(url_primary, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
|
||||||
|
loaded = True
|
||||||
|
elif await _goto(url_fallback, wait="domcontentloaded", timeout_ms=GOTO_TIMEOUT_DOM):
|
||||||
|
loaded = True
|
||||||
|
if not loaded:
|
||||||
|
logger.warning("Playwright: all URLs failed, last try 35s")
|
||||||
try:
|
try:
|
||||||
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user