Enhanced main.py and scout_logic.py: improved timeout handling for URL fetching (overall 90 s request timeout, shorter per-page navigation timeout), added logging for better request tracking, and switched page loading from networkidle to domcontentloaded to prevent hangs on heavy pages.

This commit is contained in:
Lars 2026-01-31 18:25:23 +01:00
parent 46b59d2c5c
commit b3e9a6455b
2 changed files with 36 additions and 37 deletions

View File

@ -1,6 +1,7 @@
""" """
Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain. Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain.
""" """
import asyncio
import logging import logging
import os import os
@ -56,7 +57,13 @@ async def discover(body: DiscoverRequest) -> DiscoverResponse:
detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)", detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)",
) )
result = await get_publication_url(body.domain) # Gesamt-Timeout (Playwright + OpenRouter), damit der Client nicht ewig wartet
try:
result = await asyncio.wait_for(get_publication_url(body.domain), timeout=90.0)
except asyncio.TimeoutError:
logger.warning("POST /discover timeout: domain=%s", body.domain)
return DiscoverResponse(url=None, error="Timeout (90s) Seite oder OpenRouter zu langsam")
logger.info("POST /discover done: domain=%s url=%s error=%s", body.domain, result.get("url"), result.get("error")) logger.info("POST /discover done: domain=%s url=%s error=%s", body.domain, result.get("url"), result.get("error"))
return DiscoverResponse(url=result["url"], error=result.get("error")) return DiscoverResponse(url=result["url"], error=result.get("error"))

View File

@ -3,11 +3,14 @@ Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection. Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
""" """
import json import json
import logging
import re import re
from typing import Any from typing import Any
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
logger = logging.getLogger(__name__)
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from playwright_stealth import Stealth from playwright_stealth import Stealth
@ -25,7 +28,8 @@ EXTRA_HEADERS = {
"Referer": "https://www.google.com/", "Referer": "https://www.google.com/",
} }
VIEWPORT = {"width": 1920, "height": 1080} VIEWPORT = {"width": 1920, "height": 1080}
GOTO_TIMEOUT = 60_000 # ms # Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
GOTO_TIMEOUT = 25_000 # ms
def _normalize_domain(domain: str) -> str: def _normalize_domain(domain: str) -> str:
@ -80,43 +84,27 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
await Stealth().apply_stealth_async(context) await Stealth().apply_stealth_async(context)
page = await context.new_page() page = await context.new_page()
# Versuch 1: Direkt zur Zielseite (z. B. https://www.mckinsey.com/featured-insights) # domcontentloaded statt networkidle: reicht für Links, hängt nicht auf schweren Seiten (Analytics/Ads)
try: async def _goto(url: str) -> bool:
await page.goto(
url_insights,
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception:
# Versuch 2: Startseite mit www
try: try:
await page.goto( await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
url_primary, return True
wait_until="networkidle",
timeout=GOTO_TIMEOUT,
)
except Exception: except Exception:
# Versuch 3: ohne www return False
try:
await page.goto( logger.info("Playwright: loading page for domain=%s", domain)
url_fallback, if await _goto(url_insights):
wait_until="networkidle", logger.info("Playwright: loaded %s", url_insights)
timeout=GOTO_TIMEOUT, elif await _goto(url_primary):
) logger.info("Playwright: loaded %s", url_primary)
except Exception: elif await _goto(url_fallback):
# Versuch 4: domcontentloaded als letzter Fallback logger.info("Playwright: loaded %s", url_fallback)
try: else:
await page.goto( # Letzter Versuch mit etwas mehr Zeit
url_primary, try:
wait_until="domcontentloaded", await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
timeout=30_000, except Exception:
) await page.goto(url_fallback, wait_until="domcontentloaded", timeout=35_000)
except Exception:
await page.goto(
url_fallback,
wait_until="domcontentloaded",
timeout=30_000,
)
# Alle <a href="..."> auslesen # Alle <a href="..."> auslesen
links = await page.evaluate( links = await page.evaluate(
@ -128,6 +116,7 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
})).filter(x => x.href); })).filter(x => x.href);
}""" }"""
) )
logger.info("Playwright: got %d links for domain=%s", len(links), domain)
await context.close() await context.close()
finally: finally:
await browser.close() await browser.close()
@ -167,6 +156,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an. Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
Erwartet Antwort im Format: {"url": "..."} unverändert. Erwartet Antwort im Format: {"url": "..."} unverändert.
""" """
logger.info("OpenRouter: sending request for %d links (domain=%s)", len(links), domain)
base_url = _url_with_www(domain) base_url = _url_with_www(domain)
prompt = ( prompt = (
"Analysiere diese Links einer Unternehmensberatung. " "Analysiere diese Links einer Unternehmensberatung. "
@ -197,6 +187,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str
}, },
) )
resp.raise_for_status() resp.raise_for_status()
logger.info("OpenRouter: got response for domain=%s", domain)
data = resp.json() data = resp.json()
choice = (data.get("choices") or [None])[0] choice = (data.get("choices") or [None])[0]
if not choice: if not choice:
@ -239,6 +230,7 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
if not links: if not links:
return {"url": None, "error": "Keine Links auf der Seite gefunden"} return {"url": None, "error": "Keine Links auf der Seite gefunden"}
logger.info("Scout: calling OpenRouter for domain=%s", domain)
try: try:
url = await _ask_openrouter(key, links, domain) url = await _ask_openrouter(key, links, domain)
return {"url": url, "error": None} return {"url": url, "error": None}