From b3e9a6455b7ac8c81b219eb933a439a96e0586fb Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 31 Jan 2026 18:25:23 +0100 Subject: [PATCH] Enhanced main.py and scout_logic.py with improved timeout handling for URL fetching, added logging for better request tracking, and optimized page loading strategy to prevent hangs on heavy pages. --- src/main.py | 9 ++++++- src/scout_logic.py | 64 ++++++++++++++++++++-------------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/src/main.py b/src/main.py index b145b8a..0803ab9 100644 --- a/src/main.py +++ b/src/main.py @@ -1,6 +1,7 @@ """ Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain. """ +import asyncio import logging import os @@ -56,7 +57,13 @@ async def discover(body: DiscoverRequest) -> DiscoverResponse: detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)", ) - result = await get_publication_url(body.domain) + # Gesamt-Timeout (Playwright + OpenRouter), damit der Client nicht ewig wartet + try: + result = await asyncio.wait_for(get_publication_url(body.domain), timeout=90.0) + except asyncio.TimeoutError: + logger.warning("POST /discover timeout: domain=%s", body.domain) + return DiscoverResponse(url=None, error="Timeout (90s) – Seite oder OpenRouter zu langsam") + logger.info("POST /discover done: domain=%s url=%s error=%s", body.domain, result.get("url"), result.get("error")) return DiscoverResponse(url=result["url"], error=result.get("error")) diff --git a/src/scout_logic.py b/src/scout_logic.py index 3cd77ac..f89a90c 100644 --- a/src/scout_logic.py +++ b/src/scout_logic.py @@ -3,11 +3,14 @@ Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection. """ import json +import logging import re from typing import Any from urllib.parse import urlparse import httpx + +logger = logging.getLogger(__name__) from playwright.async_api import async_playwright from playwright_stealth import Stealth @@ -25,7 +28,8 @@ EXTRA_HEADERS = { "Referer": "https://www.google.com/", } VIEWPORT = {"width": 1920, "height": 1080} -GOTO_TIMEOUT = 60_000 # ms +# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey) +GOTO_TIMEOUT = 25_000 # ms def _normalize_domain(domain: str) -> str: @@ -80,43 +84,27 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di await Stealth().apply_stealth_async(context) page = await context.new_page() - # Versuch 1: Direkt zur Zielseite (z. B. https://www.mckinsey.com/featured-insights) - try: - await page.goto( - url_insights, - wait_until="networkidle", - timeout=GOTO_TIMEOUT, - ) - except Exception: - # Versuch 2: Startseite mit www + # domcontentloaded statt networkidle: reicht für Links, hängt nicht auf schweren Seiten (Analytics/Ads) + async def _goto(url: str) -> bool: try: - await page.goto( - url_primary, - wait_until="networkidle", - timeout=GOTO_TIMEOUT, - ) + await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT) + return True except Exception: - # Versuch 3: ohne www - try: - await page.goto( - url_fallback, - wait_until="networkidle", - timeout=GOTO_TIMEOUT, - ) - except Exception: - # Versuch 4: domcontentloaded als letzter Fallback - try: - await page.goto( - url_primary, - wait_until="domcontentloaded", - timeout=30_000, - ) - except Exception: - await page.goto( - url_fallback, - wait_until="domcontentloaded", - timeout=30_000, - ) + return False + + logger.info("Playwright: loading page for domain=%s", domain) + if await _goto(url_insights): + logger.info("Playwright: loaded %s", url_insights) + elif await _goto(url_primary): + logger.info("Playwright: loaded %s", url_primary) + elif await _goto(url_fallback): + logger.info("Playwright: loaded %s", url_fallback) + else: + # Letzter Versuch mit etwas mehr Zeit + try: + await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000) + except Exception: + await page.goto(url_fallback, wait_until="domcontentloaded", timeout=35_000) # Alle auslesen links = await page.evaluate( @@ -128,6 +116,7 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di })).filter(x => x.href); }""" ) + logger.info("Playwright: got %d links for domain=%s", len(links), domain) await context.close() finally: await browser.close() @@ -167,6 +156,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an. Erwartet Antwort im Format: {"url": "..."} – unverändert. """ + logger.info("OpenRouter: sending request for %d links (domain=%s)", len(links), domain) base_url = _url_with_www(domain) prompt = ( "Analysiere diese Links einer Unternehmensberatung. " @@ -197,6 +187,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str }, ) resp.raise_for_status() + logger.info("OpenRouter: got response for domain=%s", domain) data = resp.json() choice = (data.get("choices") or [None])[0] if not choice: @@ -239,6 +230,7 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic if not links: return {"url": None, "error": "Keine Links auf der Seite gefunden"} + logger.info("Scout: calling OpenRouter for domain=%s", domain) try: url = await _ask_openrouter(key, links, domain) return {"url": url, "error": None}