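Enhanced main.py and scout_logic.py: added an overall 90-second timeout around publication-URL discovery in the POST /discover handler, added logging for better request tracking, and switched Playwright page loading from "networkidle" to "domcontentloaded" with a shorter 25-second navigation timeout to prevent hangs on heavy pages.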
This commit is contained in:
parent
46b59d2c5c
commit
b3e9a6455b
|
|
@ -1,6 +1,7 @@
|
||||||
"""
|
"""
|
||||||
Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain.
|
Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain.
|
||||||
"""
|
"""
|
||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
@ -56,7 +57,13 @@ async def discover(body: DiscoverRequest) -> DiscoverResponse:
|
||||||
detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)",
|
detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)",
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await get_publication_url(body.domain)
|
# Gesamt-Timeout (Playwright + OpenRouter), damit der Client nicht ewig wartet
|
||||||
|
try:
|
||||||
|
result = await asyncio.wait_for(get_publication_url(body.domain), timeout=90.0)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
logger.warning("POST /discover timeout: domain=%s", body.domain)
|
||||||
|
return DiscoverResponse(url=None, error="Timeout (90s) – Seite oder OpenRouter zu langsam")
|
||||||
|
|
||||||
logger.info("POST /discover done: domain=%s url=%s error=%s", body.domain, result.get("url"), result.get("error"))
|
logger.info("POST /discover done: domain=%s url=%s error=%s", body.domain, result.get("url"), result.get("error"))
|
||||||
return DiscoverResponse(url=result["url"], error=result.get("error"))
|
return DiscoverResponse(url=result["url"], error=result.get("error"))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,14 @@ Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL
|
||||||
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
|
Mit Stealth und HTTP/2-Fallback gegen extreme Bot-Detection.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
from playwright_stealth import Stealth
|
from playwright_stealth import Stealth
|
||||||
|
|
||||||
|
|
@ -25,7 +28,8 @@ EXTRA_HEADERS = {
|
||||||
"Referer": "https://www.google.com/",
|
"Referer": "https://www.google.com/",
|
||||||
}
|
}
|
||||||
VIEWPORT = {"width": 1920, "height": 1080}
|
VIEWPORT = {"width": 1920, "height": 1080}
|
||||||
GOTO_TIMEOUT = 60_000 # ms
|
# Kürzerer Timeout: domcontentloaded reicht für Link-Extraktion; networkidle hängt auf schweren Seiten (McKinsey)
|
||||||
|
GOTO_TIMEOUT = 25_000 # ms
|
||||||
|
|
||||||
|
|
||||||
def _normalize_domain(domain: str) -> str:
|
def _normalize_domain(domain: str) -> str:
|
||||||
|
|
@ -80,43 +84,27 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
await Stealth().apply_stealth_async(context)
|
await Stealth().apply_stealth_async(context)
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
|
||||||
# Versuch 1: Direkt zur Zielseite (z. B. https://www.mckinsey.com/featured-insights)
|
# domcontentloaded statt networkidle: reicht für Links, hängt nicht auf schweren Seiten (Analytics/Ads)
|
||||||
|
async def _goto(url: str) -> bool:
|
||||||
try:
|
try:
|
||||||
await page.goto(
|
await page.goto(url, wait_until="domcontentloaded", timeout=GOTO_TIMEOUT)
|
||||||
url_insights,
|
return True
|
||||||
wait_until="networkidle",
|
|
||||||
timeout=GOTO_TIMEOUT,
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Versuch 2: Startseite mit www
|
return False
|
||||||
|
|
||||||
|
logger.info("Playwright: loading page for domain=%s", domain)
|
||||||
|
if await _goto(url_insights):
|
||||||
|
logger.info("Playwright: loaded %s", url_insights)
|
||||||
|
elif await _goto(url_primary):
|
||||||
|
logger.info("Playwright: loaded %s", url_primary)
|
||||||
|
elif await _goto(url_fallback):
|
||||||
|
logger.info("Playwright: loaded %s", url_fallback)
|
||||||
|
else:
|
||||||
|
# Letzter Versuch mit etwas mehr Zeit
|
||||||
try:
|
try:
|
||||||
await page.goto(
|
await page.goto(url_primary, wait_until="domcontentloaded", timeout=35_000)
|
||||||
url_primary,
|
|
||||||
wait_until="networkidle",
|
|
||||||
timeout=GOTO_TIMEOUT,
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Versuch 3: ohne www
|
await page.goto(url_fallback, wait_until="domcontentloaded", timeout=35_000)
|
||||||
try:
|
|
||||||
await page.goto(
|
|
||||||
url_fallback,
|
|
||||||
wait_until="networkidle",
|
|
||||||
timeout=GOTO_TIMEOUT,
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
# Versuch 4: domcontentloaded als letzter Fallback
|
|
||||||
try:
|
|
||||||
await page.goto(
|
|
||||||
url_primary,
|
|
||||||
wait_until="domcontentloaded",
|
|
||||||
timeout=30_000,
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
await page.goto(
|
|
||||||
url_fallback,
|
|
||||||
wait_until="domcontentloaded",
|
|
||||||
timeout=30_000,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Alle <a href="..."> auslesen
|
# Alle <a href="..."> auslesen
|
||||||
links = await page.evaluate(
|
links = await page.evaluate(
|
||||||
|
|
@ -128,6 +116,7 @@ async def _fetch_links_impl(domain: str, disable_http2: bool = False) -> list[di
|
||||||
})).filter(x => x.href);
|
})).filter(x => x.href);
|
||||||
}"""
|
}"""
|
||||||
)
|
)
|
||||||
|
logger.info("Playwright: got %d links for domain=%s", len(links), domain)
|
||||||
await context.close()
|
await context.close()
|
||||||
finally:
|
finally:
|
||||||
await browser.close()
|
await browser.close()
|
||||||
|
|
@ -167,6 +156,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str
|
||||||
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
|
Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an.
|
||||||
Erwartet Antwort im Format: {"url": "..."} – unverändert.
|
Erwartet Antwort im Format: {"url": "..."} – unverändert.
|
||||||
"""
|
"""
|
||||||
|
logger.info("OpenRouter: sending request for %d links (domain=%s)", len(links), domain)
|
||||||
base_url = _url_with_www(domain)
|
base_url = _url_with_www(domain)
|
||||||
prompt = (
|
prompt = (
|
||||||
"Analysiere diese Links einer Unternehmensberatung. "
|
"Analysiere diese Links einer Unternehmensberatung. "
|
||||||
|
|
@ -197,6 +187,7 @@ async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
logger.info("OpenRouter: got response for domain=%s", domain)
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
choice = (data.get("choices") or [None])[0]
|
choice = (data.get("choices") or [None])[0]
|
||||||
if not choice:
|
if not choice:
|
||||||
|
|
@ -239,6 +230,7 @@ async def get_publication_url(domain: str, *, api_key: str | None = None) -> dic
|
||||||
if not links:
|
if not links:
|
||||||
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
|
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
|
||||||
|
|
||||||
|
logger.info("Scout: calling OpenRouter for domain=%s", domain)
|
||||||
try:
|
try:
|
||||||
url = await _ask_openrouter(key, links, domain)
|
url = await _ask_openrouter(key, links, domain)
|
||||||
return {"url": url, "error": None}
|
return {"url": url, "error": None}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user