diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c07033f --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +# OpenRouter API-Key für Scout (LLM-Auswertung der Links) +# Erstellen: https://openrouter.ai/keys +OPENROUTER_API_KEY=sk-or-v1- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96d4112 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +.venv/ +venv/ + +# Umgebungsvariablen (WICHTIG!) +.env + +# Docker & Playwright +.pytest_cache/ +browser_data/ \ No newline at end of file diff --git a/Dockerfile.worker b/Dockerfile.worker new file mode 100644 index 0000000..54b66b6 --- /dev/null +++ b/Dockerfile.worker @@ -0,0 +1,27 @@ +# Scout-Modul CIA: FastAPI + Playwright (Browser) +FROM python:3.12-slim + +WORKDIR /app + +# System-Pakete für Playwright (Chromium) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 \ + libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 \ + libxrandr2 libgbm1 libasound2 libpango-1.0-0 libcairo2 \ + wget ca-certificates fonts-liberation \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Playwright-Browser (Chromium) installieren +RUN playwright install chromium && playwright install-deps chromium + +COPY src/ ./src/ +WORKDIR /app/src + +# Port für FastAPI +EXPOSE 8000 + +ENV PYTHONUNBUFFERED=1 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..dc02254 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,13 @@ +# CIA Scout-Modul: FastAPI-Service auf Port 8000 +services: + scout: + build: + context: . 
# Pull OPENROUTER_API_KEY (and any other settings) from a local .env file
# into the process environment before requests are served.
load_dotenv()

# NOTE: title/description below, the Field(description=...) texts and the
# docstrings of the models and endpoints are left verbatim on purpose:
# FastAPI/pydantic publish them in the generated OpenAPI schema, so they are
# runtime output rather than plain comments.
app = FastAPI(
    title="CIA Scout",
    description="Erkennt Publikations-/Insights-Seiten von Beratungsdomains.",
    version="0.1.0",
)


class DiscoverRequest(BaseModel):
    """Request-Body für /discover."""

    # Domain to scan; must be non-empty (min_length=1).
    domain: str = Field(..., min_length=1, description="Domain, z.B. mckinsey.com")


class DiscoverResponse(BaseModel):
    """Response: gefundene Publikations-URL oder Fehler."""

    # Both fields default to None; the scout result fills in url and error.
    url: str | None = Field(None, description="Gefundene absolute URL für Reports/Insights")
    error: str | None = Field(None, description="Fehlermeldung, falls kein Ergebnis")


@app.get("/health")
async def health():
    """Einfacher Health-Check."""
    # Static payload: no dependency (browser, OpenRouter) is probed here.
    status_payload = {"status": "ok"}
    return status_payload


@app.post("/discover", response_model=DiscoverResponse)
async def discover(body: DiscoverRequest) -> DiscoverResponse:
    """
    Domain übergeben; Service scannt die Startseite mit Playwright,
    extrahiert Links und lässt OpenRouter die beste Publikations-URL wählen.
    """
    if os.getenv("OPENROUTER_API_KEY"):
        # Scrape/LLM failures are reported through the `error` field of the
        # response body rather than as an HTTP error status.
        outcome = await get_publication_url(body.domain)
        return DiscoverResponse(url=outcome["url"], error=outcome.get("error"))

    # Without an API key the service cannot do its job at all -> 503.
    raise HTTPException(
        status_code=503,
        detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)",
    )


if __name__ == "__main__":
    # Allow `python main.py` as an alternative to launching via the uvicorn CLI.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
+ """ + # Domain mit Schema normalisieren + url = domain if domain.startswith("http") else f"https://{domain}" + links: list[dict[str, str]] = [] + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + try: + page = await browser.new_page() + await page.goto(url, wait_until="domcontentloaded", timeout=15000) + # Alle auslesen + links = await page.evaluate( + """() => { + const anchors = document.querySelectorAll('a[href]'); + return Array.from(anchors).map(a => ({ + text: (a.textContent || '').trim().slice(0, 200), + href: a.getAttribute('href') || '' + })).filter(x => x.href); + }""" + ) + finally: + await browser.close() + + return links + + +def _make_absolute(href: str, base_url: str) -> str: + """Macht relative URLs absolut (einfache Heuristik).""" + if not href or href.startswith("#"): + return "" + if href.startswith("http://") or href.startswith("https://"): + return href + base = base_url.rstrip("/") + if href.startswith("/"): + parsed = urlparse(base) + return f"{parsed.scheme}://{parsed.netloc}{href}" + return f"{base}/{href}" + + +async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None: + """ + Sendet die Link-Liste an OpenRouter und fordert die beste Publikations-URL an. + Erwartet Antwort im Format: {"url": "..."} + """ + base_url = domain if domain.startswith("http") else f"https://{domain}" + prompt = ( + "Analysiere diese Links einer Unternehmensberatung. " + "Welcher Link führt zur Seite mit Reports, Insights oder Fachartikeln? 
" + "Antworte NUR mit der absoluten URL im JSON-Format: {'url': '...'}" + ) + links_text = "\n".join( + f"- {l.get('text', '')} -> {_make_absolute(l.get('href', ''), base_url)}" + for l in links[:80] # Begrenzen, um Token-Limit zu schonen + ) + user_content = f"Links von {domain}:\n{links_text}" + + async with httpx.AsyncClient(timeout=30.0) as client: + resp = await client.post( + f"{OPENROUTER_BASE}/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + "HTTP-Referer": base_url, + }, + json={ + "model": DEFAULT_MODEL, + "messages": [ + {"role": "system", "content": prompt}, + {"role": "user", "content": user_content}, + ], + "max_tokens": 256, + }, + ) + resp.raise_for_status() + data = resp.json() + choice = (data.get("choices") or [None])[0] + if not choice: + return None + content = (choice.get("message") or {}).get("content") or "" + if not content.strip(): + return None + + # JSON aus Antwort extrahieren (falls von Markdown umgeben) + content = content.strip() + json_match = re.search(r"\{[^{}]*\"url\"[^{}]*\}", content) + if json_match: + try: + obj = json.loads(json_match.group()) + return (obj.get("url") or "").strip() or None + except json.JSONDecodeError: + pass + try: + obj = json.loads(content) + return (obj.get("url") or "").strip() or None + except json.JSONDecodeError: + return None + + +async def get_publication_url(domain: str, *, api_key: str | None = None) -> dict[str, Any]: + """ + Hauptfunktion: Domain scannen, Links an OpenRouter senden, + gefundene Publikations-URL zurückgeben. 
+ """ + import os + key = api_key or os.getenv("OPENROUTER_API_KEY") + if not key: + return {"url": None, "error": "OPENROUTER_API_KEY nicht gesetzt"} + + try: + links = await _fetch_links_with_playwright(domain) + except Exception as e: + return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"} + + if not links: + return {"url": None, "error": "Keine Links auf der Seite gefunden"} + + try: + url = await _ask_openrouter(key, links, domain) + return {"url": url, "error": None} + except Exception as e: + return {"url": None, "error": f"OpenRouter-Fehler: {e!s}"}