Updated project structure and added initial configuration files.
This commit is contained in:
parent
bd0e602b09
commit
6e813daf69
3
.env.example
Normal file
3
.env.example
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
# OpenRouter API key for Scout (LLM-based evaluation of links)
# Create one at: https://openrouter.ai/keys
OPENROUTER_API_KEY=sk-or-v1-
|
||||||
13
.gitignore
vendored
Normal file
13
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
# Python
__pycache__/
*.py[cod]
*$py.class
.venv/
venv/

# Environment variables (IMPORTANT: holds the API key — never commit)
.env

# Docker & Playwright
.pytest_cache/
browser_data/
|
||||||
27
Dockerfile.worker
Normal file
27
Dockerfile.worker
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Scout module CIA: FastAPI + Playwright (browser) worker image
FROM python:3.12-slim

WORKDIR /app

# System packages needed by Playwright's Chromium build
# NOTE(review): `playwright install-deps chromium` below installs Chromium's
# system dependencies as well, so this list may be partially redundant — verify
# and trim once the image builds reliably.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 \
    libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 \
    libxrandr2 libgbm1 libasound2 libpango-1.0-0 libcairo2 \
    wget ca-certificates fonts-liberation \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install the Playwright browser (Chromium) and its OS dependencies
RUN playwright install chromium && playwright install-deps chromium

COPY src/ ./src/
WORKDIR /app/src

# Port exposed by the FastAPI app
EXPOSE 8000

# Unbuffered stdout/stderr so logs appear immediately in `docker logs`
ENV PYTHONUNBUFFERED=1
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
13
docker-compose.yml
Normal file
13
docker-compose.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
# CIA Scout module: FastAPI service on port 8000
services:
  scout:
    build:
      context: .
      dockerfile: Dockerfile.worker
    ports:
      - "8000:8000"
    # Load all variables from .env into the container
    env_file:
      - .env
    # NOTE(review): OPENROUTER_API_KEY is already provided via env_file above;
    # this explicit mapping additionally allows overriding from the host shell.
    environment:
      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
    restart: unless-stopped
|
||||||
8
requirements.txt
Normal file
8
requirements.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
# Scout module CIA - dependencies
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
playwright>=1.41.0
# NOTE(review): beautifulsoup4 is not imported by the visible sources — confirm
# it is used elsewhere or drop it.
beautifulsoup4>=4.12.0
httpx>=0.26.0
python-dotenv>=1.0.0
pydantic>=2.5.0
|
||||||
59
src/main.py
Normal file
59
src/main.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
"""
|
||||||
|
Scout-Modul CIA: FastAPI-Service zum Erkennen von Publikations-/Insights-URLs pro Domain.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from scout_logic import get_publication_url
|
||||||
|
|
||||||
|
# Umgebungsvariablen aus .env laden (OPENROUTER_API_KEY)
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="CIA Scout",
|
||||||
|
description="Erkennt Publikations-/Insights-Seiten von Beratungsdomains.",
|
||||||
|
version="0.1.0",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DiscoverRequest(BaseModel):
    """Request body for the /discover endpoint."""

    # Bare domain (e.g. "mckinsey.com") or a full URL; normalized downstream.
    domain: str = Field(..., min_length=1, description="Domain, z.B. mckinsey.com")
|
||||||
|
|
||||||
|
|
||||||
|
class DiscoverResponse(BaseModel):
    """Response body: the discovered publications URL, or an error message."""

    # Exactly one of `url`/`error` is expected to be set by the handler.
    url: str | None = Field(None, description="Gefundene absolute URL für Reports/Insights")
    error: str | None = Field(None, description="Fehlermeldung, falls kein Ergebnis")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def health():
|
||||||
|
"""Einfacher Health-Check."""
|
||||||
|
return {"status": "ok"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/discover", response_model=DiscoverResponse)
|
||||||
|
async def discover(body: DiscoverRequest) -> DiscoverResponse:
|
||||||
|
"""
|
||||||
|
Domain übergeben; Service scannt die Startseite mit Playwright,
|
||||||
|
extrahiert Links und lässt OpenRouter die beste Publikations-URL wählen.
|
||||||
|
"""
|
||||||
|
if not os.getenv("OPENROUTER_API_KEY"):
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=503,
|
||||||
|
detail="OPENROUTER_API_KEY nicht gesetzt (z.B. in .env)",
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await get_publication_url(body.domain)
|
||||||
|
return DiscoverResponse(url=result["url"], error=result.get("error"))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
141
src/scout_logic.py
Normal file
141
src/scout_logic.py
Normal file
|
|
@ -0,0 +1,141 @@
|
||||||
|
"""
|
||||||
|
Scout-Logik: Domain scannen, Links extrahieren, via OpenRouter Publikations-URL identifizieren.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
# OpenRouter Base-URL und Modell
|
||||||
|
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
|
||||||
|
DEFAULT_MODEL = "google/gemini-flash-1.5-8b"
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_links_with_playwright(domain: str) -> list[dict[str, str]]:
    """
    Load the domain's start page in headless Chromium and collect every
    anchor tag as a ``{"text": ..., "href": ...}`` dict.
    """
    # Normalize bare domains to an https:// URL.
    if domain.startswith("http"):
        target_url = domain
    else:
        target_url = f"https://{domain}"

    collected: list[dict[str, str]] = []
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(target_url, wait_until="domcontentloaded", timeout=15000)
            # Read text and href of every <a href="..."> in a single evaluate call.
            collected = await page.evaluate(
                """() => {
                    const anchors = document.querySelectorAll('a[href]');
                    return Array.from(anchors).map(a => ({
                        text: (a.textContent || '').trim().slice(0, 200),
                        href: a.getAttribute('href') || ''
                    })).filter(x => x.href);
                }"""
            )
        finally:
            # Always release the browser, even when goto/evaluate raises.
            await browser.close()

    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _make_absolute(href: str, base_url: str) -> str:
|
||||||
|
"""Macht relative URLs absolut (einfache Heuristik)."""
|
||||||
|
if not href or href.startswith("#"):
|
||||||
|
return ""
|
||||||
|
if href.startswith("http://") or href.startswith("https://"):
|
||||||
|
return href
|
||||||
|
base = base_url.rstrip("/")
|
||||||
|
if href.startswith("/"):
|
||||||
|
parsed = urlparse(base)
|
||||||
|
return f"{parsed.scheme}://{parsed.netloc}{href}"
|
||||||
|
return f"{base}/{href}"
|
||||||
|
|
||||||
|
|
||||||
|
async def _ask_openrouter(api_key: str, links: list[dict[str, str]], domain: str) -> str | None:
    """
    Send the link list to OpenRouter and ask the model for the best publications URL.

    Args:
        api_key: OpenRouter API key (used as a Bearer token).
        links: Extracted anchors as ``{"text": ..., "href": ...}`` dicts.
        domain: Domain (or URL) the links were scraped from.

    Returns:
        The URL string the model picked, or None when no usable answer
        could be parsed from the response.

    Raises:
        httpx.HTTPStatusError: On non-2xx responses from OpenRouter.
    """
    base_url = domain if domain.startswith("http") else f"https://{domain}"
    # BUGFIX: the prompt previously showed {'url': '...'} with single quotes,
    # which is invalid JSON and contradicts the double-quote-based parsing below.
    prompt = (
        "Analysiere diese Links einer Unternehmensberatung. "
        "Welcher Link führt zur Seite mit Reports, Insights oder Fachartikeln? "
        'Antworte NUR mit der absoluten URL im JSON-Format: {"url": "..."}'
    )
    links_text = "\n".join(
        f"- {link.get('text', '')} -> {_make_absolute(link.get('href', ''), base_url)}"
        for link in links[:80]  # cap the list to keep the prompt within token limits
    )
    user_content = f"Links von {domain}:\n{links_text}"

    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(
            f"{OPENROUTER_BASE}/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                # OpenRouter uses the referer for attribution.
                "HTTP-Referer": base_url,
            },
            json={
                "model": DEFAULT_MODEL,
                "messages": [
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": user_content},
                ],
                "max_tokens": 256,
            },
        )
        resp.raise_for_status()
        data = resp.json()

    choice = (data.get("choices") or [None])[0]
    if not choice:
        return None
    content = (choice.get("message") or {}).get("content") or ""
    if not content.strip():
        return None

    # Extract the JSON object from the answer (the model may wrap it in Markdown).
    content = content.strip()
    json_match = re.search(r"\{[^{}]*\"url\"[^{}]*\}", content)
    if json_match:
        try:
            obj = json.loads(json_match.group())
            return (obj.get("url") or "").strip() or None
        except json.JSONDecodeError:
            pass
    # Fallback: maybe the whole answer is the JSON object.
    try:
        obj = json.loads(content)
        return (obj.get("url") or "").strip() or None
    except json.JSONDecodeError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def get_publication_url(domain: str, *, api_key: str | None = None) -> dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Hauptfunktion: Domain scannen, Links an OpenRouter senden,
|
||||||
|
gefundene Publikations-URL zurückgeben.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
key = api_key or os.getenv("OPENROUTER_API_KEY")
|
||||||
|
if not key:
|
||||||
|
return {"url": None, "error": "OPENROUTER_API_KEY nicht gesetzt"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
links = await _fetch_links_with_playwright(domain)
|
||||||
|
except Exception as e:
|
||||||
|
return {"url": None, "error": f"Playwright/Scrape-Fehler: {e!s}"}
|
||||||
|
|
||||||
|
if not links:
|
||||||
|
return {"url": None, "error": "Keine Links auf der Seite gefunden"}
|
||||||
|
|
||||||
|
try:
|
||||||
|
url = await _ask_openrouter(key, links, domain)
|
||||||
|
return {"url": url, "error": None}
|
||||||
|
except Exception as e:
|
||||||
|
return {"url": None, "error": f"OpenRouter-Fehler: {e!s}"}
|
||||||
Loading…
Reference in New Issue
Block a user