komplette openrouter integration

This commit is contained in:
Lars 2025-12-23 15:55:06 +01:00
parent c60aba63a4
commit 36fb27edf0
4 changed files with 96 additions and 53 deletions

View File

@ -1,8 +1,8 @@
"""
FILE: app/config.py
DESCRIPTION: Zentrale Pydantic-Konfiguration. Enthält alle Parameter für Qdrant,
lokale Embeddings, Ollama, Google GenAI und OpenRouter.
VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
DESCRIPTION: Zentrale Pydantic-Konfiguration. Enthält Parameter für Qdrant,
Embeddings, Ollama, Google GenAI und OpenRouter.
VERSION: 0.6.0 (WP-20 Hybrid & OpenRouter Integration)
STATUS: Active
"""
from __future__ import annotations
@ -21,16 +21,16 @@ class Settings:
# --- Lokale Embeddings ---
MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# --- WP-20 Cloud Hybrid Mode (Google GenAI & OpenRouter) ---
# Erlaubt: "ollama" | "gemini" | "openrouter"
# --- WP-20 Hybrid LLM Provider ---
# Optionen: "ollama" | "gemini" | "openrouter"
MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()
# Google AI Studio (Direkt)
# Google AI Studio
GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it") # Für Ingestion-Speed
GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")
# OpenRouter Integration
# OpenRouter
OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
@ -51,7 +51,7 @@ class Settings:
MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
# --- WP-04 Retriever Gewichte (Semantik vs. Graph) ---
# --- WP-04 Retriever Gewichte ---
RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))

View File

@ -5,7 +5,7 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes
WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation.
WP-22: Kontextsensitive Kanten-Validierung mit Fundort-Reporting (Zeilennummern).
WP-22: Multi-Hash Refresh für konsistente Change Detection.
VERSION: 2.11.0 (WP-20 Full Integration: Hybrid Smart Edges)
VERSION: 2.11.1 (WP-20 Quota Protection: OpenRouter Priority)
STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
EXTERNAL_CONFIG: config/types.yaml, config/prompts.yaml
@ -111,7 +111,7 @@ class IngestionService:
self.dim = self.cfg.dim if hasattr(self.cfg, 'dim') else self.settings.VECTOR_SIZE
self.registry = load_type_registry()
self.embedder = EmbeddingsClient()
self.llm = LLMService() # WP-20 Integration
self.llm = LLMService()
# Change Detection Modus (full oder body)
self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
@ -135,32 +135,36 @@ class IngestionService:
async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
"""
WP-20: Nutzt den Hybrid LLM Service für die semantische Kanten-Extraktion.
Verwendet provider-spezifische Prompts aus der config.
QUOTEN-SCHUTZ: Priorisiert OpenRouter (Gemma), um Gemini-Tageslimits zu schonen.
"""
# Wir priorisieren Gemma für Ingestion, falls verfügbar (OpenRouter/Cloud)
model = getattr(self.settings, "GEMMA_MODEL", None)
provider = self.settings.MINDNET_LLM_PROVIDER
# Bestimme den Provider für die Ingestion (OpenRouter bevorzugt, falls Key vorhanden)
provider = "openrouter" if getattr(self.settings, "OPENROUTER_API_KEY", None) else self.settings.MINDNET_LLM_PROVIDER
template = self.llm.get_prompt("edge_extraction")
# Nutze Gemma-Modell für hohe Ingestion-Quoten (14.4K RPD) via OpenRouter oder Google
model = getattr(self.settings, "GEMMA_MODEL", None)
# Hole Prompt aus der YAML (Kaskade: Provider -> gemini -> ollama)
template = self.llm.get_prompt("edge_extraction", provider)
prompt = template.format(text=text[:6000], note_id=note_id)
try:
# Hintergrund-Task mit Semaphore
# Hintergrund-Task mit Semaphore via LLMService
response_json = await self.llm.generate_raw_response(
prompt=prompt,
priority="background",
force_json=True,
provider=provider,
model_override=model
)
data = json.loads(response_json)
# Provenance für die EdgeRegistry
# Provenance für die EdgeRegistry Dokumentation
for item in data:
item["provenance"] = "semantic_ai"
item["line"] = f"ai-{provider}"
return data
except Exception as e:
logger.warning(f"Smart Edge Allocation skipped for {note_id}: {e}")
logger.warning(f"Smart Edge Allocation failed for {note_id} on {provider}: {e}")
return []
async def process_file(
@ -214,7 +218,7 @@ class IngestionService:
logger.error(f"Payload build failed: {e}")
return {**result, "error": f"Payload build failed: {str(e)}"}
# 4. Change Detection
# 4. Change Detection (Multi-Hash)
old_payload = None
if not force_replace:
old_payload = self._fetch_note_payload(note_id)
@ -255,7 +259,7 @@ class IngestionService:
edges = []
context = {"file": file_path, "note_id": note_id}
# A. Explizite User-Kanten
# A. Explizite User-Kanten (Wiki-Links)
explicit_edges = extract_edges_with_context(parsed)
for e in explicit_edges:
e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")})
@ -267,7 +271,7 @@ class IngestionService:
e["kind"] = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
edges.append(e)
# C. System-Kanten
# C. System-Kanten (Graph-Struktur)
try:
raw_system_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
except TypeError:
@ -282,7 +286,7 @@ class IngestionService:
logger.error(f"Processing failed: {e}", exc_info=True)
return {**result, "error": f"Processing failed: {str(e)}"}
# 6. Upsert
# 6. Upsert in Qdrant
try:
if purge_before and has_old: self._purge_artifacts(note_id)

View File

@ -1,8 +1,10 @@
"""
FILE: app/services/llm_service.py
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI und OpenRouter.
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
Verwaltet provider-spezifische Prompts und Background-Last.
VERSION: 3.3.0 (Full SDK Integration)
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
VERSION: 3.3.1
STATUS: Active
"""
import httpx
import yaml
@ -11,7 +13,7 @@ import asyncio
import json
from google import genai
from google.genai import types
from openai import AsyncOpenAI # Für OpenRouter
from openai import AsyncOpenAI # Für OpenRouter (OpenAI-kompatibel)
from pathlib import Path
from typing import Optional, Dict, Any, Literal
from app.config import get_settings
@ -19,16 +21,17 @@ from app.config import get_settings
logger = logging.getLogger(__name__)
class LLMService:
# GLOBALER SEMAPHOR für Hintergrund-Last Steuerung (WP-06)
_background_semaphore = None
def __init__(self):
self.settings = get_settings()
self.prompts = self._load_prompts()
# WP-06: Semaphore-Initialisierung
# Initialisiere Semaphore einmalig auf Klassen-Ebene
if LLMService._background_semaphore is None:
limit = self.settings.BACKGROUND_LIMIT
logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
LLMService._background_semaphore = asyncio.Semaphore(limit)
# 1. Lokaler Ollama Client
@ -53,6 +56,7 @@ class LLMService:
logger.info("🛰️ LLMService: OpenRouter Integration active.")
def _load_prompts(self) -> dict:
"""Lädt die Prompt-Konfiguration aus der YAML-Datei."""
path = Path(self.settings.PROMPTS_PATH)
if not path.exists(): return {}
try:
@ -62,11 +66,16 @@ class LLMService:
return {}
def get_prompt(self, key: str, provider: Optional[str] = None) -> str:
    """Return the prompt template stored under ``key`` for a provider.

    This is a pure lookup in the loaded prompt configuration; no API call
    is made. For provider-keyed entries the lookup order is: the requested
    provider, then ``"gemini"``, then ``"ollama"``. Entries that are missing
    or empty are skipped so the cascade continues.

    Args:
        key: Name of the prompt entry.
        provider: Provider whose variant is preferred. Defaults to the
            configured ``MINDNET_LLM_PROVIDER``.

    Returns:
        The template text, or ``""`` when no usable template exists.
    """
    active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
    entry = self.prompts.get(key)

    if entry is None:
        # Missing key or an explicit null in the YAML: no template.
        return ""
    if not isinstance(entry, dict):
        # Provider-independent prompt stored as a plain value.
        return str(entry)

    for candidate in (active_provider, "gemini", "ollama"):
        template = entry.get(candidate)
        if template:
            return str(template)
    return ""
async def generate_raw_response(
@ -76,35 +85,43 @@ class LLMService:
provider: Optional[str] = None,
model_override: Optional[str] = None
) -> str:
"""Einstiegspunkt mit Priority-Handling."""
"""Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung."""
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
if priority == "background":
async with LLMService._background_semaphore:
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override):
"""Routet die Anfrage an den physikalischen API-Provider."""
try:
if provider == "openrouter" and self.openrouter_client:
return await self._execute_openrouter(prompt, system, force_json, model_override)
if provider == "gemini" and self.google_client:
return await self._execute_google(prompt, system, force_json, model_override)
# Default/Fallback zu Ollama
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
except Exception as e:
# QUOTEN-SCHUTZ: Wenn Cloud (OpenRouter/Gemini) fehlschlägt,
# gehen wir IMMER zu Ollama, niemals von OpenRouter zu Gemini.
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to Ollama.")
logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to LOCAL OLLAMA to protect cloud quotas.")
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
raise e
async def _execute_google(self, prompt, system, force_json, model_override):
"""Native Google SDK Integration."""
"""Native Google SDK Integration (Gemini)."""
model = model_override or self.settings.GEMINI_MODEL
config = types.GenerateContentConfig(
system_instruction=system,
response_mime_type="application/json" if force_json else "text/plain"
)
# Synchroner SDK-Call in Thread auslagern
# SDK Call in Thread auslagern, da die Google API blocking sein kann
response = await asyncio.to_thread(
self.google_client.models.generate_content,
model=model, contents=prompt, config=config
@ -112,10 +129,11 @@ class LLMService:
return response.text.strip()
async def _execute_openrouter(self, prompt, system, force_json, model_override):
"""OpenRouter (OpenAI-kompatibel)."""
model = model_override or self.settings.OPENROUTER_MODEL
"""OpenRouter API Integration (OpenAI-kompatibel)."""
model = model_override or getattr(self.settings, "OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
messages = []
if system: messages.append({"role": "system", "content": system})
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
response = await self.openrouter_client.chat.completions.create(
@ -126,10 +144,15 @@ class LLMService:
return response.choices[0].message.content.strip()
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
"""Ollama mit exponentiellem Backoff."""
"""Lokaler Ollama Call mit exponentiellem Backoff."""
payload = {
"model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
"options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
"model": self.settings.LLM_MODEL,
"prompt": prompt,
"stream": False,
"options": {
"temperature": 0.1 if force_json else 0.7,
"num_ctx": 8192
}
}
if force_json: payload["format"] = "json"
if system: payload["system"] = system
@ -142,18 +165,28 @@ class LLMService:
return res.json().get("response", "").strip()
except Exception as e:
attempt += 1
if attempt > max_retries: raise e
wait = base_delay * (2 ** (attempt - 1))
logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
await asyncio.sleep(wait)
if attempt > max_retries:
logger.error(f"Ollama Error after {attempt} retries: {e}")
raise e
wait_time = base_delay * (2 ** (attempt - 1))
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
await asyncio.sleep(wait_time)
async def generate_rag_response(self, query: str, context_str: str) -> str:
    """Answer a chat query with retrieved context (RAG entry point).

    Looks up the system prompt and RAG template for the configured
    provider, fills in context and query, and sends the result as a
    realtime request.

    Args:
        query: The user's question.
        context_str: Retrieved context to ground the answer.

    Returns:
        The response text from ``generate_raw_response``.
    """
    provider = self.settings.MINDNET_LLM_PROVIDER
    system_prompt = self.get_prompt("system_prompt", provider)
    rag_template = self.get_prompt("rag_template", provider)

    if rag_template:
        final_prompt = rag_template.format(context_str=context_str, query=query)
    else:
        # No template available: formatting an empty template would send an
        # empty prompt and silently drop the query. Send context and query
        # verbatim instead.
        final_prompt = f"{context_str}\n\n{query}"

    return await self.generate_raw_response(
        final_prompt,
        system=system_prompt,
        priority="realtime",
    )
async def close(self):
    """Close the HTTP clients owned by this service.

    Closes the Ollama HTTP client and, when present, the OpenRouter client,
    which keeps its own connection pool. Clients that were never created
    are skipped.
    """
    ollama = getattr(self, "ollama_client", None)
    openrouter = getattr(self, "openrouter_client", None)
    try:
        if ollama:
            await ollama.aclose()
    finally:
        # Release the OpenRouter pool even if closing Ollama failed.
        if openrouter:
            await openrouter.close()

View File

@ -32,6 +32,8 @@ rag_template:
Analysiere diesen Kontext meines digitalen Zwillings:
{context_str}
Beantworte die Anfrage detailliert und prüfe auf Widersprüche: {query}
openrouter: "Kontext-Analyse für Gemma/Llama: {context_str}\n\nAnfrage: {query}"
# ---------------------------------------------------------
# 2. DECISION: Strategie & Abwägung (Intent: DECISION)
@ -59,7 +61,7 @@ decision_template:
gemini: |
Agiere als Senior Strategy Consultant. Nutze den Kontext {context_str}, um die Frage {query}
tiefgreifend gegen meine langfristigen Ziele abzuwägen.
openrouter: "Strategischer Check (OpenRouter): {query}\n\nReferenzdaten: {context_str}"
# ---------------------------------------------------------
# 3. EMPATHY: Der Spiegel / "Ich"-Modus (Intent: EMPATHY)
# ---------------------------------------------------------
@ -179,3 +181,7 @@ edge_allocation_template:
Finde auch implizite Verbindungen.
JSON: [{"to": "X", "kind": "Y", "reason": "Z"}].
TEXT: {text}
openrouter: |
Analysiere den Text für den Graphen. Identifiziere semantische Verbindungen.
Output JSON: [{"to": "X", "kind": "Y"}].
Text: {text}