From 36fb27edf0a64aee0933e1f74807b74f61d92bda Mon Sep 17 00:00:00 2001
From: Lars
Date: Tue, 23 Dec 2025 15:55:06 +0100
Subject: [PATCH] Complete OpenRouter integration

---
 app/config.py               | 18 ++++----
 app/core/ingestion.py       | 32 +++++++------
 app/services/llm_service.py | 89 +++++++++++++++++++++++++------------
 config/prompts.yaml         | 10 ++++-
 4 files changed, 96 insertions(+), 53 deletions(-)

diff --git a/app/config.py b/app/config.py
index 2f1617b..df178ab 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,8 +1,8 @@
 """
 FILE: app/config.py
-DESCRIPTION: Central Pydantic configuration. Contains all parameters for Qdrant,
-    local embeddings, Ollama, Google GenAI, and OpenRouter.
-VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
+DESCRIPTION: Central Pydantic configuration. Contains parameters for Qdrant,
+    embeddings, Ollama, Google GenAI, and OpenRouter.
+VERSION: 0.6.0 (WP-20 Hybrid & OpenRouter Integration)
 STATUS: Active
 """
 from __future__ import annotations
@@ -21,16 +21,16 @@ class Settings:
 
     # --- Local Embeddings ---
     MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
-    # --- WP-20 Cloud Hybrid Mode (Google GenAI & OpenRouter) ---
-    # Allowed: "ollama" | "gemini" | "openrouter"
+    # --- WP-20 Hybrid LLM Provider ---
+    # Options: "ollama" | "gemini" | "openrouter"
     MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()
 
-    # Google AI Studio (direct)
+    # Google AI Studio
     GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
     GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
-    GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")  # for ingestion speed
+    GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")
 
-    # OpenRouter integration
+    # OpenRouter
     OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
     OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
@@ -51,7 +51,7 @@ class Settings:
     MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
     MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
 
-    # --- WP-04 retriever weights (semantics vs. graph) ---
+    # --- WP-04 retriever weights ---
     RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
     RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
     RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
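Note: the settings above are plain environment-driven attributes, so the provider can be switched per deployment without code changes. A minimal sketch of selecting the OpenRouter path (variable names as in the diff; the key and values are illustrative placeholders, and this assumes get_settings() reads os.environ as the defaults suggest):

    import os

    os.environ["MINDNET_LLM_PROVIDER"] = "openrouter"
    os.environ["OPENROUTER_API_KEY"] = "sk-or-example"            # placeholder, not a real key
    os.environ["OPENROUTER_MODEL"] = "google/gemma-2-9b-it:free"  # default from the diff

    from app.config import get_settings

    settings = get_settings()
    assert settings.MINDNET_LLM_PROVIDER == "openrouter"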
diff --git a/app/core/ingestion.py b/app/core/ingestion.py
index 1fbbf5e..e042de2 100644
--- a/app/core/ingestion.py
+++ b/app/core/ingestion.py
@@ -5,7 +5,7 @@ DESCRIPTION: Main ingestion logic. Transforms Markdown into the graph (Notes
 WP-22: Integration of content lifecycle (status gate) and edge registry validation.
 WP-22: Context-sensitive edge validation with location reporting (line numbers).
 WP-22: Multi-hash refresh for consistent change detection.
-VERSION: 2.11.0 (WP-20 Full Integration: Hybrid Smart Edges)
+VERSION: 2.11.1 (WP-20 Quota Protection: OpenRouter Priority)
 STATUS: Active
 DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
 EXTERNAL_CONFIG: config/types.yaml, config/prompts.yaml
@@ -111,7 +111,7 @@ class IngestionService:
         self.dim = self.cfg.dim if hasattr(self.cfg, 'dim') else self.settings.VECTOR_SIZE
         self.registry = load_type_registry()
         self.embedder = EmbeddingsClient()
-        self.llm = LLMService()  # WP-20 integration
+        self.llm = LLMService()
 
         # Change detection mode (full or body)
         self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
@@ -135,32 +135,36 @@ class IngestionService:
     async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
         """
         WP-20: Uses the hybrid LLM service for semantic edge extraction.
-        Uses provider-specific prompts from the config.
+        QUOTA PROTECTION: Prioritizes OpenRouter (Gemma) to spare Gemini's daily limits.
         """
-        # We prioritize Gemma for ingestion when available (OpenRouter/cloud)
-        model = getattr(self.settings, "GEMMA_MODEL", None)
-        provider = self.settings.MINDNET_LLM_PROVIDER
+        # Determine the ingestion provider (OpenRouter preferred if a key is present)
+        provider = "openrouter" if getattr(self.settings, "OPENROUTER_API_KEY", None) else self.settings.MINDNET_LLM_PROVIDER
 
-        template = self.llm.get_prompt("edge_extraction")
+        # Use the Gemma model for high ingestion quotas (14.4K RPD) via OpenRouter or Google
+        model = getattr(self.settings, "GEMMA_MODEL", None)
+
+        # Fetch the prompt from the YAML (cascade: provider -> gemini -> ollama)
+        template = self.llm.get_prompt("edge_extraction", provider)
         prompt = template.format(text=text[:6000], note_id=note_id)
 
         try:
-            # Background task with semaphore
+            # Background task with semaphore via LLMService
            response_json = await self.llm.generate_raw_response(
                 prompt=prompt,
                 priority="background",
                 force_json=True,
+                provider=provider,
                 model_override=model
             )
             data = json.loads(response_json)
 
-            # Provenance for the EdgeRegistry
+            # Provenance for the EdgeRegistry documentation
             for item in data:
                 item["provenance"] = "semantic_ai"
                 item["line"] = f"ai-{provider}"
             return data
         except Exception as e:
-            logger.warning(f"Smart Edge Allocation skipped for {note_id}: {e}")
+            logger.warning(f"Smart Edge Allocation failed for {note_id} on {provider}: {e}")
             return []
 
     async def process_file(
@@ -214,7 +218,7 @@
             logger.error(f"Payload build failed: {e}")
             return {**result, "error": f"Payload build failed: {str(e)}"}
 
-        # 4. Change Detection
+        # 4. Change Detection (Multi-Hash)
         old_payload = None
         if not force_replace:
             old_payload = self._fetch_note_payload(note_id)
@@ -255,7 +259,7 @@
             edges = []
             context = {"file": file_path, "note_id": note_id}
 
-            # A. Explicit user edges
+            # A. Explicit user edges (wiki links)
             explicit_edges = extract_edges_with_context(parsed)
             for e in explicit_edges:
                 e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")})
@@ -267,7 +271,7 @@
                 e["kind"] = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
                 edges.append(e)
 
-            # C. System edges
+            # C. System edges (graph structure)
             try:
                 raw_system_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
             except TypeError:
@@ -282,7 +286,7 @@
             logger.error(f"Processing failed: {e}", exc_info=True)
             return {**result, "error": f"Processing failed: {str(e)}"}
 
-        # 6. Upsert
+        # 6. Upsert into Qdrant
         try:
             if purge_before and has_old:
                 self._purge_artifacts(note_id)
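Note: _perform_smart_edge_allocation assumes the model returns a bare JSON array of edge objects; anything else trips json.loads and lands in the warning branch. A small sketch of the expected round trip (the edge shape is taken from the edge_allocation_template further below; the target note id is invented):

    import json

    # Example of what the prompt asks the model to emit
    response_json = '[{"to": "projects/mindnet", "kind": "relates_to", "reason": "shared scope"}]'

    data = json.loads(response_json)
    for item in data:
        item["provenance"] = "semantic_ai"   # stamped exactly as in the diff
        item["line"] = "ai-openrouter"       # records which provider produced the edge

    print(data[0])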
diff --git a/app/services/llm_service.py b/app/services/llm_service.py
index a0f4ac5..ecb30c4 100644
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@@ -1,8 +1,10 @@
 """
 FILE: app/services/llm_service.py
-DESCRIPTION: Hybrid client for Ollama, Google GenAI, and OpenRouter.
+DESCRIPTION: Hybrid client for Ollama, Google GenAI (Gemini), and OpenRouter.
     Manages provider-specific prompts and background load.
+    WP-20: Optimized fallback management to protect cloud quotas.
-VERSION: 3.3.0 (Full SDK Integration)
+VERSION: 3.3.1
+STATUS: Active
 """
 import httpx
 import yaml
@@ -11,7 +13,7 @@ import asyncio
 import json
 from google import genai
 from google.genai import types
-from openai import AsyncOpenAI  # for OpenRouter
+from openai import AsyncOpenAI  # for OpenRouter (OpenAI-compatible)
 from pathlib import Path
 from typing import Optional, Dict, Any, Literal
 from app.config import get_settings
@@ -19,16 +21,17 @@ from app.config import get_settings
 logger = logging.getLogger(__name__)
 
 class LLMService:
+    # GLOBAL SEMAPHORE for background load throttling (WP-06)
     _background_semaphore = None
 
     def __init__(self):
         self.settings = get_settings()
         self.prompts = self._load_prompts()
 
-        # WP-06: semaphore initialization
+        # Initialize the semaphore once at class level
         if LLMService._background_semaphore is None:
-            limit = self.settings.BACKGROUND_LIMIT
-            logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
+            limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
+            logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
             LLMService._background_semaphore = asyncio.Semaphore(limit)
 
         # 1. Local Ollama client
@@ -53,6 +56,7 @@ class LLMService:
             logger.info("🛰️ LLMService: OpenRouter Integration active.")
 
     def _load_prompts(self) -> dict:
+        """Load the prompt configuration from the YAML file."""
         path = Path(self.settings.PROMPTS_PATH)
         if not path.exists(): return {}
         try:
@@ -62,11 +66,16 @@
             return {}
 
     def get_prompt(self, key: str, provider: str = None) -> str:
-        """Fetch the provider-specific template with a fallback cascade."""
+        """
+        Fetch a provider-specific template with an intelligent text cascade.
+        NOTE: This is only a text lookup and consumes no API quota.
+        Cascade: selected provider -> Gemini (cloud style) -> Ollama (base style).
+        """
         active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
         data = self.prompts.get(key, "")
         if isinstance(data, dict):
-            return data.get(active_provider, data.get("ollama", ""))
+            # Try the provider first, then Gemini (similarly capable), then Ollama
+            return data.get(active_provider, data.get("gemini", data.get("ollama", "")))
         return str(data)
 
     async def generate_raw_response(
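Note: the cascade is a pure dictionary lookup, so a missing provider key degrades gracefully instead of returning an empty template. A worked example with a toy prompts dict (keys mirror the YAML layout; the strings are invented):

    prompts = {
        "rag_template": {
            "gemini": "Gemini style: {query}",
            "ollama": "Base style: {query}",
            # no "openrouter" entry on purpose
        }
    }

    data = prompts["rag_template"]
    # Same expression as in get_prompt(): "openrouter" is missing, so the
    # lookup falls through to the gemini variant before trying ollama.
    resolved = data.get("openrouter", data.get("gemini", data.get("ollama", "")))
    print(resolved)  # Gemini style: {query}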
+ """ active_provider = provider or self.settings.MINDNET_LLM_PROVIDER data = self.prompts.get(key, "") if isinstance(data, dict): - return data.get(active_provider, data.get("ollama", "")) + # Wir versuchen erst den Provider, dann Gemini (weil ähnlich leistungsfähig), dann Ollama + return data.get(active_provider, data.get("gemini", data.get("ollama", ""))) return str(data) async def generate_raw_response( @@ -76,35 +85,43 @@ class LLMService: provider: Optional[str] = None, model_override: Optional[str] = None ) -> str: - """Einstiegspunkt mit Priority-Handling.""" + """Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung.""" target_provider = provider or self.settings.MINDNET_LLM_PROVIDER if priority == "background": async with LLMService._background_semaphore: return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override) + return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override) async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override): + """Routet die Anfrage an den physikalischen API-Provider.""" try: if provider == "openrouter" and self.openrouter_client: return await self._execute_openrouter(prompt, system, force_json, model_override) + if provider == "gemini" and self.google_client: return await self._execute_google(prompt, system, force_json, model_override) + + # Default/Fallback zu Ollama return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) + except Exception as e: + # QUOTEN-SCHUTZ: Wenn Cloud (OpenRouter/Gemini) fehlschlägt, + # gehen wir IMMER zu Ollama, niemals von OpenRouter zu Gemini. if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama": - logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to Ollama.") + logger.warning(f"🔄 Provider {provider} failed: {e}. 
@@ -112,10 +129,11 @@
         return response.text.strip()
 
     async def _execute_openrouter(self, prompt, system, force_json, model_override):
-        """OpenRouter (OpenAI-compatible)."""
-        model = model_override or self.settings.OPENROUTER_MODEL
+        """OpenRouter API integration (OpenAI-compatible)."""
+        model = model_override or getattr(self.settings, "OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
         messages = []
-        if system: messages.append({"role": "system", "content": system})
+        if system:
+            messages.append({"role": "system", "content": system})
         messages.append({"role": "user", "content": prompt})
 
         response = await self.openrouter_client.chat.completions.create(
@@ -126,10 +144,15 @@
         return response.choices[0].message.content.strip()
 
     async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
-        """Ollama with exponential backoff."""
+        """Local Ollama call with exponential backoff."""
         payload = {
-            "model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
-            "options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
+            "model": self.settings.LLM_MODEL,
+            "prompt": prompt,
+            "stream": False,
+            "options": {
+                "temperature": 0.1 if force_json else 0.7,
+                "num_ctx": 8192
+            }
         }
         if force_json: payload["format"] = "json"
         if system: payload["system"] = system
@@ -142,18 +165,28 @@
                 return res.json().get("response", "").strip()
             except Exception as e:
                 attempt += 1
-                if attempt > max_retries: raise e
-                wait = base_delay * (2 ** (attempt - 1))
-                logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
-                await asyncio.sleep(wait)
+                if attempt > max_retries:
+                    logger.error(f"Ollama Error after {attempt} retries: {e}")
+                    raise e
+                wait_time = base_delay * (2 ** (attempt - 1))
+                logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
+                await asyncio.sleep(wait_time)
 
     async def generate_rag_response(self, query: str, context_str: str) -> str:
-        """Complete RAG wrapper."""
+        """Complete RAG chat interface."""
         provider = self.settings.MINDNET_LLM_PROVIDER
-        system = self.get_prompt("system_prompt", provider)
-        template = self.get_prompt("rag_template", provider)
-        final_prompt = template.format(context_str=context_str, query=query)
-        return await self.generate_raw_response(final_prompt, system=system, priority="realtime")
+        system_prompt = self.get_prompt("system_prompt", provider)
+        rag_template = self.get_prompt("rag_template", provider)
+
+        final_prompt = rag_template.format(context_str=context_str, query=query)
+
+        return await self.generate_raw_response(
+            final_prompt,
+            system=system_prompt,
+            priority="realtime"
+        )
 
     async def close(self):
-        await self.ollama_client.aclose()
\ No newline at end of file
+        """Close the HTTP connections."""
+        if self.ollama_client:
+            await self.ollama_client.aclose()
\ No newline at end of file
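Note: the retry loop doubles the wait on every attempt. With a base_delay of 1.0 (the defaults are defined outside the visible hunks, so treat the value as an assumption) the waits grow 1s, 2s, 4s:

    base_delay = 1.0  # assumed default; set outside the visible hunks
    for attempt in range(1, 4):
        wait_time = base_delay * (2 ** (attempt - 1))
        print(f"attempt {attempt}: wait {wait_time}s")
    # attempt 1: wait 1.0s
    # attempt 2: wait 2.0s
    # attempt 3: wait 4.0s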
diff --git a/config/prompts.yaml b/config/prompts.yaml
index 5574383..f52d0bc 100644
--- a/config/prompts.yaml
+++ b/config/prompts.yaml
@@ -32,6 +32,8 @@ rag_template:
     Analyze this context from my digital twin: {context_str}
     Answer the request in detail and check for contradictions: {query}
+  openrouter: "Context analysis for Gemma/Llama: {context_str}\n\nRequest: {query}"
+
 
 # ---------------------------------------------------------
 # 2. DECISION: Strategy & Trade-offs (Intent: DECISION)
 # ---------------------------------------------------------
@@ -59,7 +61,7 @@ decision_template:
   gemini: |
     Act as a senior strategy consultant. Use the context {context_str}
     to weigh the question {query} in depth against my long-term goals.
-
+  openrouter: "Strategic check (OpenRouter): {query}\n\nReference data: {context_str}"
 # ---------------------------------------------------------
 # 3. EMPATHY: The Mirror / "I" Mode (Intent: EMPATHY)
 # ---------------------------------------------------------
@@ -178,4 +180,8 @@ edge_allocation_template:
     Extract semantic edges for the graph ({note_id}).
     Also find implicit connections.
     JSON: [{"to": "X", "kind": "Y", "reason": "Z"}].
-    TEXT: {text}
\ No newline at end of file
+    TEXT: {text}
+  openrouter: |
+    Analyze the text for the graph. Identify semantic connections.
+    Output JSON: [{"to": "X", "kind": "Y"}].
+    Text: {text}
\ No newline at end of file
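Note: two things worth double-checking before merging. First, ingestion requests the "edge_extraction" prompt key while this file defines "edge_allocation_template"; unless an edge_extraction key exists elsewhere in the file, the new openrouter variant above is never reached. Second, ingestion renders its template with str.format(), and in a format string literal JSON braces must be doubled or format() raises KeyError. A minimal sketch of that pitfall (illustrative strings only):

    template_ok = 'Output JSON: [{{"to": "X", "kind": "Y"}}]. Text: {text}'
    print(template_ok.format(text="note body"))
    # Output JSON: [{"to": "X", "kind": "Y"}]. Text: note body

    template_broken = 'Output JSON: [{"to": "X", "kind": "Y"}]. Text: {text}'
    try:
        template_broken.format(text="note body")
    except KeyError as e:
        print("KeyError:", e)  # the literal {"to" ...} is parsed as a replacement field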