From 2a98c37ca117a4dbe1f010d7db9fc1c3641be098 Mon Sep 17 00:00:00 2001
From: Lars
Date: Tue, 23 Dec 2025 15:09:47 +0100
Subject: [PATCH] OpenRouter integration

---
 app/config.py               |  33 +++---
 app/services/llm_service.py | 204 ++++++++++++++++--------------------
 requirements.txt            |   5 +-
 3 files changed, 116 insertions(+), 129 deletions(-)

diff --git a/app/config.py b/app/config.py
index 5774a53..2f1617b 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,11 +1,9 @@
 """
 FILE: app/config.py
-DESCRIPTION: Central Pydantic configuration (env vars for Qdrant, LLM, retriever).
-             Extended with WP-20 hybrid options.
-VERSION: 0.5.0
+DESCRIPTION: Central Pydantic configuration. Holds all parameters for Qdrant,
+             local embeddings, Ollama, Google GenAI, and OpenRouter.
+VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
 STATUS: Active
-DEPENDENCIES: os, functools, pathlib
-LAST_ANALYSIS: 2025-12-23
 """
 from __future__ import annotations
 import os
@@ -13,38 +11,47 @@ from functools import lru_cache
 from pathlib import Path
 
 class Settings:
-    # Qdrant connection
+    # --- Qdrant database ---
     QDRANT_URL: str = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
     QDRANT_API_KEY: str | None = os.getenv("QDRANT_API_KEY")
     COLLECTION_PREFIX: str = os.getenv("MINDNET_PREFIX", "mindnet")
     VECTOR_SIZE: int = int(os.getenv("MINDNET_VECTOR_SIZE", "384"))
     DISTANCE: str = os.getenv("MINDNET_DISTANCE", "Cosine")
 
-    # Embeddings (local)
+    # --- Local embeddings ---
     MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
 
-    # WP-20 hybrid LLM provider
-    # Allowed: "ollama" or "gemini"
+    # --- WP-20 cloud hybrid mode (Google GenAI & OpenRouter) ---
+    # Allowed: "ollama" | "gemini" | "openrouter"
     MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()
+
+    # Google AI Studio (direct)
     GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
     GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
+    GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")  # For ingestion speed
+
+    # OpenRouter integration
+    OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
+    OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
+    LLM_FALLBACK_ENABLED: bool = os.getenv("MINDNET_LLM_FALLBACK", "true").lower() == "true"
 
-    # WP-05 LLM / Ollama (local)
+    # --- WP-05 local LLM (Ollama) ---
     OLLAMA_URL: str = os.getenv("MINDNET_OLLAMA_URL", "http://127.0.0.1:11434")
     LLM_MODEL: str = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")
     PROMPTS_PATH: str = os.getenv("MINDNET_PROMPTS_PATH", "config/prompts.yaml")
 
-    # WP-06 / WP-14 performance & timeouts
+    # --- WP-06 / WP-14 performance & load control ---
     LLM_TIMEOUT: float = float(os.getenv("MINDNET_LLM_TIMEOUT", "120.0"))
     DECISION_CONFIG_PATH: str = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
     BACKGROUND_LIMIT: int = int(os.getenv("MINDNET_LLM_BACKGROUND_LIMIT", "2"))
 
-    # API & debugging
+    # --- System paths ---
     DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
     MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
+    MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
 
-    # WP-04 retriever weights (semantic vs. graph)
+    # --- WP-04 retriever weights (semantic vs. graph) ---
     RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
     RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
     RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
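For reference, a hypothetical .env sketch that would switch chat traffic to the new provider. The variable names are exactly the ones the Settings class reads above; the key value is a placeholder and the file name .env is an assumption (the class only reads process environment variables):

    MINDNET_LLM_PROVIDER=openrouter
    OPENROUTER_API_KEY=<your-openrouter-key>
    OPENROUTER_MODEL=google/gemma-2-9b-it:free
    MINDNET_LLM_FALLBACK=true                    # cloud errors fall back to local Ollama
    MINDNET_OLLAMA_URL=http://127.0.0.1:11434    # fallback target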
diff --git a/app/services/llm_service.py b/app/services/llm_service.py
index df4cdd1..a0f4ac5 100644
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@@ -1,20 +1,17 @@
 """
 FILE: app/services/llm_service.py
-DESCRIPTION: Hybrid client for Ollama & Google Gemini.
-             Manages prompts, background load (semaphore), and cloud routing.
-VERSION: 3.1.0 (WP-20 Full Integration: Provider-Aware Prompting)
-STATUS: Active
-DEPENDENCIES: httpx, yaml, asyncio, google-generativeai, app.config
-EXTERNAL_CONFIG: config/prompts.yaml
+DESCRIPTION: Hybrid client for Ollama, Google GenAI, and OpenRouter.
+             Manages provider-specific prompts and background load.
+VERSION: 3.3.0 (Full SDK Integration)
 """
-
 import httpx
 import yaml
 import logging
-import os
 import asyncio
 import json
-import google.generativeai as genai
+from google import genai
+from google.genai import types
+from openai import AsyncOpenAI, NOT_GIVEN  # for OpenRouter
 from pathlib import Path
 from typing import Optional, Dict, Any, Literal
 from app.config import get_settings
@@ -22,122 +19,118 @@ from app.config import get_settings
 logger = logging.getLogger(__name__)
 
 class LLMService:
-    # GLOBAL SEMAPHORE for background load control (WP-06 / WP-20)
     _background_semaphore = None
 
     def __init__(self):
         self.settings = get_settings()
         self.prompts = self._load_prompts()
 
-        # Initialize the semaphore once at class level
+        # WP-06: semaphore initialization
         if LLMService._background_semaphore is None:
-            limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
-            logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
+            limit = self.settings.BACKGROUND_LIMIT
+            logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
             LLMService._background_semaphore = asyncio.Semaphore(limit)
 
-        # Ollama setup
-        self.timeout = httpx.Timeout(self.settings.LLM_TIMEOUT, connect=10.0)
+        # 1. Local Ollama client
         self.ollama_client = httpx.AsyncClient(
             base_url=self.settings.OLLAMA_URL,
-            timeout=self.timeout
+            timeout=httpx.Timeout(self.settings.LLM_TIMEOUT)
         )
 
-        # Gemini setup [WP-20]
-        if hasattr(self.settings, "GOOGLE_API_KEY") and self.settings.GOOGLE_API_KEY:
-            genai.configure(api_key=self.settings.GOOGLE_API_KEY)
-            model_name = getattr(self.settings, "GEMINI_MODEL", "gemini-1.5-flash")
-            self.gemini_model = genai.GenerativeModel(model_name)
-            logger.info(f"✨ LLMService: Gemini Cloud Mode active ({model_name})")
-        else:
-            self.gemini_model = None
-            logger.warning("⚠️ LLMService: No GOOGLE_API_KEY found. Gemini mode disabled.")
+        # 2. Google GenAI client (modern SDK)
+        self.google_client = None
+        if self.settings.GOOGLE_API_KEY:
+            self.google_client = genai.Client(api_key=self.settings.GOOGLE_API_KEY)
+            logger.info("✨ LLMService: Google GenAI (Gemini) active.")
+        # 3. OpenRouter client
+        self.openrouter_client = None
+        if self.settings.OPENROUTER_API_KEY:
+            self.openrouter_client = AsyncOpenAI(
+                base_url="https://openrouter.ai/api/v1",
+                api_key=self.settings.OPENROUTER_API_KEY
+            )
+            logger.info("🛰️ LLMService: OpenRouter integration active.")
 
     def _load_prompts(self) -> dict:
-        """Load the prompt configuration from the YAML file."""
        path = Path(self.settings.PROMPTS_PATH)
         if not path.exists(): return {}
         try:
-            with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f)
+            with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {}
         except Exception as e:
             logger.error(f"Failed to load prompts: {e}")
             return {}
 
     def get_prompt(self, key: str, provider: str = None) -> str:
-        """
-        Selects the template based on the provider (WP-20).
-        Supports flat strings as well as dictionary-based provider branches.
-        """
-        active_provider = provider or getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama")
+        """Fetch the provider-specific template with a fallback cascade."""
+        active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
         data = self.prompts.get(key, "")
-
         if isinstance(data, dict):
-            # Try the provider key, fall back to 'ollama'
             return data.get(active_provider, data.get("ollama", ""))
         return str(data)
 
     async def generate_raw_response(
-        self,
-        prompt: str,
-        system: str = None,
-        force_json: bool = False,
-        max_retries: int = 2,
-        base_delay: float = 2.0,
+        self, prompt: str, system: str = None, force_json: bool = False,
+        max_retries: int = 2, base_delay: float = 2.0,
         priority: Literal["realtime", "background"] = "realtime",
-        provider: Optional[str] = None
+        provider: Optional[str] = None,
+        model_override: Optional[str] = None
     ) -> str:
-        """
-        Runs an LLM call with priority handling and provider selection.
-        """
- """ - # Bestimme Provider: Parameter-Override > Config-Default - target_provider = provider or getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama") + """Einstiegspunkt mit Priority-Handling.""" + target_provider = provider or self.settings.MINDNET_LLM_PROVIDER - use_semaphore = (priority == "background") - - if use_semaphore and LLMService._background_semaphore: + if priority == "background": async with LLMService._background_semaphore: - return await self._dispatch_request(target_provider, prompt, system, force_json, max_retries, base_delay) - else: - return await self._dispatch_request(target_provider, prompt, system, force_json, max_retries, base_delay) + return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override) + return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override) - async def _dispatch_request(self, provider, prompt, system, force_json, max_retries, base_delay): - """Routet die Anfrage an den gewählten Provider mit Fallback-Logik.""" + async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override): try: - if provider == "gemini" and self.gemini_model: - return await self._execute_gemini(prompt, system, force_json) - else: - return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) + if provider == "openrouter" and self.openrouter_client: + return await self._execute_openrouter(prompt, system, force_json, model_override) + if provider == "gemini" and self.google_client: + return await self._execute_google(prompt, system, force_json, model_override) + return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) except Exception as e: - # Automatischer Fallback auf Ollama bei Cloud-Fehlern (WP-20) - if provider == "gemini" and getattr(self.settings, "LLM_FALLBACK_ENABLED", True): - logger.warning(f"🔄 Gemini failed: {e}. Falling back to Ollama.") + if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama": + logger.warning(f"🔄 Provider {provider} failed: {e}. 
                 return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
             raise e
 
-    async def _execute_gemini(self, prompt, system, force_json) -> str:
-        """Asynchronous Google Gemini call (WP-20)."""
-        full_prompt = f"System: {system}\n\nUser: {prompt}" if system else prompt
-
-        # Gemini JSON mode support
-        gen_config = {}
-        if force_json:
-            gen_config["response_mime_type"] = "application/json"
-
-        response = await self.gemini_model.generate_content_async(
-            full_prompt,
-            generation_config=gen_config
+    async def _execute_google(self, prompt, system, force_json, model_override):
+        """Native Google SDK integration."""
+        model = model_override or self.settings.GEMINI_MODEL
+        config = types.GenerateContentConfig(
+            system_instruction=system,
+            response_mime_type="application/json" if force_json else "text/plain"
+        )
+        # Offload the synchronous SDK call to a worker thread
+        response = await asyncio.to_thread(
+            self.google_client.models.generate_content,
+            model=model, contents=prompt, config=config
         )
         return response.text.strip()
 
-    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay) -> str:
-        """Ollama call with exponential backoff retry logic."""
-        payload: Dict[str, Any] = {
-            "model": self.settings.LLM_MODEL,
-            "prompt": prompt,
-            "stream": False,
-            "options": {
-                "temperature": 0.1 if force_json else 0.7,
-                "num_ctx": 8192
-            }
+    async def _execute_openrouter(self, prompt, system, force_json, model_override):
+        """OpenRouter (OpenAI-compatible)."""
+        model = model_override or self.settings.OPENROUTER_MODEL
+        messages = []
+        if system: messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+
+        response = await self.openrouter_client.chat.completions.create(
+            model=model,
+            messages=messages,
+            # NOT_GIVEN (not None) so the request omits response_format entirely
+            response_format={"type": "json_object"} if force_json else NOT_GIVEN
+        )
+        return response.choices[0].message.content.strip()
+
+    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
+        """Ollama with exponential backoff."""
+        payload = {
+            "model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
+            "options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
         }
         if force_json: payload["format"] = "json"
         if system: payload["system"] = system
@@ -145,41 +138,25 @@ class LLMService:
         attempt = 0
         while True:
             try:
-                response = await self.ollama_client.post("/api/generate", json=payload)
-                if response.status_code == 200:
-                    return response.json().get("response", "").strip()
-                response.raise_for_status()
+                res = await self.ollama_client.post("/api/generate", json=payload)
+                res.raise_for_status()
+                return res.json().get("response", "").strip()
             except Exception as e:
                 attempt += 1
-                if attempt > max_retries:
-                    logger.error(f"Ollama Error after {attempt} retries: {e}")
-                    raise e
-                # Exponential backoff: base_delay * (2 ^ (attempt - 1))
-                wait_time = base_delay * (2 ** (attempt - 1))
-                logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
-                await asyncio.sleep(wait_time)
+                if attempt > max_retries: raise e
+                wait = base_delay * (2 ** (attempt - 1))
+                logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
+                await asyncio.sleep(wait)
 
     async def generate_rag_response(self, query: str, context_str: str) -> str:
-        """Standard RAG chat interface with provider-specific templates."""
-        provider = getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama")
-
-        # Fetch the templates via the new get_prompt method
-        system_prompt = self.get_prompt("system_prompt", provider)
-        rag_template = self.get_prompt("rag_template", provider)
-
-        # Fallback for the RAG template structure
-        if not rag_template:
-            rag_template = "{context_str}\n\n{query}"
-
-        final_prompt = rag_template.format(context_str=context_str, query=query)
-
-        return await self.generate_raw_response(
-            final_prompt,
-            system=system_prompt,
-            priority="realtime"
-        )
+        """Full RAG wrapper."""
+        provider = self.settings.MINDNET_LLM_PROVIDER
+        system = self.get_prompt("system_prompt", provider)
+        template = self.get_prompt("rag_template", provider)
+        if not template: template = "{context_str}\n\n{query}"  # guard: an empty template would format to an empty prompt
+        final_prompt = template.format(context_str=context_str, query=query)
+        return await self.generate_raw_response(final_prompt, system=system, priority="realtime")
 
     async def close(self):
-        """Close all open HTTP connections."""
-        if self.ollama_client:
-            await self.ollama_client.aclose()
\ No newline at end of file
+        if self.openrouter_client: await self.openrouter_client.close()
+        await self.ollama_client.aclose()
\ No newline at end of file
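As a quick smoke test of the new routing (a sketch, not part of the patch: it assumes the app package is importable, the environment above is configured, and a local Ollama instance is running for the fallback path):

    import asyncio
    from app.services.llm_service import LLMService

    async def main():
        svc = LLMService()
        # Explicit provider choice plus per-call model override; a cloud error
        # falls back to Ollama while LLM_FALLBACK_ENABLED is true.
        text = await svc.generate_raw_response(
            "Return a JSON object with a single key 'status'.",
            system="You are a terse assistant.",
            force_json=True,
            provider="openrouter",
            model_override="google/gemma-2-9b-it:free",
        )
        print(text)
        await svc.close()

    asyncio.run(main())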
diff --git a/requirements.txt b/requirements.txt
index 3e258e0..850ea6d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,4 +37,7 @@ streamlit-agraph>=0.0.45
 st-cytoscape
 
 # Google gemini API
-google-generativeai>=0.8.3
\ No newline at end of file
+google-genai>=1.0.0
+
+# OpenAI client for OpenRouter
+openai>=1.50.0
\ No newline at end of file
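Since get_prompt() accepts either a flat string or a provider-keyed mapping per prompt, a hypothetical config/prompts.yaml could look like this. Only the system_prompt and rag_template keys are read by generate_rag_response; the wording is illustrative, and the "ollama" branch doubles as the fallback for unknown providers:

    system_prompt:
      ollama: "You are a concise assistant."
      openrouter: "You are a concise assistant. Cite the supplied context."
    rag_template: |
      Context:
      {context_str}

      Question: {query}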