integration openrouter

Author: Lars
Date: 2025-12-23 15:09:47 +01:00
parent 0ac8a14ea7
commit 2a98c37ca1
3 changed files with 113 additions and 129 deletions

app/config.py (state after this commit)

@@ -1,11 +1,9 @@

"""
FILE: app/config.py
DESCRIPTION: Central Pydantic configuration. Contains all parameters for Qdrant,
             local embeddings, Ollama, Google GenAI, and OpenRouter.
VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
STATUS: Active
"""
from __future__ import annotations
import os

@@ -13,38 +11,47 @@

from functools import lru_cache
from pathlib import Path

class Settings:
    # --- Qdrant database ---
    QDRANT_URL: str = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
    QDRANT_API_KEY: str | None = os.getenv("QDRANT_API_KEY")
    COLLECTION_PREFIX: str = os.getenv("MINDNET_PREFIX", "mindnet")
    VECTOR_SIZE: int = int(os.getenv("MINDNET_VECTOR_SIZE", "384"))
    DISTANCE: str = os.getenv("MINDNET_DISTANCE", "Cosine")

    # --- Local embeddings ---
    MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

    # --- WP-20 cloud hybrid mode (Google GenAI & OpenRouter) ---
    # Allowed: "ollama" | "gemini" | "openrouter"
    MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()

    # Google AI Studio (direct)
    GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
    GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
    GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")  # for ingestion speed

    # OpenRouter integration
    OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
    OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")

    LLM_FALLBACK_ENABLED: bool = os.getenv("MINDNET_LLM_FALLBACK", "true").lower() == "true"

    # --- WP-05 local LLM (Ollama) ---
    OLLAMA_URL: str = os.getenv("MINDNET_OLLAMA_URL", "http://127.0.0.1:11434")
    LLM_MODEL: str = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")
    PROMPTS_PATH: str = os.getenv("MINDNET_PROMPTS_PATH", "config/prompts.yaml")

    # --- WP-06 / WP-14 performance & load control ---
    LLM_TIMEOUT: float = float(os.getenv("MINDNET_LLM_TIMEOUT", "120.0"))
    DECISION_CONFIG_PATH: str = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
    BACKGROUND_LIMIT: int = int(os.getenv("MINDNET_LLM_BACKGROUND_LIMIT", "2"))

    # --- System paths ---
    DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
    MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
    MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")

    # --- WP-04 retriever weights (semantic vs. graph) ---
    RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
    RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
    RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
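
All of these switches are plain environment variables, so routing traffic to OpenRouter needs no code change. A minimal sketch of the relevant settings, assuming they are exported (or loaded from an .env file) before the app starts; the API key is a placeholder:

    # .env sketch -- values are placeholders
    MINDNET_LLM_PROVIDER=openrouter
    OPENROUTER_API_KEY=sk-or-...
    OPENROUTER_MODEL=google/gemma-2-9b-it:free
    MINDNET_LLM_FALLBACK=true   # cloud failures fall back to local Ollama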

app/services/llm_service.py (state after this commit)

@@ -1,20 +1,17 @@

"""
FILE: app/services/llm_service.py
DESCRIPTION: Hybrid client for Ollama, Google GenAI, and OpenRouter.
             Manages provider-specific prompts and background load.
VERSION: 3.3.0 (Full SDK Integration)
"""
import httpx
import yaml
import logging
import asyncio
import json
from google import genai
from google.genai import types
from openai import AsyncOpenAI  # for OpenRouter
from pathlib import Path
from typing import Optional, Dict, Any, Literal
from app.config import get_settings
@@ -22,122 +19,117 @@

logger = logging.getLogger(__name__)

class LLMService:
    _background_semaphore = None

    def __init__(self):
        self.settings = get_settings()
        self.prompts = self._load_prompts()

        # WP-06: initialize the class-level semaphore once
        if LLMService._background_semaphore is None:
            limit = self.settings.BACKGROUND_LIMIT
            logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
            LLMService._background_semaphore = asyncio.Semaphore(limit)

        # 1. Local Ollama client
        self.ollama_client = httpx.AsyncClient(
            base_url=self.settings.OLLAMA_URL,
            timeout=httpx.Timeout(self.settings.LLM_TIMEOUT)
        )

        # 2. Google GenAI client (modern SDK)
        self.google_client = None
        if self.settings.GOOGLE_API_KEY:
            self.google_client = genai.Client(api_key=self.settings.GOOGLE_API_KEY)
            logger.info("✨ LLMService: Google GenAI (Gemini) active.")

        # 3. OpenRouter client
        self.openrouter_client = None
        if self.settings.OPENROUTER_API_KEY:
            self.openrouter_client = AsyncOpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.settings.OPENROUTER_API_KEY
            )
            logger.info("🛰️ LLMService: OpenRouter integration active.")

    def _load_prompts(self) -> dict:
        path = Path(self.settings.PROMPTS_PATH)
        if not path.exists():
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {}
        except Exception as e:
            logger.error(f"Failed to load prompts: {e}")
            return {}

    def get_prompt(self, key: str, provider: Optional[str] = None) -> str:
        """Fetch a provider-specific template with a fallback cascade."""
        active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
        data = self.prompts.get(key, "")
        if isinstance(data, dict):
            # Try the provider key first, then fall back to the 'ollama' branch
            return data.get(active_provider, data.get("ollama", ""))
        return str(data)

    async def generate_raw_response(
        self, prompt: str, system: Optional[str] = None, force_json: bool = False,
        max_retries: int = 2, base_delay: float = 2.0,
        priority: Literal["realtime", "background"] = "realtime",
        provider: Optional[str] = None,
        model_override: Optional[str] = None
    ) -> str:
        """Entry point with priority handling."""
        target_provider = provider or self.settings.MINDNET_LLM_PROVIDER

        if priority == "background":
            async with LLMService._background_semaphore:
                return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
        return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)

    async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override):
        try:
            if provider == "openrouter" and self.openrouter_client:
                return await self._execute_openrouter(prompt, system, force_json, model_override)
            if provider == "gemini" and self.google_client:
                return await self._execute_google(prompt, system, force_json, model_override)
            return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
        except Exception as e:
            if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
                logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to Ollama.")
                return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
            raise e

    async def _execute_google(self, prompt, system, force_json, model_override):
        """Native Google SDK integration."""
        model = model_override or self.settings.GEMINI_MODEL
        config = types.GenerateContentConfig(
            system_instruction=system,
            response_mime_type="application/json" if force_json else "text/plain"
        )
        # Offload the synchronous SDK call to a worker thread
        response = await asyncio.to_thread(
            self.google_client.models.generate_content,
            model=model, contents=prompt, config=config
        )
        return response.text.strip()

    async def _execute_openrouter(self, prompt, system, force_json, model_override):
        """OpenRouter (OpenAI-compatible API)."""
        model = model_override or self.settings.OPENROUTER_MODEL
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        response = await self.openrouter_client.chat.completions.create(
            model=model,
            messages=messages,
            response_format={"type": "json_object"} if force_json else None
        )
        return response.choices[0].message.content.strip()

    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
        """Ollama with exponential backoff."""
        payload = {
            "model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
            "options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
        }
        if force_json: payload["format"] = "json"
        if system: payload["system"] = system
@@ -145,41 +137,23 @@

        attempt = 0
        while True:
            try:
                res = await self.ollama_client.post("/api/generate", json=payload)
                res.raise_for_status()
                return res.json().get("response", "").strip()
            except Exception as e:
                attempt += 1
                if attempt > max_retries: raise e
                wait = base_delay * (2 ** (attempt - 1))
                logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
                await asyncio.sleep(wait)

    async def generate_rag_response(self, query: str, context_str: str) -> str:
        """Full RAG wrapper using provider-specific templates."""
        provider = self.settings.MINDNET_LLM_PROVIDER
        system = self.get_prompt("system_prompt", provider)
        # Guard against a missing template, as the previous version did
        template = self.get_prompt("rag_template", provider) or "{context_str}\n\n{query}"
        final_prompt = template.format(context_str=context_str, query=query)
        return await self.generate_raw_response(final_prompt, system=system, priority="realtime")

    async def close(self):
        await self.ollama_client.aclose()
        # Also release the OpenRouter HTTP session if one was created
        if self.openrouter_client:
            await self.openrouter_client.close()
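
get_prompt expects config/prompts.yaml to hold, per key, either a flat string or a mapping keyed by provider, with the "ollama" branch as the fallback. A minimal sketch of such a file (the keys match the code above; the prompt texts are placeholders):

    # config/prompts.yaml (sketch)
    system_prompt:
      ollama: "You are a helpful assistant. Answer only from the given context."
      openrouter: "You are a helpful assistant. Answer only from the given context. Be brief."
    rag_template:
      ollama: |
        Context:
        {context_str}

        Question: {query}

A flat string per key also works; get_prompt then returns it unchanged for every provider.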

Requirements file (state after this commit)

@@ -37,4 +37,7 @@

streamlit-agraph>=0.0.45
st-cytoscape
# Google Gemini APIs (google-genai provides the `from google import genai` SDK used above)
google-genai>=1.0.0
google-generativeai>=0.8.3
# OpenAI client for OpenRouter
openai>=1.50.0
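
To exercise the new OpenRouter path end to end, a minimal smoke test, assuming the package layout from this commit and a valid OPENROUTER_API_KEY in the environment (the script itself is hypothetical, not part of the commit):

    # smoke_openrouter.py (sketch)
    import asyncio
    from app.services.llm_service import LLMService

    async def main():
        svc = LLMService()
        # provider / model_override mirror the generate_raw_response signature above
        text = await svc.generate_raw_response(
            "Reply with the single word: pong",
            provider="openrouter",
        )
        print(text)
        await svc.close()

    asyncio.run(main())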