Integrate OpenRouter

Lars 2025-12-23 15:09:47 +01:00
parent 0ac8a14ea7
commit 2a98c37ca1
3 changed files with 113 additions and 129 deletions

app/config.py View File

@@ -1,11 +1,9 @@
"""
FILE: app/config.py
DESCRIPTION: Central Pydantic configuration (env vars for Qdrant, LLM, retriever).
Extended with the WP-20 hybrid options.
VERSION: 0.5.0
DESCRIPTION: Central Pydantic configuration. Contains all parameters for Qdrant,
local embeddings, Ollama, Google GenAI, and OpenRouter.
VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
STATUS: Active
DEPENDENCIES: os, functools, pathlib
LAST_ANALYSIS: 2025-12-23
"""
from __future__ import annotations
import os
@@ -13,38 +11,47 @@ from functools import lru_cache
from pathlib import Path
class Settings:
# Qdrant connection
# --- Qdrant database ---
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
QDRANT_API_KEY: str | None = os.getenv("QDRANT_API_KEY")
COLLECTION_PREFIX: str = os.getenv("MINDNET_PREFIX", "mindnet")
VECTOR_SIZE: int = int(os.getenv("MINDNET_VECTOR_SIZE", "384"))
DISTANCE: str = os.getenv("MINDNET_DISTANCE", "Cosine")
# Embeddings (local)
# --- Local embeddings ---
MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# WP-20 Hybrid LLM Provider
# Allowed: "ollama" or "gemini"
# --- WP-20 Cloud Hybrid Mode (Google GenAI & OpenRouter) ---
# Allowed: "ollama" | "gemini" | "openrouter"
MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()
# Google AI Studio (direct)
GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it") # for ingestion speed
# OpenRouter Integration
OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
LLM_FALLBACK_ENABLED: bool = os.getenv("MINDNET_LLM_FALLBACK", "true").lower() == "true"
# WP-05 LLM / Ollama (Local)
# --- WP-05 Local LLM (Ollama) ---
OLLAMA_URL: str = os.getenv("MINDNET_OLLAMA_URL", "http://127.0.0.1:11434")
LLM_MODEL: str = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")
PROMPTS_PATH: str = os.getenv("MINDNET_PROMPTS_PATH", "config/prompts.yaml")
# WP-06 / WP-14 Performance & Timeouts
# --- WP-06 / WP-14 Performance & load control ---
LLM_TIMEOUT: float = float(os.getenv("MINDNET_LLM_TIMEOUT", "120.0"))
DECISION_CONFIG_PATH: str = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
BACKGROUND_LIMIT: int = int(os.getenv("MINDNET_LLM_BACKGROUND_LIMIT", "2"))
# API & Debugging
# --- System paths ---
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
# WP-04 retriever weights (semantic vs. graph)
# --- WP-04 Retriever weights (semantic vs. graph) ---
RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
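
Since the Settings attributes read the environment at class-definition time, switching providers comes down to exporting the right variables before app.config is first imported. A minimal smoke-test sketch (the key value is a placeholder; get_settings is the accessor that llm_service.py imports below):

import os

# Must run before the first import of app.config, because the Settings
# class evaluates os.getenv() when the class body is executed.
os.environ["MINDNET_LLM_PROVIDER"] = "openrouter"
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."  # placeholder key
os.environ["MINDNET_LLM_FALLBACK"] = "true"

from app.config import get_settings

settings = get_settings()
assert settings.MINDNET_LLM_PROVIDER == "openrouter"
assert settings.LLM_FALLBACK_ENABLED is True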

app/services/llm_service.py View File

@@ -1,20 +1,17 @@
"""
FILE: app/services/llm_service.py
DESCRIPTION: Hybrid client for Ollama & Google Gemini.
Manages prompts, background load (semaphore), and cloud routing.
VERSION: 3.1.0 (WP-20 Full Integration: Provider-Aware Prompting)
STATUS: Active
DEPENDENCIES: httpx, yaml, asyncio, google-generativeai, app.config
EXTERNAL_CONFIG: config/prompts.yaml
DESCRIPTION: Hybrid client for Ollama, Google GenAI, and OpenRouter.
Manages provider-specific prompts and background load.
VERSION: 3.3.0 (Full SDK Integration)
"""
import httpx
import yaml
import logging
import os
import asyncio
import json
import google.generativeai as genai
from google import genai
from google.genai import types
from openai import AsyncOpenAI # for OpenRouter
from pathlib import Path
from typing import Optional, Dict, Any, Literal
from app.config import get_settings
@@ -22,122 +19,117 @@ from app.config import get_settings
logger = logging.getLogger(__name__)
class LLMService:
# GLOBAL SEMAPHORE for background load control (WP-06 / WP-20)
_background_semaphore = None
def __init__(self):
self.settings = get_settings()
self.prompts = self._load_prompts()
# Initialize the semaphore once at class level
# WP-06: semaphore initialization
if LLMService._background_semaphore is None:
limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
limit = self.settings.BACKGROUND_LIMIT
logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
LLMService._background_semaphore = asyncio.Semaphore(limit)
# Ollama Setup
self.timeout = httpx.Timeout(self.settings.LLM_TIMEOUT, connect=10.0)
# 1. Local Ollama client
self.ollama_client = httpx.AsyncClient(
base_url=self.settings.OLLAMA_URL,
timeout=self.timeout
timeout=httpx.Timeout(self.settings.LLM_TIMEOUT)
)
# Gemini Setup [WP-20]
if hasattr(self.settings, "GOOGLE_API_KEY") and self.settings.GOOGLE_API_KEY:
genai.configure(api_key=self.settings.GOOGLE_API_KEY)
model_name = getattr(self.settings, "GEMINI_MODEL", "gemini-1.5-flash")
self.gemini_model = genai.GenerativeModel(model_name)
logger.info(f"✨ LLMService: Gemini Cloud Mode active ({model_name})")
else:
self.gemini_model = None
logger.warning("⚠️ LLMService: No GOOGLE_API_KEY found. Gemini mode disabled.")
# 2. Google GenAI Client (Modern SDK)
self.google_client = None
if self.settings.GOOGLE_API_KEY:
self.google_client = genai.Client(api_key=self.settings.GOOGLE_API_KEY)
logger.info("✨ LLMService: Google GenAI (Gemini) active.")
# 3. OpenRouter Client
self.openrouter_client = None
if self.settings.OPENROUTER_API_KEY:
self.openrouter_client = AsyncOpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=self.settings.OPENROUTER_API_KEY
)
logger.info("🛰️ LLMService: OpenRouter Integration active.")
def _load_prompts(self) -> dict:
"""Lädt die Prompt-Konfiguration aus der YAML-Datei."""
path = Path(self.settings.PROMPTS_PATH)
if not path.exists(): return {}
try:
with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f)
with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {}
except Exception as e:
logger.error(f"Failed to load prompts: {e}")
return {}
def get_prompt(self, key: str, provider: str = None) -> str:
"""
Selects the template based on the provider (WP-20).
Supports both flat strings and dictionary-based provider branches.
"""
active_provider = provider or getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama")
"""Hole provider-spezifisches Template mit Fallback-Kaskade."""
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
data = self.prompts.get(key, "")
if isinstance(data, dict):
# Try the provider key, falling back to 'ollama'
return data.get(active_provider, data.get("ollama", ""))
return str(data)
async def generate_raw_response(
self,
prompt: str,
system: str = None,
force_json: bool = False,
max_retries: int = 2,
base_delay: float = 2.0,
self, prompt: str, system: str = None, force_json: bool = False,
max_retries: int = 2, base_delay: float = 2.0,
priority: Literal["realtime", "background"] = "realtime",
provider: Optional[str] = None
provider: Optional[str] = None,
model_override: Optional[str] = None
) -> str:
"""
Executes an LLM call with priority handling and provider selection.
"""
# Determine the provider: parameter override > config default
target_provider = provider or getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama")
"""Einstiegspunkt mit Priority-Handling."""
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
use_semaphore = (priority == "background")
if use_semaphore and LLMService._background_semaphore:
if priority == "background":
async with LLMService._background_semaphore:
return await self._dispatch_request(target_provider, prompt, system, force_json, max_retries, base_delay)
else:
return await self._dispatch_request(target_provider, prompt, system, force_json, max_retries, base_delay)
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
async def _dispatch_request(self, provider, prompt, system, force_json, max_retries, base_delay):
"""Routet die Anfrage an den gewählten Provider mit Fallback-Logik."""
async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override):
try:
if provider == "gemini" and self.gemini_model:
return await self._execute_gemini(prompt, system, force_json)
else:
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
if provider == "openrouter" and self.openrouter_client:
return await self._execute_openrouter(prompt, system, force_json, model_override)
if provider == "gemini" and self.google_client:
return await self._execute_google(prompt, system, force_json, model_override)
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
except Exception as e:
# Automatic fallback to Ollama on cloud errors (WP-20)
if provider == "gemini" and getattr(self.settings, "LLM_FALLBACK_ENABLED", True):
logger.warning(f"🔄 Gemini failed: {e}. Falling back to Ollama.")
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to Ollama.")
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
raise e
async def _execute_gemini(self, prompt, system, force_json) -> str:
"""Asynchroner Google Gemini Call (WP-20)."""
full_prompt = f"System: {system}\n\nUser: {prompt}" if system else prompt
# Gemini JSON Mode Support
gen_config = {}
if force_json:
gen_config["response_mime_type"] = "application/json"
response = await self.gemini_model.generate_content_async(
full_prompt,
generation_config=gen_config
async def _execute_google(self, prompt, system, force_json, model_override):
"""Native Google SDK Integration."""
model = model_override or self.settings.GEMINI_MODEL
config = types.GenerateContentConfig(
system_instruction=system,
response_mime_type="application/json" if force_json else "text/plain"
)
# Offload the synchronous SDK call to a thread
response = await asyncio.to_thread(
self.google_client.models.generate_content,
model=model, contents=prompt, config=config
)
return response.text.strip()
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay) -> str:
"""Ollama Call mit exponentieller Backoff-Retry-Logik."""
payload: Dict[str, Any] = {
"model": self.settings.LLM_MODEL,
"prompt": prompt,
"stream": False,
"options": {
"temperature": 0.1 if force_json else 0.7,
"num_ctx": 8192
}
async def _execute_openrouter(self, prompt, system, force_json, model_override):
"""OpenRouter (OpenAI-kompatibel)."""
model = model_override or self.settings.OPENROUTER_MODEL
messages = []
if system: messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
response = await self.openrouter_client.chat.completions.create(
model=model,
messages=messages,
response_format={"type": "json_object"} if force_json else None
)
return response.choices[0].message.content.strip()
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
"""Ollama mit exponentiellem Backoff."""
payload = {
"model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
"options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
}
if force_json: payload["format"] = "json"
if system: payload["system"] = system
@@ -145,41 +137,23 @@ class LLMService:
attempt = 0
while True:
try:
response = await self.ollama_client.post("/api/generate", json=payload)
if response.status_code == 200:
return response.json().get("response", "").strip()
response.raise_for_status()
res = await self.ollama_client.post("/api/generate", json=payload)
res.raise_for_status()
return res.json().get("response", "").strip()
except Exception as e:
attempt += 1
if attempt > max_retries:
logger.error(f"Ollama Error after {attempt} retries: {e}")
raise e
# Exponential backoff: base_delay * (2 ^ (attempt - 1))
wait_time = base_delay * (2 ** (attempt - 1))
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
await asyncio.sleep(wait_time)
if attempt > max_retries: raise e
wait = base_delay * (2 ** (attempt - 1))
logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
await asyncio.sleep(wait)
async def generate_rag_response(self, query: str, context_str: str) -> str:
"""Standard RAG Chat-Interface mit Provider-spezifischen Templates."""
provider = getattr(self.settings, "MINDNET_LLM_PROVIDER", "ollama")
# Fetch the templates via the new get_prompt method
system_prompt = self.get_prompt("system_prompt", provider)
rag_template = self.get_prompt("rag_template", provider)
# Fallback for the RAG template structure
if not rag_template:
rag_template = "{context_str}\n\n{query}"
final_prompt = rag_template.format(context_str=context_str, query=query)
return await self.generate_raw_response(
final_prompt,
system=system_prompt,
priority="realtime"
)
"""Vollständiger RAG-Wrapper."""
provider = self.settings.MINDNET_LLM_PROVIDER
system = self.get_prompt("system_prompt", provider)
template = self.get_prompt("rag_template", provider)
final_prompt = template.format(context_str=context_str, query=query)
return await self.generate_raw_response(final_prompt, system=system, priority="realtime")
async def close(self):
"""Schließt alle offenen HTTP-Verbindungen."""
if self.ollama_client:
await self.ollama_client.aclose()
await self.ollama_client.aclose()
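
Taken together, the new call surface is: provider and model_override are chosen per call, force_json toggles the provider-appropriate JSON mode, and priority="background" gates the call behind the class-level semaphore (at most BACKGROUND_LIMIT concurrent calls). A hedged usage sketch (prompts are illustrative; routing ingestion to GEMMA_MODEL is an assumption based on the config comment):

import asyncio
from app.services.llm_service import LLMService

async def main():
    llm = LLMService()
    try:
        # Realtime call pinned to OpenRouter; falls back to Ollama on
        # failure as long as MINDNET_LLM_FALLBACK is enabled.
        answer = await llm.generate_raw_response(
            "Summarize WP-20 in one sentence.",
            system="You are a concise assistant.",
            provider="openrouter",
        )
        # Background call: gated by the semaphore, JSON-forced, and
        # routed to the cheaper Gemma model via model_override.
        meta = await llm.generate_raw_response(
            "Extract entities as JSON: ...",
            force_json=True,
            priority="background",
            provider="gemini",
            model_override=llm.settings.GEMMA_MODEL,
        )
        print(answer, meta)
    finally:
        await llm.close()

asyncio.run(main())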

View File

@@ -37,4 +37,7 @@ streamlit-agraph>=0.0.45
st-cytoscape
# Google Gemini API
google-generativeai>=0.8.3
google-generativeai>=0.8.3
# OpenAI for OpenRouter
openai>=1.50.0
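
Because OpenRouter speaks the OpenAI wire protocol, the new openai dependency can be smoke-tested independently of the service layer. A minimal sketch (key and model are placeholders matching the config defaults):

import asyncio
from openai import AsyncOpenAI

async def ping_openrouter():
    client = AsyncOpenAI(
        base_url="https://openrouter.ai/api/v1",  # same endpoint as in llm_service.py
        api_key="sk-or-...",                      # placeholder; use OPENROUTER_API_KEY
    )
    resp = await client.chat.completions.create(
        model="google/gemma-2-9b-it:free",
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.choices[0].message.content)

asyncio.run(ping_openrouter())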