komplette openrouter integration
This commit is contained in:
parent
c60aba63a4
commit
36fb27edf0
|
|
@ -1,8 +1,8 @@
|
|||
"""
|
||||
FILE: app/config.py
|
||||
DESCRIPTION: Zentrale Pydantic-Konfiguration. Enthält alle Parameter für Qdrant,
|
||||
lokale Embeddings, Ollama, Google GenAI und OpenRouter.
|
||||
VERSION: 0.6.0 (WP-20 Full Hybrid Integration)
|
||||
DESCRIPTION: Zentrale Pydantic-Konfiguration. Enthält Parameter für Qdrant,
|
||||
Embeddings, Ollama, Google GenAI und OpenRouter.
|
||||
VERSION: 0.6.0 (WP-20 Hybrid & OpenRouter Integration)
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
|
@ -21,16 +21,16 @@ class Settings:
|
|||
# --- Lokale Embeddings ---
|
||||
MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
||||
|
||||
# --- WP-20 Cloud Hybrid Mode (Google GenAI & OpenRouter) ---
|
||||
# Erlaubt: "ollama" | "gemini" | "openrouter"
|
||||
# --- WP-20 Hybrid LLM Provider ---
|
||||
# Optionen: "ollama" | "gemini" | "openrouter"
|
||||
MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "ollama").lower()
|
||||
|
||||
# Google AI Studio (Direkt)
|
||||
# Google AI Studio
|
||||
GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
|
||||
GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-1.5-flash")
|
||||
GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it") # Für Ingestion-Speed
|
||||
GEMMA_MODEL: str = os.getenv("MINDNET_GEMMA_MODEL", "gemma2-9b-it")
|
||||
|
||||
# OpenRouter Integration
|
||||
# OpenRouter
|
||||
OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
|
||||
OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
|
||||
|
||||
|
|
@ -51,7 +51,7 @@ class Settings:
|
|||
MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault")
|
||||
MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
|
||||
|
||||
# --- WP-04 Retriever Gewichte (Semantik vs. Graph) ---
|
||||
# --- WP-04 Retriever Gewichte ---
|
||||
RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
|
||||
RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
|
||||
RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes
|
|||
WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation.
|
||||
WP-22: Kontextsensitive Kanten-Validierung mit Fundort-Reporting (Zeilennummern).
|
||||
WP-22: Multi-Hash Refresh für konsistente Change Detection.
|
||||
VERSION: 2.11.0 (WP-20 Full Integration: Hybrid Smart Edges)
|
||||
VERSION: 2.11.1 (WP-20 Quota Protection: OpenRouter Priority)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
|
||||
EXTERNAL_CONFIG: config/types.yaml, config/prompts.yaml
|
||||
|
|
@ -111,7 +111,7 @@ class IngestionService:
|
|||
self.dim = self.cfg.dim if hasattr(self.cfg, 'dim') else self.settings.VECTOR_SIZE
|
||||
self.registry = load_type_registry()
|
||||
self.embedder = EmbeddingsClient()
|
||||
self.llm = LLMService() # WP-20 Integration
|
||||
self.llm = LLMService()
|
||||
|
||||
# Change Detection Modus (full oder body)
|
||||
self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
|
||||
|
|
@ -135,32 +135,36 @@ class IngestionService:
|
|||
async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
|
||||
"""
|
||||
WP-20: Nutzt den Hybrid LLM Service für die semantische Kanten-Extraktion.
|
||||
Verwendet provider-spezifische Prompts aus der config.
|
||||
QUOTEN-SCHUTZ: Priorisiert OpenRouter (Gemma), um Gemini-Tageslimits zu schonen.
|
||||
"""
|
||||
# Wir priorisieren Gemma für Ingestion, falls verfügbar (OpenRouter/Cloud)
|
||||
model = getattr(self.settings, "GEMMA_MODEL", None)
|
||||
provider = self.settings.MINDNET_LLM_PROVIDER
|
||||
# Bestimme den Provider für die Ingestion (OpenRouter bevorzugt, falls Key vorhanden)
|
||||
provider = "openrouter" if getattr(self.settings, "OPENROUTER_API_KEY", None) else self.settings.MINDNET_LLM_PROVIDER
|
||||
|
||||
template = self.llm.get_prompt("edge_extraction")
|
||||
# Nutze Gemma-Modell für hohe Ingestion-Quoten (14.4K RPD) via OpenRouter oder Google
|
||||
model = getattr(self.settings, "GEMMA_MODEL", None)
|
||||
|
||||
# Hole Prompt aus der YAML (Kaskade: Provider -> gemini -> ollama)
|
||||
template = self.llm.get_prompt("edge_extraction", provider)
|
||||
prompt = template.format(text=text[:6000], note_id=note_id)
|
||||
|
||||
try:
|
||||
# Hintergrund-Task mit Semaphore
|
||||
# Hintergrund-Task mit Semaphore via LLMService
|
||||
response_json = await self.llm.generate_raw_response(
|
||||
prompt=prompt,
|
||||
priority="background",
|
||||
force_json=True,
|
||||
provider=provider,
|
||||
model_override=model
|
||||
)
|
||||
data = json.loads(response_json)
|
||||
|
||||
# Provenance für die EdgeRegistry
|
||||
# Provenance für die EdgeRegistry Dokumentation
|
||||
for item in data:
|
||||
item["provenance"] = "semantic_ai"
|
||||
item["line"] = f"ai-{provider}"
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.warning(f"Smart Edge Allocation skipped for {note_id}: {e}")
|
||||
logger.warning(f"Smart Edge Allocation failed for {note_id} on {provider}: {e}")
|
||||
return []
|
||||
|
||||
async def process_file(
|
||||
|
|
@ -214,7 +218,7 @@ class IngestionService:
|
|||
logger.error(f"Payload build failed: {e}")
|
||||
return {**result, "error": f"Payload build failed: {str(e)}"}
|
||||
|
||||
# 4. Change Detection
|
||||
# 4. Change Detection (Multi-Hash)
|
||||
old_payload = None
|
||||
if not force_replace:
|
||||
old_payload = self._fetch_note_payload(note_id)
|
||||
|
|
@ -255,7 +259,7 @@ class IngestionService:
|
|||
edges = []
|
||||
context = {"file": file_path, "note_id": note_id}
|
||||
|
||||
# A. Explizite User-Kanten
|
||||
# A. Explizite User-Kanten (Wiki-Links)
|
||||
explicit_edges = extract_edges_with_context(parsed)
|
||||
for e in explicit_edges:
|
||||
e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")})
|
||||
|
|
@ -267,7 +271,7 @@ class IngestionService:
|
|||
e["kind"] = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
|
||||
edges.append(e)
|
||||
|
||||
# C. System-Kanten
|
||||
# C. System-Kanten (Graph-Struktur)
|
||||
try:
|
||||
raw_system_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
|
||||
except TypeError:
|
||||
|
|
@ -282,7 +286,7 @@ class IngestionService:
|
|||
logger.error(f"Processing failed: {e}", exc_info=True)
|
||||
return {**result, "error": f"Processing failed: {str(e)}"}
|
||||
|
||||
# 6. Upsert
|
||||
# 6. Upsert in Qdrant
|
||||
try:
|
||||
if purge_before and has_old: self._purge_artifacts(note_id)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,10 @@
|
|||
"""
|
||||
FILE: app/services/llm_service.py
|
||||
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI und OpenRouter.
|
||||
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
||||
Verwaltet provider-spezifische Prompts und Background-Last.
|
||||
VERSION: 3.3.0 (Full SDK Integration)
|
||||
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
|
||||
VERSION: 3.3.1
|
||||
STATUS: Active
|
||||
"""
|
||||
import httpx
|
||||
import yaml
|
||||
|
|
@ -11,7 +13,7 @@ import asyncio
|
|||
import json
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
from openai import AsyncOpenAI # Für OpenRouter
|
||||
from openai import AsyncOpenAI # Für OpenRouter (OpenAI-kompatibel)
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, Literal
|
||||
from app.config import get_settings
|
||||
|
|
@ -19,16 +21,17 @@ from app.config import get_settings
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
class LLMService:
|
||||
# GLOBALER SEMAPHOR für Hintergrund-Last Steuerung (WP-06)
|
||||
_background_semaphore = None
|
||||
|
||||
def __init__(self):
|
||||
self.settings = get_settings()
|
||||
self.prompts = self._load_prompts()
|
||||
|
||||
# WP-06: Semaphore-Initialisierung
|
||||
# Initialisiere Semaphore einmalig auf Klassen-Ebene
|
||||
if LLMService._background_semaphore is None:
|
||||
limit = self.settings.BACKGROUND_LIMIT
|
||||
logger.info(f"🚦 LLMService: Background Semaphore initialized with limit: {limit}")
|
||||
limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
|
||||
logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
|
||||
LLMService._background_semaphore = asyncio.Semaphore(limit)
|
||||
|
||||
# 1. Lokaler Ollama Client
|
||||
|
|
@ -53,6 +56,7 @@ class LLMService:
|
|||
logger.info("🛰️ LLMService: OpenRouter Integration active.")
|
||||
|
||||
def _load_prompts(self) -> dict:
|
||||
"""Lädt die Prompt-Konfiguration aus der YAML-Datei."""
|
||||
path = Path(self.settings.PROMPTS_PATH)
|
||||
if not path.exists(): return {}
|
||||
try:
|
||||
|
|
@ -62,11 +66,16 @@ class LLMService:
|
|||
return {}
|
||||
|
||||
def get_prompt(self, key: str, provider: str = None) -> str:
|
||||
"""Hole provider-spezifisches Template mit Fallback-Kaskade."""
|
||||
"""
|
||||
Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
|
||||
HINWEIS: Dies ist nur ein Text-Lookup und verbraucht kein API-Kontingent.
|
||||
Kaskade: Gewählter Provider -> Gemini (Cloud-Stil) -> Ollama (Basis-Stil).
|
||||
"""
|
||||
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||
data = self.prompts.get(key, "")
|
||||
if isinstance(data, dict):
|
||||
return data.get(active_provider, data.get("ollama", ""))
|
||||
# Wir versuchen erst den Provider, dann Gemini (weil ähnlich leistungsfähig), dann Ollama
|
||||
return data.get(active_provider, data.get("gemini", data.get("ollama", "")))
|
||||
return str(data)
|
||||
|
||||
async def generate_raw_response(
|
||||
|
|
@ -76,35 +85,43 @@ class LLMService:
|
|||
provider: Optional[str] = None,
|
||||
model_override: Optional[str] = None
|
||||
) -> str:
|
||||
"""Einstiegspunkt mit Priority-Handling."""
|
||||
"""Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung."""
|
||||
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||
|
||||
if priority == "background":
|
||||
async with LLMService._background_semaphore:
|
||||
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
|
||||
|
||||
return await self._dispatch(target_provider, prompt, system, force_json, max_retries, base_delay, model_override)
|
||||
|
||||
async def _dispatch(self, provider, prompt, system, force_json, max_retries, base_delay, model_override):
|
||||
"""Routet die Anfrage an den physikalischen API-Provider."""
|
||||
try:
|
||||
if provider == "openrouter" and self.openrouter_client:
|
||||
return await self._execute_openrouter(prompt, system, force_json, model_override)
|
||||
|
||||
if provider == "gemini" and self.google_client:
|
||||
return await self._execute_google(prompt, system, force_json, model_override)
|
||||
|
||||
# Default/Fallback zu Ollama
|
||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||
|
||||
except Exception as e:
|
||||
# QUOTEN-SCHUTZ: Wenn Cloud (OpenRouter/Gemini) fehlschlägt,
|
||||
# gehen wir IMMER zu Ollama, niemals von OpenRouter zu Gemini.
|
||||
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
|
||||
logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to Ollama.")
|
||||
logger.warning(f"🔄 Provider {provider} failed: {e}. Falling back to LOCAL OLLAMA to protect cloud quotas.")
|
||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||
raise e
|
||||
|
||||
async def _execute_google(self, prompt, system, force_json, model_override):
|
||||
"""Native Google SDK Integration."""
|
||||
"""Native Google SDK Integration (Gemini)."""
|
||||
model = model_override or self.settings.GEMINI_MODEL
|
||||
config = types.GenerateContentConfig(
|
||||
system_instruction=system,
|
||||
response_mime_type="application/json" if force_json else "text/plain"
|
||||
)
|
||||
# Synchroner SDK-Call in Thread auslagern
|
||||
# SDK Call in Thread auslagern, da die Google API blocking sein kann
|
||||
response = await asyncio.to_thread(
|
||||
self.google_client.models.generate_content,
|
||||
model=model, contents=prompt, config=config
|
||||
|
|
@ -112,10 +129,11 @@ class LLMService:
|
|||
return response.text.strip()
|
||||
|
||||
async def _execute_openrouter(self, prompt, system, force_json, model_override):
|
||||
"""OpenRouter (OpenAI-kompatibel)."""
|
||||
model = model_override or self.settings.OPENROUTER_MODEL
|
||||
"""OpenRouter API Integration (OpenAI-kompatibel)."""
|
||||
model = model_override or getattr(self.settings, "OPENROUTER_MODEL", "google/gemma-2-9b-it:free")
|
||||
messages = []
|
||||
if system: messages.append({"role": "system", "content": system})
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
response = await self.openrouter_client.chat.completions.create(
|
||||
|
|
@ -126,10 +144,15 @@ class LLMService:
|
|||
return response.choices[0].message.content.strip()
|
||||
|
||||
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
|
||||
"""Ollama mit exponentiellem Backoff."""
|
||||
"""Lokaler Ollama Call mit exponentiellem Backoff."""
|
||||
payload = {
|
||||
"model": self.settings.LLM_MODEL, "prompt": prompt, "stream": False,
|
||||
"options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
|
||||
"model": self.settings.LLM_MODEL,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.1 if force_json else 0.7,
|
||||
"num_ctx": 8192
|
||||
}
|
||||
}
|
||||
if force_json: payload["format"] = "json"
|
||||
if system: payload["system"] = system
|
||||
|
|
@ -142,18 +165,28 @@ class LLMService:
|
|||
return res.json().get("response", "").strip()
|
||||
except Exception as e:
|
||||
attempt += 1
|
||||
if attempt > max_retries: raise e
|
||||
wait = base_delay * (2 ** (attempt - 1))
|
||||
logger.warning(f"⚠️ Ollama retry {attempt} in {wait}s...")
|
||||
await asyncio.sleep(wait)
|
||||
if attempt > max_retries:
|
||||
logger.error(f"Ollama Error after {attempt} retries: {e}")
|
||||
raise e
|
||||
wait_time = base_delay * (2 ** (attempt - 1))
|
||||
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
async def generate_rag_response(self, query: str, context_str: str) -> str:
|
||||
"""Vollständiger RAG-Wrapper."""
|
||||
"""Vollständiges RAG Chat-Interface."""
|
||||
provider = self.settings.MINDNET_LLM_PROVIDER
|
||||
system = self.get_prompt("system_prompt", provider)
|
||||
template = self.get_prompt("rag_template", provider)
|
||||
final_prompt = template.format(context_str=context_str, query=query)
|
||||
return await self.generate_raw_response(final_prompt, system=system, priority="realtime")
|
||||
system_prompt = self.get_prompt("system_prompt", provider)
|
||||
rag_template = self.get_prompt("rag_template", provider)
|
||||
|
||||
final_prompt = rag_template.format(context_str=context_str, query=query)
|
||||
|
||||
return await self.generate_raw_response(
|
||||
final_prompt,
|
||||
system=system_prompt,
|
||||
priority="realtime"
|
||||
)
|
||||
|
||||
async def close(self):
|
||||
await self.ollama_client.aclose()
|
||||
"""Schließt die HTTP-Verbindungen."""
|
||||
if self.ollama_client:
|
||||
await self.ollama_client.aclose()
|
||||
|
|
@ -32,6 +32,8 @@ rag_template:
|
|||
Analysiere diesen Kontext meines digitalen Zwillings:
|
||||
{context_str}
|
||||
Beantworte die Anfrage detailliert und prüfe auf Widersprüche: {query}
|
||||
openrouter: "Kontext-Analyse für Gemma/Llama: {context_str}\n\nAnfrage: {query}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 2. DECISION: Strategie & Abwägung (Intent: DECISION)
|
||||
|
|
@ -59,7 +61,7 @@ decision_template:
|
|||
gemini: |
|
||||
Agierte als Senior Strategy Consultant. Nutze den Kontext {context_str}, um die Frage {query}
|
||||
tiefgreifend gegen meine langfristigen Ziele abzuwägen.
|
||||
|
||||
openrouter: "Strategischer Check (OpenRouter): {query}\n\nReferenzdaten: {context_str}"
|
||||
# ---------------------------------------------------------
|
||||
# 3. EMPATHY: Der Spiegel / "Ich"-Modus (Intent: EMPATHY)
|
||||
# ---------------------------------------------------------
|
||||
|
|
@ -179,3 +181,7 @@ edge_allocation_template:
|
|||
Finde auch implizite Verbindungen.
|
||||
JSON: [{"to": "X", "kind": "Y", "reason": "Z"}].
|
||||
TEXT: {text}
|
||||
openrouter: |
|
||||
Analysiere den Text für den Graphen. Identifiziere semantische Verbindungen.
|
||||
Output JSON: [{"to": "X", "kind": "Y"}].
|
||||
Text: {text}
|
||||
Loading…
Reference in New Issue
Block a user