WP25 #19
|
|
@ -3,16 +3,14 @@ FILE: app/services/llm_service.py
|
||||||
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
||||||
Verwaltet provider-spezifische Prompts und Background-Last.
|
Verwaltet provider-spezifische Prompts und Background-Last.
|
||||||
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
|
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
|
||||||
WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe.
|
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter).
|
||||||
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs).
|
|
||||||
FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts.
|
|
||||||
WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG.
|
WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG.
|
||||||
VERSION: 3.4.1
|
VERSION: 3.4.2 (WP-25: Ingest-Stability Patch)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
FIX:
|
FIX:
|
||||||
- 100% Wiederherstellung der v3.3.9 Logik (Rate-Limits, Retries, Async-Threads).
|
- Ingest-Stability: Entfernung des <5-Zeichen Guards (ermöglicht YES/NO Validierungen).
|
||||||
- Integration des WP-25 DecisionEngine Bridges in generate_rag_response.
|
- OpenRouter-Fix: Sicherung gegen leere 'choices' zur Vermeidung von JSON-Errors.
|
||||||
- WP-25 Empty-Response-Guard für Cloud-Provider.
|
- Erhalt der vollständigen v3.3.9 Logik für Rate-Limits, Retries und Background-Tasks.
|
||||||
"""
|
"""
|
||||||
import httpx
|
import httpx
|
||||||
import yaml
|
import yaml
|
||||||
|
|
@ -99,17 +97,13 @@ class LLMService:
|
||||||
def get_prompt(self, key: str, provider: str = None) -> str:
|
def get_prompt(self, key: str, provider: str = None) -> str:
|
||||||
"""
|
"""
|
||||||
Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
|
Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
|
||||||
HINWEIS: Dies ist nur ein Text-Lookup und verbraucht kein API-Kontingent.
|
Kaskade: Gewählter Provider -> Gemini -> Ollama.
|
||||||
Kaskade: Gewählter Provider -> Gemini (Cloud-Stil) -> Ollama (Basis-Stil).
|
|
||||||
"""
|
"""
|
||||||
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||||
data = self.prompts.get(key, "")
|
data = self.prompts.get(key, "")
|
||||||
|
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
# Wir versuchen erst den Provider, dann Gemini, dann Ollama
|
|
||||||
val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
|
val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
|
||||||
|
|
||||||
# Falls val durch YAML-Fehler immer noch ein Dict ist, extrahiere ersten String
|
|
||||||
if isinstance(val, dict):
|
if isinstance(val, dict):
|
||||||
logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.")
|
logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.")
|
||||||
val = next(iter(val.values()), "") if val else ""
|
val = next(iter(val.values()), "") if val else ""
|
||||||
|
|
@ -132,8 +126,8 @@ class LLMService:
|
||||||
strict_json_schema: bool = True
|
strict_json_schema: bool = True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung.
|
Haupteinstiegspunkt für LLM-Anfragen.
|
||||||
Wendet die Bereinigung auf Text-Antworten an.
|
WP-25 FIX: Schwellenwert entfernt, um kurze Ingest-Validierungen (YES/NO) zu unterstützen.
|
||||||
"""
|
"""
|
||||||
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||||
|
|
||||||
|
|
@ -151,8 +145,8 @@ class LLMService:
|
||||||
json_schema, json_schema_name, strict_json_schema
|
json_schema, json_schema_name, strict_json_schema
|
||||||
)
|
)
|
||||||
|
|
||||||
# WP-25 Empty Response Fix: Wenn Cloud-Provider leer antworten, Fallback auf Ollama
|
# WP-25 FIX: Nur noch auf absolut leere Antwort prüfen (ermöglicht YES/NO Antworten).
|
||||||
if (not res or len(res.strip()) < 5) and target_provider != "ollama":
|
if not res and target_provider != "ollama":
|
||||||
logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.")
|
logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.")
|
||||||
res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||||
|
|
||||||
|
|
@ -172,12 +166,8 @@ class LLMService:
|
||||||
json_schema_name: str,
|
json_schema_name: str,
|
||||||
strict_json_schema: bool
|
strict_json_schema: bool
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""Routet die Anfrage mit intelligenter Rate-Limit Erkennung."""
|
||||||
Routet die Anfrage mit intelligenter Rate-Limit Erkennung.
|
|
||||||
Nutzt max_retries um die Rate-Limit Schleife zu begrenzen.
|
|
||||||
"""
|
|
||||||
rate_limit_attempts = 0
|
rate_limit_attempts = 0
|
||||||
# FIX: Wir nutzen max_retries als Limit für Rate-Limit Versuche, wenn explizit klein gewählt (z.B. Chat)
|
|
||||||
max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
|
max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
|
||||||
wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)
|
wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)
|
||||||
|
|
||||||
|
|
@ -197,33 +187,24 @@ class LLMService:
|
||||||
if provider == "gemini" and self.google_client:
|
if provider == "gemini" and self.google_client:
|
||||||
return await self._execute_google(prompt, system, force_json, model_override)
|
return await self._execute_google(prompt, system, force_json, model_override)
|
||||||
|
|
||||||
# Default/Fallback zu Ollama
|
|
||||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
err_str = str(e)
|
err_str = str(e)
|
||||||
# Intelligente 429 Erkennung
|
|
||||||
is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"])
|
is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"])
|
||||||
|
|
||||||
if is_rate_limit and rate_limit_attempts < max_rate_retries:
|
if is_rate_limit and rate_limit_attempts < max_rate_retries:
|
||||||
rate_limit_attempts += 1
|
rate_limit_attempts += 1
|
||||||
logger.warning(
|
logger.warning(f"⏳ Rate Limit from {provider}. Attempt {rate_limit_attempts}. Waiting {wait_time}s...")
|
||||||
f"⏳ [LLMService] Rate Limit detected from {provider}. "
|
|
||||||
f"Attempt {rate_limit_attempts}/{max_rate_retries}. Waiting {wait_time}s..."
|
|
||||||
)
|
|
||||||
await asyncio.sleep(wait_time)
|
await asyncio.sleep(wait_time)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Wenn kein Rate-Limit oder Retries erschöpft -> Fallback zu Ollama (falls aktiviert)
|
|
||||||
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
|
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
|
||||||
logger.warning(
|
logger.warning(f"🔄 Provider {provider} failed ({err_str}). Falling back to OLLAMA.")
|
||||||
f"🔄 Provider {provider} failed ({err_str}). Falling back to LOCAL OLLAMA."
|
|
||||||
)
|
|
||||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
async def _execute_google(self, prompt, system, force_json, model_override):
|
async def _execute_google(self, prompt, system, force_json, model_override):
|
||||||
"""Native Google SDK Integration (Gemini) mit v1 Fix."""
|
|
||||||
model = model_override or self.settings.GEMINI_MODEL
|
model = model_override or self.settings.GEMINI_MODEL
|
||||||
clean_model = model.replace("models/", "")
|
clean_model = model.replace("models/", "")
|
||||||
|
|
||||||
|
|
@ -250,7 +231,7 @@ class LLMService:
|
||||||
json_schema_name: str = "mindnet_json",
|
json_schema_name: str = "mindnet_json",
|
||||||
strict_json_schema: bool = True
|
strict_json_schema: bool = True
|
||||||
) -> str:
|
) -> str:
|
||||||
"""OpenRouter API Integration (OpenAI-kompatibel)."""
|
"""OpenRouter API Integration. WP-25 FIX: Sicherung gegen leere 'choices'."""
|
||||||
model = model_override or self.settings.OPENROUTER_MODEL
|
model = model_override or self.settings.OPENROUTER_MODEL
|
||||||
messages = []
|
messages = []
|
||||||
if system:
|
if system:
|
||||||
|
|
@ -263,9 +244,7 @@ class LLMService:
|
||||||
kwargs["response_format"] = {
|
kwargs["response_format"] = {
|
||||||
"type": "json_schema",
|
"type": "json_schema",
|
||||||
"json_schema": {
|
"json_schema": {
|
||||||
"name": json_schema_name,
|
"name": json_schema_name, "strict": strict_json_schema, "schema": json_schema
|
||||||
"strict": strict_json_schema,
|
|
||||||
"schema": json_schema
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
|
|
@ -276,23 +255,23 @@ class LLMService:
|
||||||
messages=messages,
|
messages=messages,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
return response.choices[0].message.content.strip()
|
|
||||||
|
# WP-25 FIX: Sicherung gegen leere Antwort-Arrays
|
||||||
|
if not response.choices or len(response.choices) == 0:
|
||||||
|
logger.warning(f"🛰️ OpenRouter returned no choices for model {model}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
return response.choices[0].message.content.strip() if response.choices[0].message.content else ""
|
||||||
|
|
||||||
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
|
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
|
||||||
"""Lokaler Ollama Call mit striktem Retry-Limit."""
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": self.settings.LLM_MODEL,
|
"model": self.settings.LLM_MODEL,
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
|
||||||
"temperature": 0.1 if force_json else 0.7,
|
|
||||||
"num_ctx": 8192 # Begrenzung für Stabilität (WP-20)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if force_json:
|
if force_json: payload["format"] = "json"
|
||||||
payload["format"] = "json"
|
if system: payload["system"] = system
|
||||||
if system:
|
|
||||||
payload["system"] = system
|
|
||||||
|
|
||||||
attempt = 0
|
attempt = 0
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -302,27 +281,17 @@ class LLMService:
|
||||||
return res.json().get("response", "").strip()
|
return res.json().get("response", "").strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
attempt += 1
|
attempt += 1
|
||||||
# WICHTIG: Wenn max_retries=0 (Chat), bricht dies nach dem 1. Versuch (attempt=1) sofort ab.
|
|
||||||
if attempt > max_retries:
|
if attempt > max_retries:
|
||||||
logger.error(f"❌ Ollama request failed after {attempt} attempt(s): {e}")
|
logger.error(f"❌ Ollama request failed: {e}")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
wait_time = base_delay * (2 ** (attempt - 1))
|
wait_time = base_delay * (2 ** (attempt - 1))
|
||||||
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
|
|
||||||
await asyncio.sleep(wait_time)
|
await asyncio.sleep(wait_time)
|
||||||
|
|
||||||
async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
|
async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
|
||||||
"""
|
"""WP-25: Orchestrierung via DecisionEngine."""
|
||||||
WP-25 UPDATE: Der primäre Einstiegspunkt für den MindNet Chat.
|
logger.info(f"🚀 [WP-25] Chat Query: {query[:50]}...")
|
||||||
Delegiert nun an die DecisionEngine für Agentic Multi-Stream RAG.
|
|
||||||
Falls context_str bereits vorhanden ist (Legacy), wird dieser ignoriert zugunsten
|
|
||||||
der präzisen Multi-Stream Orchestrierung.
|
|
||||||
"""
|
|
||||||
logger.info(f"🚀 [WP-25] Chat Query intercepted: {query[:50]}...")
|
|
||||||
# Die DecisionEngine übernimmt nun das gesamte Management (Routing, Retrieval, Synthesis)
|
|
||||||
return await self.decision_engine.ask(query)
|
return await self.decision_engine.ask(query)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""Schließt die HTTP-Verbindungen."""
|
|
||||||
if self.ollama_client:
|
if self.ollama_client:
|
||||||
await self.ollama_client.aclose()
|
await self.ollama_client.aclose()
|
||||||
Loading…
Reference in New Issue
Block a user