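Update LLMService for WP-25: Improve the stability of response handling by guarding against empty OpenRouter 'choices' arrays and by removing the <5-character response guard, so that short validation answers (e.g. YES/NO) are accepted and only a completely empty response triggers the Ollama fallback. Maintain compatibility with the previous logic for rate limits and retries. Version bump to 3.4.2.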

This commit is contained in:
Lars 2026-01-01 08:31:15 +01:00
parent d49d509451
commit bb6959a090

View File

@ -3,16 +3,14 @@ FILE: app/services/llm_service.py
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter. DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
Verwaltet provider-spezifische Prompts und Background-Last. Verwaltet provider-spezifische Prompts und Background-Last.
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten. WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe. WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter).
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs).
FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts.
WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG. WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG.
VERSION: 3.4.1 VERSION: 3.4.2 (WP-25: Ingest-Stability Patch)
STATUS: Active STATUS: Active
FIX: FIX:
- 100% Wiederherstellung der v3.3.9 Logik (Rate-Limits, Retries, Async-Threads). - Ingest-Stability: Entfernung des <5-Zeichen Guards (ermöglicht YES/NO Validierungen).
- Integration des WP-25 DecisionEngine Bridges in generate_rag_response. - OpenRouter-Fix: Sicherung gegen leere 'choices' zur Vermeidung von JSON-Errors.
- WP-25 Empty-Response-Guard für Cloud-Provider. - Erhalt der vollständigen v3.3.9 Logik für Rate-Limits, Retries und Background-Tasks.
""" """
import httpx import httpx
import yaml import yaml
@ -99,17 +97,13 @@ class LLMService:
def get_prompt(self, key: str, provider: str = None) -> str: def get_prompt(self, key: str, provider: str = None) -> str:
""" """
Hole provider-spezifisches Template mit intelligenter Text-Kaskade. Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
HINWEIS: Dies ist nur ein Text-Lookup und verbraucht kein API-Kontingent. Kaskade: Gewählter Provider -> Gemini -> Ollama.
Kaskade: Gewählter Provider -> Gemini (Cloud-Stil) -> Ollama (Basis-Stil).
""" """
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
data = self.prompts.get(key, "") data = self.prompts.get(key, "")
if isinstance(data, dict): if isinstance(data, dict):
# Wir versuchen erst den Provider, dann Gemini, dann Ollama
val = data.get(active_provider, data.get("gemini", data.get("ollama", ""))) val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
# Falls val durch YAML-Fehler immer noch ein Dict ist, extrahiere ersten String
if isinstance(val, dict): if isinstance(val, dict):
logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.") logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.")
val = next(iter(val.values()), "") if val else "" val = next(iter(val.values()), "") if val else ""
@ -132,8 +126,8 @@ class LLMService:
strict_json_schema: bool = True strict_json_schema: bool = True
) -> str: ) -> str:
""" """
Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung. Haupteinstiegspunkt für LLM-Anfragen.
Wendet die Bereinigung auf Text-Antworten an. WP-25 FIX: Schwellenwert entfernt, um kurze Ingest-Validierungen (YES/NO) zu unterstützen.
""" """
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
@ -151,8 +145,8 @@ class LLMService:
json_schema, json_schema_name, strict_json_schema json_schema, json_schema_name, strict_json_schema
) )
# WP-25 Empty Response Fix: Wenn Cloud-Provider leer antworten, Fallback auf Ollama # WP-25 FIX: Nur noch auf absolut leere Antwort prüfen (ermöglicht YES/NO Antworten).
if (not res or len(res.strip()) < 5) and target_provider != "ollama": if not res and target_provider != "ollama":
logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.") logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.")
res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
@ -172,12 +166,8 @@ class LLMService:
json_schema_name: str, json_schema_name: str,
strict_json_schema: bool strict_json_schema: bool
) -> str: ) -> str:
""" """Routet die Anfrage mit intelligenter Rate-Limit Erkennung."""
Routet die Anfrage mit intelligenter Rate-Limit Erkennung.
Nutzt max_retries um die Rate-Limit Schleife zu begrenzen.
"""
rate_limit_attempts = 0 rate_limit_attempts = 0
# FIX: Wir nutzen max_retries als Limit für Rate-Limit Versuche, wenn explizit klein gewählt (z.B. Chat)
max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3)) max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0) wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)
@ -197,33 +187,24 @@ class LLMService:
if provider == "gemini" and self.google_client: if provider == "gemini" and self.google_client:
return await self._execute_google(prompt, system, force_json, model_override) return await self._execute_google(prompt, system, force_json, model_override)
# Default/Fallback zu Ollama
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
except Exception as e: except Exception as e:
err_str = str(e) err_str = str(e)
# Intelligente 429 Erkennung
is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"]) is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"])
if is_rate_limit and rate_limit_attempts < max_rate_retries: if is_rate_limit and rate_limit_attempts < max_rate_retries:
rate_limit_attempts += 1 rate_limit_attempts += 1
logger.warning( logger.warning(f"⏳ Rate Limit from {provider}. Attempt {rate_limit_attempts}. Waiting {wait_time}s...")
f"⏳ [LLMService] Rate Limit detected from {provider}. "
f"Attempt {rate_limit_attempts}/{max_rate_retries}. Waiting {wait_time}s..."
)
await asyncio.sleep(wait_time) await asyncio.sleep(wait_time)
continue continue
# Wenn kein Rate-Limit oder Retries erschöpft -> Fallback zu Ollama (falls aktiviert)
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama": if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
logger.warning( logger.warning(f"🔄 Provider {provider} failed ({err_str}). Falling back to OLLAMA.")
f"🔄 Provider {provider} failed ({err_str}). Falling back to LOCAL OLLAMA."
)
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay) return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
raise e raise e
async def _execute_google(self, prompt, system, force_json, model_override): async def _execute_google(self, prompt, system, force_json, model_override):
"""Native Google SDK Integration (Gemini) mit v1 Fix."""
model = model_override or self.settings.GEMINI_MODEL model = model_override or self.settings.GEMINI_MODEL
clean_model = model.replace("models/", "") clean_model = model.replace("models/", "")
@ -250,7 +231,7 @@ class LLMService:
json_schema_name: str = "mindnet_json", json_schema_name: str = "mindnet_json",
strict_json_schema: bool = True strict_json_schema: bool = True
) -> str: ) -> str:
"""OpenRouter API Integration (OpenAI-kompatibel).""" """OpenRouter API Integration. WP-25 FIX: Sicherung gegen leere 'choices'."""
model = model_override or self.settings.OPENROUTER_MODEL model = model_override or self.settings.OPENROUTER_MODEL
messages = [] messages = []
if system: if system:
@ -263,9 +244,7 @@ class LLMService:
kwargs["response_format"] = { kwargs["response_format"] = {
"type": "json_schema", "type": "json_schema",
"json_schema": { "json_schema": {
"name": json_schema_name, "name": json_schema_name, "strict": strict_json_schema, "schema": json_schema
"strict": strict_json_schema,
"schema": json_schema
} }
} }
else: else:
@ -276,23 +255,23 @@ class LLMService:
messages=messages, messages=messages,
**kwargs **kwargs
) )
return response.choices[0].message.content.strip()
# WP-25 FIX: Sicherung gegen leere Antwort-Arrays
if not response.choices or len(response.choices) == 0:
logger.warning(f"🛰️ OpenRouter returned no choices for model {model}")
return ""
return response.choices[0].message.content.strip() if response.choices[0].message.content else ""
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay): async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
"""Lokaler Ollama Call mit striktem Retry-Limit."""
payload = { payload = {
"model": self.settings.LLM_MODEL, "model": self.settings.LLM_MODEL,
"prompt": prompt, "prompt": prompt,
"stream": False, "stream": False,
"options": { "options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
"temperature": 0.1 if force_json else 0.7,
"num_ctx": 8192 # Begrenzung für Stabilität (WP-20)
}
} }
if force_json: if force_json: payload["format"] = "json"
payload["format"] = "json" if system: payload["system"] = system
if system:
payload["system"] = system
attempt = 0 attempt = 0
while True: while True:
@ -302,27 +281,17 @@ class LLMService:
return res.json().get("response", "").strip() return res.json().get("response", "").strip()
except Exception as e: except Exception as e:
attempt += 1 attempt += 1
# WICHTIG: Wenn max_retries=0 (Chat), bricht dies nach dem 1. Versuch (attempt=1) sofort ab.
if attempt > max_retries: if attempt > max_retries:
logger.error(f"❌ Ollama request failed after {attempt} attempt(s): {e}") logger.error(f"❌ Ollama request failed: {e}")
raise e raise e
wait_time = base_delay * (2 ** (attempt - 1)) wait_time = base_delay * (2 ** (attempt - 1))
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
await asyncio.sleep(wait_time) await asyncio.sleep(wait_time)
async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str: async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
""" """WP-25: Orchestrierung via DecisionEngine."""
WP-25 UPDATE: Der primäre Einstiegspunkt für den MindNet Chat. logger.info(f"🚀 [WP-25] Chat Query: {query[:50]}...")
Delegiert nun an die DecisionEngine für Agentic Multi-Stream RAG.
Falls context_str bereits vorhanden ist (Legacy), wird dieser ignoriert zugunsten
der präzisen Multi-Stream Orchestrierung.
"""
logger.info(f"🚀 [WP-25] Chat Query intercepted: {query[:50]}...")
# Die DecisionEngine übernimmt nun das gesamte Management (Routing, Retrieval, Synthesis)
return await self.decision_engine.ask(query) return await self.decision_engine.ask(query)
async def close(self): async def close(self):
"""Schließt die HTTP-Verbindungen."""
if self.ollama_client: if self.ollama_client:
await self.ollama_client.aclose() await self.ollama_client.aclose()