Update LLMService for WP-25: Enhance stability with improved response handling, including safeguards against empty responses and adjustments for short input validations. Maintain compatibility with previous logic for rate limits and retries. Version bump to 3.4.2.
This commit is contained in:
parent
d49d509451
commit
bb6959a090
|
|
@ -3,16 +3,14 @@ FILE: app/services/llm_service.py
|
|||
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
||||
Verwaltet provider-spezifische Prompts und Background-Last.
|
||||
WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
|
||||
WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe.
|
||||
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs).
|
||||
FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts.
|
||||
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter).
|
||||
WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG.
|
||||
VERSION: 3.4.1
|
||||
VERSION: 3.4.2 (WP-25: Ingest-Stability Patch)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- 100% Wiederherstellung der v3.3.9 Logik (Rate-Limits, Retries, Async-Threads).
|
||||
- Integration des WP-25 DecisionEngine Bridges in generate_rag_response.
|
||||
- WP-25 Empty-Response-Guard für Cloud-Provider.
|
||||
- Ingest-Stability: Entfernung des <5-Zeichen Guards (ermöglicht YES/NO Validierungen).
|
||||
- OpenRouter-Fix: Sicherung gegen leere 'choices' zur Vermeidung von JSON-Errors.
|
||||
- Erhalt der vollständigen v3.3.9 Logik für Rate-Limits, Retries und Background-Tasks.
|
||||
"""
|
||||
import httpx
|
||||
import yaml
|
||||
|
|
@ -99,17 +97,13 @@ class LLMService:
|
|||
def get_prompt(self, key: str, provider: str = None) -> str:
|
||||
"""
|
||||
Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
|
||||
HINWEIS: Dies ist nur ein Text-Lookup und verbraucht kein API-Kontingent.
|
||||
Kaskade: Gewählter Provider -> Gemini (Cloud-Stil) -> Ollama (Basis-Stil).
|
||||
Kaskade: Gewählter Provider -> Gemini -> Ollama.
|
||||
"""
|
||||
active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||
data = self.prompts.get(key, "")
|
||||
|
||||
if isinstance(data, dict):
|
||||
# Wir versuchen erst den Provider, dann Gemini, dann Ollama
|
||||
val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
|
||||
|
||||
# Falls val durch YAML-Fehler immer noch ein Dict ist, extrahiere ersten String
|
||||
if isinstance(val, dict):
|
||||
logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.")
|
||||
val = next(iter(val.values()), "") if val else ""
|
||||
|
|
@ -132,8 +126,8 @@ class LLMService:
|
|||
strict_json_schema: bool = True
|
||||
) -> str:
|
||||
"""
|
||||
Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung.
|
||||
Wendet die Bereinigung auf Text-Antworten an.
|
||||
Haupteinstiegspunkt für LLM-Anfragen.
|
||||
WP-25 FIX: Schwellenwert entfernt, um kurze Ingest-Validierungen (YES/NO) zu unterstützen.
|
||||
"""
|
||||
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||
|
||||
|
|
@ -151,8 +145,8 @@ class LLMService:
|
|||
json_schema, json_schema_name, strict_json_schema
|
||||
)
|
||||
|
||||
# WP-25 Empty Response Fix: Wenn Cloud-Provider leer antworten, Fallback auf Ollama
|
||||
if (not res or len(res.strip()) < 5) and target_provider != "ollama":
|
||||
# WP-25 FIX: Nur noch auf absolut leere Antwort prüfen (ermöglicht YES/NO Antworten).
|
||||
if not res and target_provider != "ollama":
|
||||
logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.")
|
||||
res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||
|
||||
|
|
@ -172,12 +166,8 @@ class LLMService:
|
|||
json_schema_name: str,
|
||||
strict_json_schema: bool
|
||||
) -> str:
|
||||
"""
|
||||
Routet die Anfrage mit intelligenter Rate-Limit Erkennung.
|
||||
Nutzt max_retries um die Rate-Limit Schleife zu begrenzen.
|
||||
"""
|
||||
"""Routet die Anfrage mit intelligenter Rate-Limit Erkennung."""
|
||||
rate_limit_attempts = 0
|
||||
# FIX: Wir nutzen max_retries als Limit für Rate-Limit Versuche, wenn explizit klein gewählt (z.B. Chat)
|
||||
max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
|
||||
wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)
|
||||
|
||||
|
|
@ -197,33 +187,24 @@ class LLMService:
|
|||
if provider == "gemini" and self.google_client:
|
||||
return await self._execute_google(prompt, system, force_json, model_override)
|
||||
|
||||
# Default/Fallback zu Ollama
|
||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||
|
||||
except Exception as e:
|
||||
err_str = str(e)
|
||||
# Intelligente 429 Erkennung
|
||||
is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"])
|
||||
|
||||
if is_rate_limit and rate_limit_attempts < max_rate_retries:
|
||||
rate_limit_attempts += 1
|
||||
logger.warning(
|
||||
f"⏳ [LLMService] Rate Limit detected from {provider}. "
|
||||
f"Attempt {rate_limit_attempts}/{max_rate_retries}. Waiting {wait_time}s..."
|
||||
)
|
||||
logger.warning(f"⏳ Rate Limit from {provider}. Attempt {rate_limit_attempts}. Waiting {wait_time}s...")
|
||||
await asyncio.sleep(wait_time)
|
||||
continue
|
||||
|
||||
# Wenn kein Rate-Limit oder Retries erschöpft -> Fallback zu Ollama (falls aktiviert)
|
||||
if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
|
||||
logger.warning(
|
||||
f"🔄 Provider {provider} failed ({err_str}). Falling back to LOCAL OLLAMA."
|
||||
)
|
||||
logger.warning(f"🔄 Provider {provider} failed ({err_str}). Falling back to OLLAMA.")
|
||||
return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
|
||||
raise e
|
||||
|
||||
async def _execute_google(self, prompt, system, force_json, model_override):
|
||||
"""Native Google SDK Integration (Gemini) mit v1 Fix."""
|
||||
model = model_override or self.settings.GEMINI_MODEL
|
||||
clean_model = model.replace("models/", "")
|
||||
|
||||
|
|
@ -250,7 +231,7 @@ class LLMService:
|
|||
json_schema_name: str = "mindnet_json",
|
||||
strict_json_schema: bool = True
|
||||
) -> str:
|
||||
"""OpenRouter API Integration (OpenAI-kompatibel)."""
|
||||
"""OpenRouter API Integration. WP-25 FIX: Sicherung gegen leere 'choices'."""
|
||||
model = model_override or self.settings.OPENROUTER_MODEL
|
||||
messages = []
|
||||
if system:
|
||||
|
|
@ -263,9 +244,7 @@ class LLMService:
|
|||
kwargs["response_format"] = {
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": json_schema_name,
|
||||
"strict": strict_json_schema,
|
||||
"schema": json_schema
|
||||
"name": json_schema_name, "strict": strict_json_schema, "schema": json_schema
|
||||
}
|
||||
}
|
||||
else:
|
||||
|
|
@ -276,23 +255,23 @@ class LLMService:
|
|||
messages=messages,
|
||||
**kwargs
|
||||
)
|
||||
return response.choices[0].message.content.strip()
|
||||
|
||||
# WP-25 FIX: Sicherung gegen leere Antwort-Arrays
|
||||
if not response.choices or len(response.choices) == 0:
|
||||
logger.warning(f"🛰️ OpenRouter returned no choices for model {model}")
|
||||
return ""
|
||||
|
||||
return response.choices[0].message.content.strip() if response.choices[0].message.content else ""
|
||||
|
||||
async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
|
||||
"""Lokaler Ollama Call mit striktem Retry-Limit."""
|
||||
payload = {
|
||||
"model": self.settings.LLM_MODEL,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.1 if force_json else 0.7,
|
||||
"num_ctx": 8192 # Begrenzung für Stabilität (WP-20)
|
||||
"options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
|
||||
}
|
||||
}
|
||||
if force_json:
|
||||
payload["format"] = "json"
|
||||
if system:
|
||||
payload["system"] = system
|
||||
if force_json: payload["format"] = "json"
|
||||
if system: payload["system"] = system
|
||||
|
||||
attempt = 0
|
||||
while True:
|
||||
|
|
@ -302,27 +281,17 @@ class LLMService:
|
|||
return res.json().get("response", "").strip()
|
||||
except Exception as e:
|
||||
attempt += 1
|
||||
# WICHTIG: Wenn max_retries=0 (Chat), bricht dies nach dem 1. Versuch (attempt=1) sofort ab.
|
||||
if attempt > max_retries:
|
||||
logger.error(f"❌ Ollama request failed after {attempt} attempt(s): {e}")
|
||||
logger.error(f"❌ Ollama request failed: {e}")
|
||||
raise e
|
||||
|
||||
wait_time = base_delay * (2 ** (attempt - 1))
|
||||
logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
    """Primary entry point for MindNet chat queries (WP-25).

    Delegates the full RAG pipeline (routing, retrieval, synthesis) to the
    DecisionEngine for agentic multi-stream orchestration.

    Args:
        query: The user's chat query; only the first 50 characters are logged.
        context_str: Legacy pre-built context. Accepted for backward
            compatibility but deliberately ignored — the DecisionEngine
            performs its own multi-stream retrieval instead.

    Returns:
        The synthesized answer string produced by the DecisionEngine.
    """
    logger.info(f"🚀 [WP-25] Chat Query: {query[:50]}...")
    # NOTE: context_str is intentionally unused here — retrieval is owned
    # entirely by the DecisionEngine (WP-25 agentic multi-stream RAG).
    return await self.decision_engine.ask(query)
|
||||
|
||||
async def close(self):
    """Release the service's HTTP resources.

    Closes the Ollama HTTP client if one was created; safe to call when
    no client exists.
    """
    client = self.ollama_client
    if not client:
        return
    await client.aclose()
|
||||
Loading…
Reference in New Issue
Block a user