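Update LLMService for WP-25: improve ingest stability through more robust response handling. The fallback to Ollama now triggers only on a completely empty response (the former <5-character guard is removed so that short validation answers such as YES/NO are accepted), and the OpenRouter call is guarded against an empty 'choices' list. The existing logic for rate limits and retries is kept unchanged. Version bump to 3.4.2.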

2026-01-01 08:31:15 +01:00 · 2026-01-01 08:31:15 +01:00 · bb6959a090
commit bb6959a090
parent d49d509451
1 changed files with 28 additions and 59 deletions
--- a/app/services/llm_service.py
+++ b/app/services/llm_service.py
@ -3,16 +3,14 @@ FILE: app/services/llm_service.py
 DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
             Verwaltet provider-spezifische Prompts und Background-Last.
             WP-20: Optimiertes Fallback-Management zum Schutz von Cloud-Quoten.
-             WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe.
+             WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter).
             WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs).
             FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts.
             WP-25: Integration der DecisionEngine für Agentic Multi-Stream RAG.
-VERSION: 3.4.1
+VERSION: 3.4.2 (WP-25: Ingest-Stability Patch)
 STATUS: Active
 FIX: 
- 100% Wiederherstellung der v3.3.9 Logik (Rate-Limits, Retries, Async-Threads).
+- Ingest-Stability: Entfernung des <5-Zeichen Guards (ermöglicht YES/NO Validierungen).
- Integration des WP-25 DecisionEngine Bridges in generate_rag_response.
+- OpenRouter-Fix: Sicherung gegen leere 'choices' zur Vermeidung von JSON-Errors.
- WP-25 Empty-Response-Guard für Cloud-Provider.
+- Erhalt der vollständigen v3.3.9 Logik für Rate-Limits, Retries und Background-Tasks.
 """
 import httpx
 import yaml
@ -99,17 +97,13 @@ class LLMService:
    def get_prompt(self, key: str, provider: str = None) -> str:
        """
        Hole provider-spezifisches Template mit intelligenter Text-Kaskade.
-        HINWEIS: Dies ist nur ein Text-Lookup und verbraucht kein API-Kontingent.
+        Kaskade: Gewählter Provider -> Gemini -> Ollama.
        Kaskade: Gewählter Provider -> Gemini (Cloud-Stil) -> Ollama (Basis-Stil).
        """
        active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
        data = self.prompts.get(key, "")
        if isinstance(data, dict):
            # Wir versuchen erst den Provider, dann Gemini, dann Ollama
            val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
            # Falls val durch YAML-Fehler immer noch ein Dict ist, extrahiere ersten String
            if isinstance(val, dict):
                logger.warning(f"⚠️ [LLMService] Nested dictionary detected for key '{key}'. Using first entry.")
                val = next(iter(val.values()), "") if val else ""
@ -132,8 +126,8 @@ class LLMService:
        strict_json_schema: bool = True
    ) -> str:
        """
-        Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung.
+        Haupteinstiegspunkt für LLM-Anfragen. 
-        Wendet die Bereinigung auf Text-Antworten an.
+        WP-25 FIX: Schwellenwert entfernt, um kurze Ingest-Validierungen (YES/NO) zu unterstützen.
        """
        target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
@ -151,8 +145,8 @@ class LLMService:
                json_schema, json_schema_name, strict_json_schema
            )
-        # WP-25 Empty Response Fix: Wenn Cloud-Provider leer antworten, Fallback auf Ollama
+        # WP-25 FIX: Nur noch auf absolut leere Antwort prüfen (ermöglicht YES/NO Antworten).
-        if (not res or len(res.strip()) < 5) and target_provider != "ollama":
+        if not res and target_provider != "ollama":
            logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Falling back to OLLAMA.")
            res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
@ -172,12 +166,8 @@ class LLMService:
        json_schema_name: str,
        strict_json_schema: bool
    ) -> str:
-        """
+        """Routet die Anfrage mit intelligenter Rate-Limit Erkennung."""
        Routet die Anfrage mit intelligenter Rate-Limit Erkennung.
        Nutzt max_retries um die Rate-Limit Schleife zu begrenzen.
        """
        rate_limit_attempts = 0
        # FIX: Wir nutzen max_retries als Limit für Rate-Limit Versuche, wenn explizit klein gewählt (z.B. Chat)
        max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
        wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)
@ -197,33 +187,24 @@ class LLMService:
                if provider == "gemini" and self.google_client:
                    return await self._execute_google(prompt, system, force_json, model_override)
                # Default/Fallback zu Ollama
                return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
            except Exception as e:
                err_str = str(e)
                # Intelligente 429 Erkennung
                is_rate_limit = any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited", "Too Many Requests"])
                if is_rate_limit and rate_limit_attempts < max_rate_retries:
                    rate_limit_attempts += 1
-                    logger.warning(
+                    logger.warning(f"⏳ Rate Limit from {provider}. Attempt {rate_limit_attempts}. Waiting {wait_time}s...")
                        f"⏳ [LLMService] Rate Limit detected from {provider}. "
                        f"Attempt {rate_limit_attempts}/{max_rate_retries}. Waiting {wait_time}s..."
                    )
                    await asyncio.sleep(wait_time)
                    continue 
                # Wenn kein Rate-Limit oder Retries erschöpft -> Fallback zu Ollama (falls aktiviert)
                if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
-                    logger.warning(
+                    logger.warning(f"🔄 Provider {provider} failed ({err_str}). Falling back to OLLAMA.")
                        f"🔄 Provider {provider} failed ({err_str}). Falling back to LOCAL OLLAMA."
                    )
                    return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay)
                raise e
    async def _execute_google(self, prompt, system, force_json, model_override):
        """Native Google SDK Integration (Gemini) mit v1 Fix."""
        model = model_override or self.settings.GEMINI_MODEL
        clean_model = model.replace("models/", "")
@ -250,7 +231,7 @@ class LLMService:
        json_schema_name: str = "mindnet_json",
        strict_json_schema: bool = True
    ) -> str:
-        """OpenRouter API Integration (OpenAI-kompatibel)."""
+        """OpenRouter API Integration. WP-25 FIX: Sicherung gegen leere 'choices'."""
        model = model_override or self.settings.OPENROUTER_MODEL
        messages = []
        if system:
@ -263,9 +244,7 @@ class LLMService:
                kwargs["response_format"] = {
                    "type": "json_schema",
                    "json_schema": {
-                        "name": json_schema_name,
+                        "name": json_schema_name, "strict": strict_json_schema, "schema": json_schema
                        "strict": strict_json_schema,
                        "schema": json_schema
                    }
                }
            else:
@ -276,23 +255,23 @@ class LLMService:
            messages=messages,
            **kwargs
        )
-        return response.choices[0].message.content.strip()
+        
        # WP-25 FIX: Sicherung gegen leere Antwort-Arrays
        if not response.choices or len(response.choices) == 0:
            logger.warning(f"🛰️ OpenRouter returned no choices for model {model}")
            return ""
        return response.choices[0].message.content.strip() if response.choices[0].message.content else ""
    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay):
        """Lokaler Ollama Call mit striktem Retry-Limit."""
        payload = {
            "model": self.settings.LLM_MODEL,
            "prompt": prompt,
            "stream": False,
-            "options": {
+            "options": {"temperature": 0.1 if force_json else 0.7, "num_ctx": 8192}
                "temperature": 0.1 if force_json else 0.7,
                "num_ctx": 8192 # Begrenzung für Stabilität (WP-20)
            }
        }
-        if force_json:
+        if force_json: payload["format"] = "json"
-            payload["format"] = "json"
+        if system: payload["system"] = system
        if system:
            payload["system"] = system
        attempt = 0
        while True:
@ -302,27 +281,17 @@ class LLMService:
                return res.json().get("response", "").strip()
            except Exception as e:
                attempt += 1
                # WICHTIG: Wenn max_retries=0 (Chat), bricht dies nach dem 1. Versuch (attempt=1) sofort ab.
                if attempt > max_retries:
-                    logger.error(f"❌ Ollama request failed after {attempt} attempt(s): {e}")
+                    logger.error(f"❌ Ollama request failed: {e}")
                    raise e
                wait_time = base_delay * (2 ** (attempt - 1))
                logger.warning(f"⚠️ Ollama attempt {attempt} failed. Retrying in {wait_time}s...")
                await asyncio.sleep(wait_time)
    async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
-        """
+        """WP-25: Orchestrierung via DecisionEngine."""
-        WP-25 UPDATE: Der primäre Einstiegspunkt für den MindNet Chat.
+        logger.info(f"🚀 [WP-25] Chat Query: {query[:50]}...")
        Delegiert nun an die DecisionEngine für Agentic Multi-Stream RAG.
        Falls context_str bereits vorhanden ist (Legacy), wird dieser ignoriert zugunsten
        der präzisen Multi-Stream Orchestrierung.
        """
        logger.info(f"🚀 [WP-25] Chat Query intercepted: {query[:50]}...")
        # Die DecisionEngine übernimmt nun das gesamte Management (Routing, Retrieval, Synthesis)
        return await self.decision_engine.ask(query)
    async def close(self):
        """Schließt die HTTP-Verbindungen."""
        if self.ollama_client:
            await self.ollama_client.aclose()