"""
|
|
FILE: app/services/llm_service.py
|
|
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
|
WP-25a: Implementierung der Mixture of Experts (MoE) Profil-Steuerung.
|
|
VERSION: 3.5.0 (WP-25a: MoE & Profile Orchestration)
|
|
STATUS: Active
|
|
FIX:
|
|
- WP-25a: Profilbasiertes Routing via llm_profiles.yaml.
|
|
- WP-25a: Unterstützung individueller Temperaturen pro Experten-Profil.
|
|
- WP-25: Beibehaltung der Ingest-Stability (kein Schwellenwert für YES/NO).
|
|
- WP-25: Erhalt der vollständigen v3.4.2 Resilienz-Logik.
|
|
"""

import httpx
import yaml
import logging
import asyncio
import json

from google import genai
from google.genai import types
from openai import AsyncOpenAI
from pathlib import Path
from typing import Optional, Dict, Any, Literal
from app.config import get_settings

# Import of the provider-neutral text cleanup logic
from app.core.registry import clean_llm_text

logger = logging.getLogger(__name__)


class LLMService:
    # Process-wide throttle for background LLM jobs (shared by all instances)
    _background_semaphore = None

    def __init__(self):
        self.settings = get_settings()
        self.prompts = self._load_prompts()

        # WP-25a: load the central expert profiles
        self.profiles = self._load_llm_profiles()

        self._decision_engine = None

        if LLMService._background_semaphore is None:
            limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
            logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
            LLMService._background_semaphore = asyncio.Semaphore(limit)

        # 1. Local Ollama client
        self.ollama_client = httpx.AsyncClient(
            base_url=self.settings.OLLAMA_URL,
            timeout=httpx.Timeout(self.settings.LLM_TIMEOUT)
        )

        # 2. Google GenAI client
        self.google_client = None
        if self.settings.GOOGLE_API_KEY:
            self.google_client = genai.Client(
                api_key=self.settings.GOOGLE_API_KEY,
                http_options={'api_version': 'v1'}
            )
            logger.info("✨ LLMService: Google GenAI (Gemini) active.")

        # 3. OpenRouter client
        self.openrouter_client = None
        if self.settings.OPENROUTER_API_KEY:
            self.openrouter_client = AsyncOpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.settings.OPENROUTER_API_KEY,
                timeout=45.0
            )
            logger.info("🛰️ LLMService: OpenRouter Integration active.")
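    # The DecisionEngine is resolved lazily on first property access,
    # presumably to avoid a circular import with app.core.retrieval
    # (an assumption; the module does not state the reason).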
    @property
    def decision_engine(self):
        if self._decision_engine is None:
            from app.core.retrieval.decision_engine import DecisionEngine
            self._decision_engine = DecisionEngine()
        return self._decision_engine
    def _load_prompts(self) -> dict:
        path = Path(self.settings.PROMPTS_PATH)
        if not path.exists():
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {}
        except Exception as e:
            logger.error(f"❌ Failed to load prompts: {e}")
            return {}
    def _load_llm_profiles(self) -> dict:
        """WP-25a: Loads the central MoE profiles from llm_profiles.yaml."""
        # Use the path defined in settings (or the decision_engine default)
        path_str = getattr(self.settings, "LLM_PROFILES_PATH", "config/llm_profiles.yaml")
        path = Path(path_str)
        if not path.exists():
            logger.warning(f"⚠️ LLM Profiles file not found at {path}. System will use .env defaults.")
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            return data.get("profiles", {})
        except Exception as e:
            logger.error(f"❌ Failed to load llm_profiles.yaml: {e}")
            return {}
    def get_prompt(self, key: str, provider: Optional[str] = None) -> str:
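        # A prompts.yaml entry may be a plain string or a per-provider mapping
        # (illustrative sketch; the key name is hypothetical):
        #
        #   summarize_node:
        #     gemini: "You are ..."
        #     ollama: "You are ..."
        #
        # For mappings, the active provider's text is used, falling back to
        # "gemini", then "ollama".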
        active_provider = provider or self.settings.MINDNET_LLM_PROVIDER
        data = self.prompts.get(key, "")
        if isinstance(data, dict):
            val = data.get(active_provider, data.get("gemini", data.get("ollama", "")))
            return str(val)
        return str(data)
    async def generate_raw_response(
        self,
        prompt: str,
        system: Optional[str] = None,
        force_json: bool = False,
        max_retries: int = 2,
        base_delay: float = 2.0,
        priority: Literal["realtime", "background"] = "realtime",
        provider: Optional[str] = None,
        model_override: Optional[str] = None,
        json_schema: Optional[Dict[str, Any]] = None,
        json_schema_name: str = "mindnet_json",
        strict_json_schema: bool = True,
        profile_name: Optional[str] = None  # WP-25a
    ) -> str:
        """
        Main entry point for LLM requests with profile support.

        Resolution order: profile settings override the explicit
        provider/model arguments, which in turn override the .env defaults.
        """
        target_provider = provider
        target_model = model_override
        target_temp = None

        # WP-25a: profile resolution (provider, model, temperature)
        if profile_name and self.profiles:
            profile = self.profiles.get(profile_name)
            if profile:
                target_provider = profile.get("provider", target_provider)
                target_model = profile.get("model", target_model)
                target_temp = profile.get("temperature")
                logger.debug(f"🎭 MoE Call: Profile '{profile_name}' -> {target_provider}")

        # Fall back to the default provider if none was passed or defined
        if not target_provider:
            target_provider = self.settings.MINDNET_LLM_PROVIDER

        if priority == "background":
            # Background jobs share the class-wide semaphore to throttle load
            async with LLMService._background_semaphore:
                res = await self._dispatch(
                    target_provider, prompt, system, force_json,
                    max_retries, base_delay, target_model,
                    json_schema, json_schema_name, strict_json_schema, target_temp
                )
        else:
            res = await self._dispatch(
                target_provider, prompt, system, force_json,
                max_retries, base_delay, target_model,
                json_schema, json_schema_name, strict_json_schema, target_temp
            )

        # WP-25 fix: ingest stability (allows YES/NO answers without a
        # confidence threshold blocking them)
        if not res and target_provider != "ollama":
            logger.warning(f"⚠️ [WP-25] Empty response from {target_provider}. Fallback to OLLAMA.")
            res = await self._execute_ollama(prompt, system, force_json, max_retries, base_delay, target_temp)

        return clean_llm_text(res) if not force_json else res
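    # Usage sketch (hypothetical caller; profile names depend on the deployed
    # llm_profiles.yaml):
    #
    #   svc = LLMService()
    #   verdict = await svc.generate_raw_response(
    #       prompt="Is this chunk relevant? Answer YES or NO.",
    #       priority="background",
    #       profile_name="ingest_gatekeeper",
    #   )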
    async def _dispatch(
        self,
        provider: str,
        prompt: str,
        system: Optional[str],
        force_json: bool,
        max_retries: int,
        base_delay: float,
        model_override: Optional[str],
        json_schema: Optional[Dict[str, Any]],
        json_schema_name: str,
        strict_json_schema: bool,
        temperature: Optional[float] = None  # WP-25a
    ) -> str:
        """Routes the request to the target provider, with rate-limit detection."""
        rate_limit_attempts = 0
        max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
        wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)

        while rate_limit_attempts <= max_rate_retries:
            try:
                if provider == "openrouter" and self.openrouter_client:
                    return await self._execute_openrouter(
                        prompt=prompt, system=system, force_json=force_json,
                        model_override=model_override, json_schema=json_schema,
                        json_schema_name=json_schema_name, strict_json_schema=strict_json_schema,
                        temperature=temperature
                    )

                if provider == "gemini" and self.google_client:
                    return await self._execute_google(prompt, system, force_json, model_override, temperature)

                return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay, temperature)

            except Exception as e:
                # Heuristic rate-limit detection via the error message
                err_str = str(e)
                if any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited"]):
                    rate_limit_attempts += 1
                    logger.warning(f"⏳ Rate Limit {provider}. Attempt {rate_limit_attempts}. Wait {wait_time}s.")
                    await asyncio.sleep(wait_time)
                    continue

                # Any other error: fall back to Ollama if enabled
                if self.settings.LLM_FALLBACK_ENABLED and provider != "ollama":
                    return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay, temperature)
                raise

        # Rate-limit retries exhausted: return "" so the caller's WP-25
        # empty-response fallback can take over (instead of returning None).
        return ""
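    # _execute_google runs the synchronous google-genai client in a worker
    # thread (asyncio.to_thread) so the event loop stays responsive, and
    # bounds the whole call with an outer 45 s asyncio.wait_for timeout.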
    async def _execute_google(self, prompt, system, force_json, model_override, temperature):
        model = model_override or self.settings.GEMINI_MODEL
        clean_model = model.replace("models/", "")

        config_kwargs = {
            "system_instruction": system,
            "response_mime_type": "application/json" if force_json else "text/plain"
        }
        if temperature is not None:
            config_kwargs["temperature"] = temperature

        config = types.GenerateContentConfig(**config_kwargs)

        response = await asyncio.wait_for(
            asyncio.to_thread(
                self.google_client.models.generate_content,
                model=clean_model, contents=prompt, config=config
            ),
            timeout=45.0
        )
        # response.text can be None (e.g. no text parts in the candidates)
        return (response.text or "").strip()
    async def _execute_openrouter(
        self, prompt: str, system: Optional[str], force_json: bool,
        model_override: Optional[str], json_schema: Optional[Dict[str, Any]] = None,
        json_schema_name: str = "mindnet_json", strict_json_schema: bool = True,
        temperature: Optional[float] = None
    ) -> str:
        model = model_override or self.settings.OPENROUTER_MODEL
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        kwargs: Dict[str, Any] = {}
        if temperature is not None:
            kwargs["temperature"] = temperature

        if force_json:
            if json_schema:
                # Structured output: enforce the caller-supplied JSON schema
                kwargs["response_format"] = {
                    "type": "json_schema",
                    "json_schema": {"name": json_schema_name, "strict": strict_json_schema, "schema": json_schema}
                }
            else:
                # Generic JSON mode without a schema
                kwargs["response_format"] = {"type": "json_object"}

        response = await self.openrouter_client.chat.completions.create(
            model=model, messages=messages, **kwargs
        )

        if not response.choices:
            return ""

        return response.choices[0].message.content.strip() if response.choices[0].message.content else ""
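    # Note (assumption based on OpenRouter's documented behavior): the
    # "json_schema" response_format above is only honored by upstream models
    # that support structured outputs; "json_object" is the broader fallback.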
    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay, temperature=None):
        # WP-25a: use the profile temperature if given, otherwise the default
        # (low temperature for JSON extraction, higher for free-form text)
        effective_temp = temperature if temperature is not None else (0.1 if force_json else 0.7)

        payload = {
            "model": self.settings.LLM_MODEL,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": effective_temp, "num_ctx": 8192}
        }
        if force_json:
            payload["format"] = "json"
        if system:
            payload["system"] = system

        attempt = 0
        while True:
            try:
                res = await self.ollama_client.post("/api/generate", json=payload)
                res.raise_for_status()
                return res.json().get("response", "").strip()
            except Exception:
                attempt += 1
                if attempt > max_retries:
                    raise
                # Exponential backoff: base_delay * 2^(attempt - 1)
                await asyncio.sleep(base_delay * (2 ** (attempt - 1)))
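    # Backoff sketch: with the defaults (base_delay=2.0, max_retries=2) a
    # failing Ollama call is retried after 2 s, then 4 s, and the third
    # failure is re-raised to the caller.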
    async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
        """WP-25: Orchestration via the DecisionEngine.

        NOTE: context_str is currently ignored; the query is delegated
        wholesale to the DecisionEngine.
        """
        return await self.decision_engine.ask(query)
    async def close(self):
        if self.ollama_client:
            await self.ollama_client.aclose()
        # Also release the OpenRouter HTTP pool if it was created
        # (AsyncOpenAI exposes an async close()).
        if self.openrouter_client:
            await self.openrouter_client.close()