diff --git a/app/routers/chat.py b/app/routers/chat.py
index 8cb5bc9..984e810 100644
--- a/app/routers/chat.py
+++ b/app/routers/chat.py
@@ -1,9 +1,12 @@
 """
 FILE: app/routers/chat.py
 DESCRIPTION: Main chat interface (RAG & Interview). Contains the intent router (keywords/LLM) and prompt construction.
-VERSION: 2.7.3 (Debug & Deep Fallback Edition)
+VERSION: 2.7.8 (Full Unabridged Stability Edition)
 STATUS: Active
-FIX: Integrates extended debug logging to analyze context overflow and silent refusals.
+FIX:
+1. Implements context throttling for Ollama (MAX_OLLAMA_CHARS).
+2. Disables LLM retries for chat (max_retries=0).
+3. Fixes double-fallback loops and silent refusals.
 """
 
 from fastapi import APIRouter, HTTPException, Depends
@@ -205,7 +208,8 @@ async def _classify_intent(query: str, llm: LLMService) -> tuple[str, str]:
     logger.info("Keywords failed (or Question detected). Asking LLM for Intent...")
 
     try:
-        raw_response = await llm.generate_raw_response(prompt, priority="realtime")
+        # FIX: no retries during intent routing in the chat flow either
+        raw_response = await llm.generate_raw_response(prompt, priority="realtime", max_retries=0)
         llm_output_upper = raw_response.upper()
 
         if "INTERVIEW" in llm_output_upper or "CREATE" in llm_output_upper:
@@ -238,10 +242,11 @@ async def chat_endpoint(
         # Strategy Load
         strategy = get_decision_strategy(intent)
         prompt_key = strategy.get("prompt_template", "rag_template")
-        preferred_provider = strategy.get("preferred_provider") # Uses the configuration from decision_engine.yaml
+        preferred_provider = strategy.get("preferred_provider")
 
         sources_hits = []
         final_prompt = ""
+        context_str = ""
 
         if intent == "INTERVIEW":
             # --- INTERVIEW MODE ---
@@ -302,6 +307,14 @@ async def chat_endpoint(
                     hits.append(strat_hit)
 
             context_str = _build_enriched_context(hits) if hits else "Keine relevanten Notizen gefunden."
+
+            # --- STABILITY FIX: OLLAMA CONTEXT THROTTLE ---
+            # Limits the text to avoid the "decode: cannot decode batches" error.
+            MAX_OLLAMA_CHARS = 10000
+            if preferred_provider == "ollama" and len(context_str) > MAX_OLLAMA_CHARS:
+                logger.warning(f"⚠️ [{query_id}] Context zu groß für Ollama ({len(context_str)} chars). Kürze auf {MAX_OLLAMA_CHARS}.")
+                context_str = context_str[:MAX_OLLAMA_CHARS] + "\n[...gekürzt zur Stabilität...]"
+
             template = llm.get_prompt(prompt_key) or "{context_str}\n\n{query}"
 
             if prepend_instr:
@@ -315,31 +328,41 @@ async def chat_endpoint(
         if not final_prompt.strip():
             logger.error(f"[{query_id}] CRITICAL: Final prompt is empty before sending to LLM!")
 
-        # --- GENERATION WITH DEEP FALLBACK ---
+        # --- GENERATION WITH NO-RETRY & DEEP FALLBACK ---
         system_prompt = llm.get_prompt("system_prompt")
 
         # --- DEBUG SPOT 2: PRIMARY CALL ---
-        logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}'...")
+        logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}' (No Retries)...")
 
-        # 1. First attempt with the configured provider (e.g. Ollama for EMPATHY)
-        answer_text = await llm.generate_raw_response(
-            prompt=final_prompt,
-            system=system_prompt,
-            priority="realtime",
-            provider=preferred_provider
-        )
-
-        # DEEP FALLBACK: if the answer is empty (silent refusal in the cloud)
-        if not answer_text.strip() and preferred_provider != "ollama":
-            # --- DEBUG SPOT 3: FALLBACK TRIGGER ---
-            logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY. Triggering Deep Fallback to Ollama...")
-
+        answer_text = ""
+        try:
+            # FIX: max_retries=0 prevents hangs caused by retry cascades in the chat
             answer_text = await llm.generate_raw_response(
                 prompt=final_prompt,
                 system=system_prompt,
                 priority="realtime",
-                provider="ollama"
+                provider=preferred_provider,
+                max_retries=0
             )
+        except Exception as e:
+            logger.error(f"🛑 [{query_id}] Primary Provider '{preferred_provider}' failed: {e}")
+
+        # DEEP FALLBACK: if the answer is empty (silent refusal) or the primary provider crashed
+        if not answer_text.strip() and preferred_provider != "ollama":
+            # --- DEBUG SPOT 3: FALLBACK TRIGGER ---
+            logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY or FAILED. Triggering Deep Fallback to Ollama...")
+
+            try:
+                answer_text = await llm.generate_raw_response(
+                    prompt=final_prompt,
+                    system=system_prompt,
+                    priority="realtime",
+                    provider="ollama",
+                    max_retries=0
+                )
+            except Exception as e:
+                logger.error(f"🛑 [{query_id}] Deep Fallback to Ollama also failed: {e}")
+                answer_text = "Entschuldigung, das System ist aktuell überlastet. Bitte versuche es in einem Moment erneut."
 
         duration_ms = int((time.time() - start_time) * 1000)
 
@@ -365,4 +388,5 @@ async def chat_endpoint(
 
     except Exception as e:
        logger.error(f"Error in chat endpoint: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
\ No newline at end of file
+        # Return a user-friendly message instead of just the error stack
+        raise HTTPException(status_code=500, detail="Das System konnte die Anfrage nicht verarbeiten.")
\ No newline at end of file
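
For reference, a minimal standalone sketch of the stability pattern this patch introduces: context throttling for Ollama plus a single no-retry primary call with one deep fallback to Ollama. The generate_raw_response keyword arguments, the "ollama" provider name and the MAX_OLLAMA_CHARS value mirror the diff; the helper names throttle_context and generate_with_fallback, the logger name and the untyped llm parameter are illustrative assumptions and do not exist in the repository.

# Illustrative sketch only; mirrors the flow added to app/routers/chat.py above.
import logging

logger = logging.getLogger("chat_stability_sketch")

MAX_OLLAMA_CHARS = 10000  # same limit the diff introduces for Ollama contexts


def throttle_context(context_str: str, provider: str) -> str:
    """Trim oversized contexts before they are sent to Ollama."""
    if provider == "ollama" and len(context_str) > MAX_OLLAMA_CHARS:
        logger.warning("Context too large for Ollama (%d chars), truncating.", len(context_str))
        return context_str[:MAX_OLLAMA_CHARS] + "\n[...truncated for stability...]"
    return context_str


async def generate_with_fallback(llm, prompt: str, system: str, preferred_provider: str) -> str:
    """One attempt on the preferred provider (max_retries=0), then one fallback to Ollama."""
    answer_text = ""
    try:
        answer_text = await llm.generate_raw_response(
            prompt=prompt, system=system, priority="realtime",
            provider=preferred_provider, max_retries=0,
        )
    except Exception as exc:  # primary provider crashed
        logger.error("Primary provider %s failed: %s", preferred_provider, exc)

    # Deep fallback: empty answer (silent refusal) or a crashed primary provider.
    if not answer_text.strip() and preferred_provider != "ollama":
        try:
            answer_text = await llm.generate_raw_response(
                prompt=prompt, system=system, priority="realtime",
                provider="ollama", max_retries=0,
            )
        except Exception as exc:
            logger.error("Deep fallback to Ollama also failed: %s", exc)
            answer_text = "Entschuldigung, das System ist aktuell überlastet. Bitte versuche es in einem Moment erneut."
    return answer_text

With max_retries=0 on both calls, the endpoint makes at most one attempt per provider, so a hanging or failing provider can no longer multiply latency through retry cascades.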