Dieser Commit führt die Context-Drossel (10.000 Zeichen), die Deaktivierung der Retry-Kaskaden (max_retries=0) für den Echtzeit-Chat und eine robustere Fehlerbehandlung für die LLM-Aufrufe ein.
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 6s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 6s
This commit is contained in:
parent
470e653da6
commit
cbaf664123
|
|
@@ -1,9 +1,12 @@
|
|||
"""
|
||||
FILE: app/routers/chat.py
|
||||
DESCRIPTION: Haupt-Chat-Interface (RAG & Interview). Enthält Intent-Router (Keywords/LLM) und Prompt-Construction.
|
||||
VERSION: 2.7.3 (Debug & Deep Fallback Edition)
|
||||
VERSION: 2.7.8 (Full Unabridged Stability Edition)
|
||||
STATUS: Active
|
||||
FIX: Integriert erweiterte Debug-Logs zur Analyse von Context-Overflow und Silent Refusals.
|
||||
FIX:
|
||||
1. Implementiert Context-Throttling für Ollama (MAX_OLLAMA_CHARS).
|
||||
2. Deaktiviert LLM-Retries für den Chat (max_retries=0).
|
||||
3. Behebt Double-Fallback-Schleifen und Silent Refusals.
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
|
|
@@ -205,7 +208,8 @@ async def _classify_intent(query: str, llm: LLMService) -> tuple[str, str]:
|
|||
logger.info("Keywords failed (or Question detected). Asking LLM for Intent...")
|
||||
|
||||
try:
|
||||
raw_response = await llm.generate_raw_response(prompt, priority="realtime")
|
||||
# FIX: Auch beim Routing keine Retries im Chat-Fluss
|
||||
raw_response = await llm.generate_raw_response(prompt, priority="realtime", max_retries=0)
|
||||
llm_output_upper = raw_response.upper()
|
||||
|
||||
if "INTERVIEW" in llm_output_upper or "CREATE" in llm_output_upper:
|
||||
|
|
@@ -238,10 +242,11 @@ async def chat_endpoint(
|
|||
# Strategy Load
|
||||
strategy = get_decision_strategy(intent)
|
||||
prompt_key = strategy.get("prompt_template", "rag_template")
|
||||
preferred_provider = strategy.get("preferred_provider") # Nutzt Konfiguration aus decision_engine.yaml
|
||||
preferred_provider = strategy.get("preferred_provider")
|
||||
|
||||
sources_hits = []
|
||||
final_prompt = ""
|
||||
context_str = ""
|
||||
|
||||
if intent == "INTERVIEW":
|
||||
# --- INTERVIEW MODE ---
|
||||
|
|
@@ -302,6 +307,14 @@ async def chat_endpoint(
|
|||
hits.append(strat_hit)
|
||||
|
||||
context_str = _build_enriched_context(hits) if hits else "Keine relevanten Notizen gefunden."
|
||||
|
||||
# --- STABILITY FIX: OLLAMA CONTEXT THROTTLE ---
|
||||
# Begrenzt den Text, um den "decode: cannot decode batches" Fehler zu vermeiden.
|
||||
MAX_OLLAMA_CHARS = 10000
|
||||
if preferred_provider == "ollama" and len(context_str) > MAX_OLLAMA_CHARS:
|
||||
logger.warning(f"⚠️ [{query_id}] Context zu groß für Ollama ({len(context_str)} chars). Kürze auf {MAX_OLLAMA_CHARS}.")
|
||||
context_str = context_str[:MAX_OLLAMA_CHARS] + "\n[...gekürzt zur Stabilität...]"
|
||||
|
||||
template = llm.get_prompt(prompt_key) or "{context_str}\n\n{query}"
|
||||
|
||||
if prepend_instr:
|
||||
|
|
@@ -315,31 +328,41 @@ async def chat_endpoint(
|
|||
if not final_prompt.strip():
|
||||
logger.error(f"[{query_id}] CRITICAL: Final prompt is empty before sending to LLM!")
|
||||
|
||||
# --- GENERATION MIT DEEP FALLBACK ---
|
||||
# --- GENERATION WITH NO-RETRY & DEEP FALLBACK ---
|
||||
system_prompt = llm.get_prompt("system_prompt")
|
||||
|
||||
# --- DEBUG SPOT 2: PRIMARY CALL ---
|
||||
logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}'...")
|
||||
logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}' (No Retries)...")
|
||||
|
||||
# 1. Versuch mit konfiguriertem Provider (z.B. Ollama für EMPATHY)
|
||||
answer_text = await llm.generate_raw_response(
|
||||
prompt=final_prompt,
|
||||
system=system_prompt,
|
||||
priority="realtime",
|
||||
provider=preferred_provider
|
||||
)
|
||||
|
||||
# DEEP FALLBACK: Wenn die Antwort leer ist (Silent Refusal in der Cloud)
|
||||
if not answer_text.strip() and preferred_provider != "ollama":
|
||||
# --- DEBUG SPOT 3: FALLBACK TRIGGER ---
|
||||
logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY. Triggering Deep Fallback to Ollama...")
|
||||
|
||||
answer_text = ""
|
||||
try:
|
||||
# FIX: max_retries=0 verhindert Hänger durch Retry-Kaskaden im Chat
|
||||
answer_text = await llm.generate_raw_response(
|
||||
prompt=final_prompt,
|
||||
system=system_prompt,
|
||||
priority="realtime",
|
||||
provider="ollama"
|
||||
provider=preferred_provider,
|
||||
max_retries=0
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"🛑 [{query_id}] Primary Provider '{preferred_provider}' failed: {e}")
|
||||
|
||||
# DEEP FALLBACK: Wenn die Antwort leer ist (Silent Refusal) oder der Primary abgestürzt ist
|
||||
if not answer_text.strip() and preferred_provider != "ollama":
|
||||
# --- DEBUG SPOT 3: FALLBACK TRIGGER ---
|
||||
logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY or FAILED. Triggering Deep Fallback to Ollama...")
|
||||
|
||||
try:
|
||||
answer_text = await llm.generate_raw_response(
|
||||
prompt=final_prompt,
|
||||
system=system_prompt,
|
||||
priority="realtime",
|
||||
provider="ollama",
|
||||
max_retries=0
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"🛑 [{query_id}] Deep Fallback to Ollama also failed: {e}")
|
||||
answer_text = "Entschuldigung, das System ist aktuell überlastet. Bitte versuche es in einem Moment erneut."
|
||||
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
|
|
@@ -365,4 +388,5 @@ async def chat_endpoint(
|
|||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in chat endpoint: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
# Wir geben eine benutzerfreundliche Meldung zurück, statt nur den Error-Stack
|
||||
raise HTTPException(status_code=500, detail="Das System konnte die Anfrage nicht verarbeiten.")
|
||||
Loading…
Reference in New Issue
Block a user