Führt die Context-Drossel für Ollama (10.000 Zeichen) ein, deaktiviert die Retry-Kaskaden (max_retries=0) für den Echtzeit-Chat und macht die Fehlerbehandlung für die LLM-Aufrufe robuster.
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 6s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 6s
This commit is contained in:
parent
470e653da6
commit
cbaf664123
|
|
@ -1,9 +1,12 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/routers/chat.py
|
FILE: app/routers/chat.py
|
||||||
DESCRIPTION: Haupt-Chat-Interface (RAG & Interview). Enthält Intent-Router (Keywords/LLM) und Prompt-Construction.
|
DESCRIPTION: Haupt-Chat-Interface (RAG & Interview). Enthält Intent-Router (Keywords/LLM) und Prompt-Construction.
|
||||||
VERSION: 2.7.3 (Debug & Deep Fallback Edition)
|
VERSION: 2.7.8 (Full Unabridged Stability Edition)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
FIX: Integriert erweiterte Debug-Logs zur Analyse von Context-Overflow und Silent Refusals.
|
FIX:
|
||||||
|
1. Implementiert Context-Throttling für Ollama (MAX_OLLAMA_CHARS).
|
||||||
|
2. Deaktiviert LLM-Retries für den Chat (max_retries=0).
|
||||||
|
3. Behebt Double-Fallback-Schleifen und Silent Refusals.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Depends
|
from fastapi import APIRouter, HTTPException, Depends
|
||||||
|
|
@ -205,7 +208,8 @@ async def _classify_intent(query: str, llm: LLMService) -> tuple[str, str]:
|
||||||
logger.info("Keywords failed (or Question detected). Asking LLM for Intent...")
|
logger.info("Keywords failed (or Question detected). Asking LLM for Intent...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
raw_response = await llm.generate_raw_response(prompt, priority="realtime")
|
# FIX: Auch beim Routing keine Retries im Chat-Fluss
|
||||||
|
raw_response = await llm.generate_raw_response(prompt, priority="realtime", max_retries=0)
|
||||||
llm_output_upper = raw_response.upper()
|
llm_output_upper = raw_response.upper()
|
||||||
|
|
||||||
if "INTERVIEW" in llm_output_upper or "CREATE" in llm_output_upper:
|
if "INTERVIEW" in llm_output_upper or "CREATE" in llm_output_upper:
|
||||||
|
|
@ -238,10 +242,11 @@ async def chat_endpoint(
|
||||||
# Strategy Load
|
# Strategy Load
|
||||||
strategy = get_decision_strategy(intent)
|
strategy = get_decision_strategy(intent)
|
||||||
prompt_key = strategy.get("prompt_template", "rag_template")
|
prompt_key = strategy.get("prompt_template", "rag_template")
|
||||||
preferred_provider = strategy.get("preferred_provider") # Nutzt Konfiguration aus decision_engine.yaml
|
preferred_provider = strategy.get("preferred_provider")
|
||||||
|
|
||||||
sources_hits = []
|
sources_hits = []
|
||||||
final_prompt = ""
|
final_prompt = ""
|
||||||
|
context_str = ""
|
||||||
|
|
||||||
if intent == "INTERVIEW":
|
if intent == "INTERVIEW":
|
||||||
# --- INTERVIEW MODE ---
|
# --- INTERVIEW MODE ---
|
||||||
|
|
@ -302,6 +307,14 @@ async def chat_endpoint(
|
||||||
hits.append(strat_hit)
|
hits.append(strat_hit)
|
||||||
|
|
||||||
context_str = _build_enriched_context(hits) if hits else "Keine relevanten Notizen gefunden."
|
context_str = _build_enriched_context(hits) if hits else "Keine relevanten Notizen gefunden."
|
||||||
|
|
||||||
|
# --- STABILITY FIX: OLLAMA CONTEXT THROTTLE ---
|
||||||
|
# Begrenzt den Text, um den "decode: cannot decode batches" Fehler zu vermeiden.
|
||||||
|
MAX_OLLAMA_CHARS = 10000
|
||||||
|
if preferred_provider == "ollama" and len(context_str) > MAX_OLLAMA_CHARS:
|
||||||
|
logger.warning(f"⚠️ [{query_id}] Context zu groß für Ollama ({len(context_str)} chars). Kürze auf {MAX_OLLAMA_CHARS}.")
|
||||||
|
context_str = context_str[:MAX_OLLAMA_CHARS] + "\n[...gekürzt zur Stabilität...]"
|
||||||
|
|
||||||
template = llm.get_prompt(prompt_key) or "{context_str}\n\n{query}"
|
template = llm.get_prompt(prompt_key) or "{context_str}\n\n{query}"
|
||||||
|
|
||||||
if prepend_instr:
|
if prepend_instr:
|
||||||
|
|
@ -315,31 +328,41 @@ async def chat_endpoint(
|
||||||
if not final_prompt.strip():
|
if not final_prompt.strip():
|
||||||
logger.error(f"[{query_id}] CRITICAL: Final prompt is empty before sending to LLM!")
|
logger.error(f"[{query_id}] CRITICAL: Final prompt is empty before sending to LLM!")
|
||||||
|
|
||||||
# --- GENERATION MIT DEEP FALLBACK ---
|
# --- GENERATION WITH NO-RETRY & DEEP FALLBACK ---
|
||||||
system_prompt = llm.get_prompt("system_prompt")
|
system_prompt = llm.get_prompt("system_prompt")
|
||||||
|
|
||||||
# --- DEBUG SPOT 2: PRIMARY CALL ---
|
# --- DEBUG SPOT 2: PRIMARY CALL ---
|
||||||
logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}'...")
|
logger.info(f"[{query_id}] PRIMARY CALL: Sending request to provider '{preferred_provider}' (No Retries)...")
|
||||||
|
|
||||||
# 1. Versuch mit konfiguriertem Provider (z.B. Ollama für EMPATHY)
|
answer_text = ""
|
||||||
answer_text = await llm.generate_raw_response(
|
try:
|
||||||
prompt=final_prompt,
|
# FIX: max_retries=0 verhindert Hänger durch Retry-Kaskaden im Chat
|
||||||
system=system_prompt,
|
|
||||||
priority="realtime",
|
|
||||||
provider=preferred_provider
|
|
||||||
)
|
|
||||||
|
|
||||||
# DEEP FALLBACK: Wenn die Antwort leer ist (Silent Refusal in der Cloud)
|
|
||||||
if not answer_text.strip() and preferred_provider != "ollama":
|
|
||||||
# --- DEBUG SPOT 3: FALLBACK TRIGGER ---
|
|
||||||
logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY. Triggering Deep Fallback to Ollama...")
|
|
||||||
|
|
||||||
answer_text = await llm.generate_raw_response(
|
answer_text = await llm.generate_raw_response(
|
||||||
prompt=final_prompt,
|
prompt=final_prompt,
|
||||||
system=system_prompt,
|
system=system_prompt,
|
||||||
priority="realtime",
|
priority="realtime",
|
||||||
provider="ollama"
|
provider=preferred_provider,
|
||||||
|
max_retries=0
|
||||||
)
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"🛑 [{query_id}] Primary Provider '{preferred_provider}' failed: {e}")
|
||||||
|
|
||||||
|
# DEEP FALLBACK: Wenn die Antwort leer ist (Silent Refusal) oder der Primary abgestürzt ist
|
||||||
|
if not answer_text.strip() and preferred_provider != "ollama":
|
||||||
|
# --- DEBUG SPOT 3: FALLBACK TRIGGER ---
|
||||||
|
logger.warning(f"🛑 [{query_id}] PRIMARY '{preferred_provider}' returned EMPTY or FAILED. Triggering Deep Fallback to Ollama...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
answer_text = await llm.generate_raw_response(
|
||||||
|
prompt=final_prompt,
|
||||||
|
system=system_prompt,
|
||||||
|
priority="realtime",
|
||||||
|
provider="ollama",
|
||||||
|
max_retries=0
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"🛑 [{query_id}] Deep Fallback to Ollama also failed: {e}")
|
||||||
|
answer_text = "Entschuldigung, das System ist aktuell überlastet. Bitte versuche es in einem Moment erneut."
|
||||||
|
|
||||||
duration_ms = int((time.time() - start_time) * 1000)
|
duration_ms = int((time.time() - start_time) * 1000)
|
||||||
|
|
||||||
|
|
@ -365,4 +388,5 @@ async def chat_endpoint(
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in chat endpoint: {e}", exc_info=True)
|
logger.error(f"Error in chat endpoint: {e}", exc_info=True)
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
# Wir geben eine benutzerfreundliche Meldung zurück, statt nur den Error-Stack
|
||||||
|
raise HTTPException(status_code=500, detail="Das System konnte die Anfrage nicht verarbeiten.")
|
||||||
Loading…
Reference in New Issue
Block a user