WP06 #4
|
|
@ -1,5 +1,6 @@
|
|||
"""
|
||||
app/routers/chat.py — RAG Endpunkt (WP-06 Hybrid Router)
|
||||
Version: 0.2.1 (Fix: System Prompt Separation)
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
|
|
@ -23,7 +24,6 @@ logger = logging.getLogger(__name__)
|
|||
_DECISION_CONFIG_CACHE = None
|
||||
|
||||
def _load_decision_config() -> Dict[str, Any]:
|
||||
"""Lädt die Decision-Engine Konfiguration (Late Binding)."""
|
||||
settings = get_settings()
|
||||
path = Path(settings.DECISION_CONFIG_PATH)
|
||||
default_config = {
|
||||
|
|
@ -88,11 +88,6 @@ def _build_enriched_context(hits: List[QueryHit]) -> str:
|
|||
return "\n\n".join(context_parts)
|
||||
|
||||
async def _classify_intent(query: str, llm: LLMService) -> str:
|
||||
"""
|
||||
Hybrid Router:
|
||||
1. Keyword Check (Best/Longest Match) -> FAST
|
||||
2. LLM Fallback (wenn in config aktiv) -> SMART
|
||||
"""
|
||||
config = get_full_config()
|
||||
strategies = config.get("strategies", {})
|
||||
settings = config.get("settings", {})
|
||||
|
|
@ -101,7 +96,7 @@ async def _classify_intent(query: str, llm: LLMService) -> str:
|
|||
best_intent = None
|
||||
max_match_length = 0
|
||||
|
||||
# 1. FAST PATH: Keywords
|
||||
# 1. FAST PATH
|
||||
for intent_name, strategy in strategies.items():
|
||||
if intent_name == "FACT": continue
|
||||
keywords = strategy.get("trigger_keywords", [])
|
||||
|
|
@ -115,23 +110,21 @@ async def _classify_intent(query: str, llm: LLMService) -> str:
|
|||
logger.info(f"Intent detected via KEYWORD: {best_intent}")
|
||||
return best_intent
|
||||
|
||||
# 2. SLOW PATH: LLM Router
|
||||
# 2. SLOW PATH
|
||||
if settings.get("llm_fallback_enabled", False):
|
||||
router_prompt_template = settings.get("llm_router_prompt", "")
|
||||
if router_prompt_template:
|
||||
prompt = router_prompt_template.replace("{query}", query)
|
||||
logger.info("Keywords failed. Asking LLM for Intent...")
|
||||
|
||||
# Kurzer Raw Call
|
||||
# Router braucht keinen System-Prompt, nur den Classifier-Prompt
|
||||
llm_decision = await llm.generate_raw_response(prompt)
|
||||
|
||||
# Cleaning
|
||||
llm_decision = llm_decision.strip().upper()
|
||||
if ":" in llm_decision:
|
||||
llm_decision = llm_decision.split(":")[-1].strip()
|
||||
|
||||
# Validierung: Nur bekannte Intents zulassen
|
||||
# Entferne Satzzeichen
|
||||
# Satzzeichen entfernen für sauberen Match
|
||||
llm_decision = ''.join(filter(str.isalnum, llm_decision))
|
||||
|
||||
if llm_decision in strategies:
|
||||
|
|
@ -185,7 +178,6 @@ async def chat_endpoint(
|
|||
)
|
||||
strategy_result = await retriever.search(strategy_req)
|
||||
|
||||
# Merge
|
||||
existing_ids = {h.node_id for h in hits}
|
||||
for strat_hit in strategy_result.results:
|
||||
if strat_hit.node_id not in existing_ids:
|
||||
|
|
@ -198,27 +190,18 @@ async def chat_endpoint(
|
|||
context_str = _build_enriched_context(hits)
|
||||
|
||||
# 5. Generation
|
||||
# Wir laden das Template aus dem Service (da dort die prompts.yaml geladen ist)
|
||||
template = llm.prompts.get(prompt_key, "{context_str}\n\n{query}")
|
||||
system_prompt = llm.prompts.get("system_prompt", "")
|
||||
|
||||
if prepend_instr:
|
||||
context_str = f"{prepend_instr}\n\n{context_str}"
|
||||
|
||||
# Manuelles Bauen des finalen Prompts für volle Kontrolle
|
||||
final_prompt = template.replace("{context_str}", context_str).replace("{query}", request.message)
|
||||
|
||||
# Aufruf via Raw Response (da wir den Prompt schon fertig haben)
|
||||
# Wir müssen den System-Prompt manuell mitgeben?
|
||||
# generate_raw_response in llm_service unterstützt aktuell kein 'system'.
|
||||
# -> Wir erweitern generate_raw_response oder nutzen einen Hack: System + Prompt.
|
||||
|
||||
# SAUBERER WEG: Wir bauen den Payload für Ollama hier manuell zusammen und rufen eine generische Methode.
|
||||
# Da LLMService.generate_raw_response keine System-Msg nimmt, packen wir sie davor.
|
||||
full_text_prompt = f"{system_prompt}\n\n{final_prompt}"
|
||||
|
||||
logger.info(f"[{query_id}] Sending to LLM (Intent: {intent}, Template: {prompt_key})...")
|
||||
answer_text = await llm.generate_raw_response(full_text_prompt)
|
||||
|
||||
# FIX: System-Prompt separat übergeben!
|
||||
answer_text = await llm.generate_raw_response(prompt=final_prompt, system=system_prompt)
|
||||
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""
|
||||
app/services/llm_service.py — LLM Client (Ollama)
|
||||
Version: 0.2.0 (WP-06 Hybrid Router Support)
|
||||
Version: 0.2.1 (Fix: System Prompt Handling for Phi-3)
|
||||
"""
|
||||
|
||||
import httpx
|
||||
|
|
@ -17,7 +17,6 @@ class LLMService:
|
|||
self.settings = get_settings()
|
||||
self.prompts = self._load_prompts()
|
||||
|
||||
# Timeout aus Config nutzen (Default 120s)
|
||||
self.client = httpx.AsyncClient(
|
||||
base_url=self.settings.OLLAMA_URL,
|
||||
timeout=self.settings.LLM_TIMEOUT
|
||||
|
|
@ -34,21 +33,27 @@ class LLMService:
|
|||
logger.error(f"Failed to load prompts: {e}")
|
||||
return {}
|
||||
|
||||
async def generate_raw_response(self, prompt: str) -> str:
|
||||
async def generate_raw_response(self, prompt: str, system: str = None) -> str:
|
||||
"""
|
||||
NEU: Führt einen direkten LLM Call ohne RAG-Template aus.
|
||||
Wird vom Router für die Antwortgenerierung genutzt.
|
||||
Führt einen LLM Call aus.
|
||||
Unterstützt nun explizite System-Prompts für sauberes Templating.
|
||||
"""
|
||||
payload = {
|
||||
"model": self.settings.LLM_MODEL,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": 0.0,
|
||||
"num_ctx": 512
|
||||
# Temperature etwas höher für Empathie, niedriger für Code?
|
||||
# Wir lassen es auf Standard, oder steuern es später via Config.
|
||||
"temperature": 0.7,
|
||||
"num_ctx": 2048
|
||||
}
|
||||
}
|
||||
|
||||
# WICHTIG: System-Prompt separat übergeben, damit Ollama formatiert
|
||||
if system:
|
||||
payload["system"] = system
|
||||
|
||||
try:
|
||||
response = await self.client.post("/api/generate", json=payload)
|
||||
if response.status_code != 200:
|
||||
|
|
@ -63,14 +68,13 @@ class LLMService:
|
|||
return "Interner LLM Fehler."
|
||||
|
||||
async def generate_rag_response(self, query: str, context_str: str) -> str:
|
||||
"""Legacy Support / Fallback"""
|
||||
"""Legacy Support"""
|
||||
system_prompt = self.prompts.get("system_prompt", "")
|
||||
rag_template = self.prompts.get("rag_template", "{context_str}\n\n{query}")
|
||||
final_prompt = rag_template.format(context_str=context_str, query=query)
|
||||
|
||||
# Wir nutzen intern nun auch raw_response, um Code zu sparen
|
||||
full_prompt = f"{system_prompt}\n\n{final_prompt}"
|
||||
return await self.generate_raw_response(full_prompt)
|
||||
# Leite an die neue Methode weiter
|
||||
return await self.generate_raw_response(final_prompt, system=system_prompt)
|
||||
|
||||
async def close(self):
|
||||
await self.client.aclose()
|
||||
Loading…
Reference in New Issue
Block a user