WP06 #4
|
|
@ -1,5 +1,6 @@
|
||||||
"""
|
"""
|
||||||
app/routers/chat.py — RAG Endpunkt (WP-06 Hybrid Router)
|
app/routers/chat.py — RAG Endpunkt (WP-06 Hybrid Router)
|
||||||
|
Version: 0.2.1 (Fix: System Prompt Separation)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Depends
|
from fastapi import APIRouter, HTTPException, Depends
|
||||||
|
|
@ -23,7 +24,6 @@ logger = logging.getLogger(__name__)
|
||||||
_DECISION_CONFIG_CACHE = None
|
_DECISION_CONFIG_CACHE = None
|
||||||
|
|
||||||
def _load_decision_config() -> Dict[str, Any]:
|
def _load_decision_config() -> Dict[str, Any]:
|
||||||
"""Lädt die Decision-Engine Konfiguration (Late Binding)."""
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
path = Path(settings.DECISION_CONFIG_PATH)
|
path = Path(settings.DECISION_CONFIG_PATH)
|
||||||
default_config = {
|
default_config = {
|
||||||
|
|
@ -88,11 +88,6 @@ def _build_enriched_context(hits: List[QueryHit]) -> str:
|
||||||
return "\n\n".join(context_parts)
|
return "\n\n".join(context_parts)
|
||||||
|
|
||||||
async def _classify_intent(query: str, llm: LLMService) -> str:
|
async def _classify_intent(query: str, llm: LLMService) -> str:
|
||||||
"""
|
|
||||||
Hybrid Router:
|
|
||||||
1. Keyword Check (Best/Longest Match) -> FAST
|
|
||||||
2. LLM Fallback (wenn in config aktiv) -> SMART
|
|
||||||
"""
|
|
||||||
config = get_full_config()
|
config = get_full_config()
|
||||||
strategies = config.get("strategies", {})
|
strategies = config.get("strategies", {})
|
||||||
settings = config.get("settings", {})
|
settings = config.get("settings", {})
|
||||||
|
|
@ -101,7 +96,7 @@ async def _classify_intent(query: str, llm: LLMService) -> str:
|
||||||
best_intent = None
|
best_intent = None
|
||||||
max_match_length = 0
|
max_match_length = 0
|
||||||
|
|
||||||
# 1. FAST PATH: Keywords
|
# 1. FAST PATH
|
||||||
for intent_name, strategy in strategies.items():
|
for intent_name, strategy in strategies.items():
|
||||||
if intent_name == "FACT": continue
|
if intent_name == "FACT": continue
|
||||||
keywords = strategy.get("trigger_keywords", [])
|
keywords = strategy.get("trigger_keywords", [])
|
||||||
|
|
@ -115,23 +110,21 @@ async def _classify_intent(query: str, llm: LLMService) -> str:
|
||||||
logger.info(f"Intent detected via KEYWORD: {best_intent}")
|
logger.info(f"Intent detected via KEYWORD: {best_intent}")
|
||||||
return best_intent
|
return best_intent
|
||||||
|
|
||||||
# 2. SLOW PATH: LLM Router
|
# 2. SLOW PATH
|
||||||
if settings.get("llm_fallback_enabled", False):
|
if settings.get("llm_fallback_enabled", False):
|
||||||
router_prompt_template = settings.get("llm_router_prompt", "")
|
router_prompt_template = settings.get("llm_router_prompt", "")
|
||||||
if router_prompt_template:
|
if router_prompt_template:
|
||||||
prompt = router_prompt_template.replace("{query}", query)
|
prompt = router_prompt_template.replace("{query}", query)
|
||||||
logger.info("Keywords failed. Asking LLM for Intent...")
|
logger.info("Keywords failed. Asking LLM for Intent...")
|
||||||
|
|
||||||
# Kurzer Raw Call
|
# Router braucht keinen System-Prompt, nur den Classifier-Prompt
|
||||||
llm_decision = await llm.generate_raw_response(prompt)
|
llm_decision = await llm.generate_raw_response(prompt)
|
||||||
|
|
||||||
# Cleaning
|
|
||||||
llm_decision = llm_decision.strip().upper()
|
llm_decision = llm_decision.strip().upper()
|
||||||
if ":" in llm_decision:
|
if ":" in llm_decision:
|
||||||
llm_decision = llm_decision.split(":")[-1].strip()
|
llm_decision = llm_decision.split(":")[-1].strip()
|
||||||
|
|
||||||
# Validierung: Nur bekannte Intents zulassen
|
# Satzzeichen entfernen für sauberen Match
|
||||||
# Entferne Satzzeichen
|
|
||||||
llm_decision = ''.join(filter(str.isalnum, llm_decision))
|
llm_decision = ''.join(filter(str.isalnum, llm_decision))
|
||||||
|
|
||||||
if llm_decision in strategies:
|
if llm_decision in strategies:
|
||||||
|
|
@ -185,7 +178,6 @@ async def chat_endpoint(
|
||||||
)
|
)
|
||||||
strategy_result = await retriever.search(strategy_req)
|
strategy_result = await retriever.search(strategy_req)
|
||||||
|
|
||||||
# Merge
|
|
||||||
existing_ids = {h.node_id for h in hits}
|
existing_ids = {h.node_id for h in hits}
|
||||||
for strat_hit in strategy_result.results:
|
for strat_hit in strategy_result.results:
|
||||||
if strat_hit.node_id not in existing_ids:
|
if strat_hit.node_id not in existing_ids:
|
||||||
|
|
@ -198,27 +190,18 @@ async def chat_endpoint(
|
||||||
context_str = _build_enriched_context(hits)
|
context_str = _build_enriched_context(hits)
|
||||||
|
|
||||||
# 5. Generation
|
# 5. Generation
|
||||||
# Wir laden das Template aus dem Service (da dort die prompts.yaml geladen ist)
|
|
||||||
template = llm.prompts.get(prompt_key, "{context_str}\n\n{query}")
|
template = llm.prompts.get(prompt_key, "{context_str}\n\n{query}")
|
||||||
system_prompt = llm.prompts.get("system_prompt", "")
|
system_prompt = llm.prompts.get("system_prompt", "")
|
||||||
|
|
||||||
if prepend_instr:
|
if prepend_instr:
|
||||||
context_str = f"{prepend_instr}\n\n{context_str}"
|
context_str = f"{prepend_instr}\n\n{context_str}"
|
||||||
|
|
||||||
# Manuelles Bauen des finalen Prompts für volle Kontrolle
|
|
||||||
final_prompt = template.replace("{context_str}", context_str).replace("{query}", request.message)
|
final_prompt = template.replace("{context_str}", context_str).replace("{query}", request.message)
|
||||||
|
|
||||||
# Aufruf via Raw Response (da wir den Prompt schon fertig haben)
|
|
||||||
# Wir müssen den System-Prompt manuell mitgeben?
|
|
||||||
# generate_raw_response in llm_service unterstützt aktuell kein 'system'.
|
|
||||||
# -> Wir erweitern generate_raw_response oder nutzen einen Hack: System + Prompt.
|
|
||||||
|
|
||||||
# SAUBERER WEG: Wir bauen den Payload für Ollama hier manuell zusammen und rufen eine generische Methode.
|
|
||||||
# Da LLMService.generate_raw_response keine System-Msg nimmt, packen wir sie davor.
|
|
||||||
full_text_prompt = f"{system_prompt}\n\n{final_prompt}"
|
|
||||||
|
|
||||||
logger.info(f"[{query_id}] Sending to LLM (Intent: {intent}, Template: {prompt_key})...")
|
logger.info(f"[{query_id}] Sending to LLM (Intent: {intent}, Template: {prompt_key})...")
|
||||||
answer_text = await llm.generate_raw_response(full_text_prompt)
|
|
||||||
|
# FIX: System-Prompt separat übergeben!
|
||||||
|
answer_text = await llm.generate_raw_response(prompt=final_prompt, system=system_prompt)
|
||||||
|
|
||||||
duration_ms = int((time.time() - start_time) * 1000)
|
duration_ms = int((time.time() - start_time) * 1000)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""
|
"""
|
||||||
app/services/llm_service.py — LLM Client (Ollama)
|
app/services/llm_service.py — LLM Client (Ollama)
|
||||||
Version: 0.2.0 (WP-06 Hybrid Router Support)
|
Version: 0.2.1 (Fix: System Prompt Handling for Phi-3)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
@ -17,7 +17,6 @@ class LLMService:
|
||||||
self.settings = get_settings()
|
self.settings = get_settings()
|
||||||
self.prompts = self._load_prompts()
|
self.prompts = self._load_prompts()
|
||||||
|
|
||||||
# Timeout aus Config nutzen (Default 120s)
|
|
||||||
self.client = httpx.AsyncClient(
|
self.client = httpx.AsyncClient(
|
||||||
base_url=self.settings.OLLAMA_URL,
|
base_url=self.settings.OLLAMA_URL,
|
||||||
timeout=self.settings.LLM_TIMEOUT
|
timeout=self.settings.LLM_TIMEOUT
|
||||||
|
|
@ -34,21 +33,27 @@ class LLMService:
|
||||||
logger.error(f"Failed to load prompts: {e}")
|
logger.error(f"Failed to load prompts: {e}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
async def generate_raw_response(self, prompt: str) -> str:
|
async def generate_raw_response(self, prompt: str, system: str = None) -> str:
|
||||||
"""
|
"""
|
||||||
NEU: Führt einen direkten LLM Call ohne RAG-Template aus.
|
Führt einen LLM Call aus.
|
||||||
Wird vom Router für die Antwortgenerierung genutzt.
|
Unterstützt nun explizite System-Prompts für sauberes Templating.
|
||||||
"""
|
"""
|
||||||
payload = {
|
payload = {
|
||||||
"model": self.settings.LLM_MODEL,
|
"model": self.settings.LLM_MODEL,
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"options": {
|
||||||
"temperature": 0.0,
|
# Temperature etwas höher für Empathie, niedriger für Code?
|
||||||
"num_ctx": 512
|
# Wir lassen es auf Standard, oder steuern es später via Config.
|
||||||
|
"temperature": 0.7,
|
||||||
|
"num_ctx": 2048
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# WICHTIG: System-Prompt separat übergeben, damit Ollama formatiert
|
||||||
|
if system:
|
||||||
|
payload["system"] = system
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = await self.client.post("/api/generate", json=payload)
|
response = await self.client.post("/api/generate", json=payload)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
|
|
@ -63,14 +68,13 @@ class LLMService:
|
||||||
return "Interner LLM Fehler."
|
return "Interner LLM Fehler."
|
||||||
|
|
||||||
async def generate_rag_response(self, query: str, context_str: str) -> str:
|
async def generate_rag_response(self, query: str, context_str: str) -> str:
|
||||||
"""Legacy Support / Fallback"""
|
"""Legacy Support"""
|
||||||
system_prompt = self.prompts.get("system_prompt", "")
|
system_prompt = self.prompts.get("system_prompt", "")
|
||||||
rag_template = self.prompts.get("rag_template", "{context_str}\n\n{query}")
|
rag_template = self.prompts.get("rag_template", "{context_str}\n\n{query}")
|
||||||
final_prompt = rag_template.format(context_str=context_str, query=query)
|
final_prompt = rag_template.format(context_str=context_str, query=query)
|
||||||
|
|
||||||
# Wir nutzen intern nun auch raw_response, um Code zu sparen
|
# Leite an die neue Methode weiter
|
||||||
full_prompt = f"{system_prompt}\n\n{final_prompt}"
|
return await self.generate_raw_response(final_prompt, system=system_prompt)
|
||||||
return await self.generate_raw_response(full_prompt)
|
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self.client.aclose()
|
await self.client.aclose()
|
||||||
Loading…
Reference in New Issue
Block a user