mindnet/app/services/semantic_analyzer.py

166 lines
6.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FILE: app/services/semantic_analyzer.py
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
WP-20 Fix: Kompatibilität mit Provider-basierten Prompt-Dictionaries (Hybrid-Modus).
VERSION: 2.2.0
STATUS: Active
DEPENDENCIES: app.services.llm_service, json, logging
LAST_ANALYSIS: 2025-12-23
"""
import json
import logging
from typing import List, Optional
from dataclasses import dataclass
# Importe
from app.services.llm_service import LLMService
logger = logging.getLogger(__name__)


class SemanticAnalyzer:
    """LLM-backed filter that decides which candidate edges belong to a chunk.

    An edge is a string of the form ``"kind:target"``. The analyzer sends a
    text chunk together with a list of candidate edges to the LLM service and
    keeps only those entries of the reply that look like well-formed edges.
    """

    # Maximum number of chunk characters inserted into the prompt.
    MAX_CHUNK_CHARS = 3500

    # Dict keys that the LLM sometimes uses to wrap the actual result list.
    _CONTAINER_KEYS = frozenset({"edges", "results", "kanten", "matches"})

    # Used when the configured prompt cannot be resolved to a plain string.
    _FALLBACK_PROMPT = (
        "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
        "TEXT: {chunk_text}\n"
        "KANDIDATEN: {edge_list}\n"
        "OUTPUT: JSON Liste von Strings [\"kind:target\"]."
    )

    def __init__(self):
        self.llm = LLMService()

    def _is_valid_edge_string(self, edge_str: str) -> bool:
        """Return True if ``edge_str`` is a plausible ``'kind:target'`` edge.

        This keeps free-form LLM chatter from slipping through as an edge.
        Rules, applied to the text before/after the first colon (stripped):

        * the value must be a ``str`` containing a colon,
        * ``kind`` must not contain any whitespace,
        * ``kind`` must be 2 to 40 characters long,
        * ``target`` must not be empty.
        """
        if not isinstance(edge_str, str) or ":" not in edge_str:
            return False

        kind, _, target = edge_str.partition(":")
        kind = kind.strip()
        target = target.strip()

        # Any whitespace (not just a plain space) means this is prose,
        # not a relation type.
        if any(ch.isspace() for ch in kind):
            return False
        if not 2 <= len(kind) <= 40:
            return False
        return bool(target)

    @classmethod
    def _collect_raw_candidates(cls, data) -> list:
        """Flatten the decoded LLM reply into a list of edge candidates.

        A list is returned unchanged. A dict is repaired on a best-effort
        basis: lists stored under a known container key are taken as-is,
        every other ``key: str`` or ``key: [str, ...]`` entry is turned into
        ``"key:value"`` strings. Any other JSON value yields no candidates.
        """
        if isinstance(data, list):
            return data
        if not isinstance(data, dict):
            return []

        logger.info(" [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur.")
        candidates = []
        for key, val in data.items():
            if key.lower() in cls._CONTAINER_KEYS and isinstance(val, list):
                candidates.extend(val)
            elif isinstance(val, str):
                candidates.append(f"{key}:{val}")
            elif isinstance(val, list):
                candidates.extend(
                    f"{key}:{target}" for target in val if isinstance(target, str)
                )
        return candidates

    def _build_prompt(self, chunk_text: str, all_edges: List[str]):
        """Render the edge-allocation prompt, or return None if it cannot be formatted."""
        # get_prompt is expected to resolve provider-specific prompt
        # dictionaries to a string; anything that is not a non-empty string
        # is replaced by the built-in fallback.
        prompt_template = self.llm.get_prompt("edge_allocation_template")
        if not prompt_template or not isinstance(prompt_template, str):
            logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' konnte nicht als String geladen werden. Nutze Hard-Fallback.")
            prompt_template = self._FALLBACK_PROMPT

        edges_str = "\n".join(f"- {e}" for e in all_edges)

        logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")

        try:
            return prompt_template.format(
                chunk_text=chunk_text[:self.MAX_CHUNK_CHARS],
                edge_list=edges_str,
            )
        except Exception as format_err:
            # A template with stray braces or unknown placeholders must not
            # take the caller down; treat it as "no edges assigned".
            logger.error(f"❌ [SemanticAnalyzer] Format Error im Prompt-Template: {format_err}")
            return None

    async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
        """Ask the LLM which of ``all_edges`` are relevant for ``chunk_text``.

        Args:
            chunk_text: Text of the chunk; only the first ``MAX_CHUNK_CHARS``
                characters are placed into the prompt.
            all_edges: Candidate edges offered to the LLM.
            note_type: Accepted for interface compatibility; not used here.

        Returns:
            The string entries of the LLM reply that pass
            ``_is_valid_edge_string``, in reply order. An empty list is
            returned when there are no candidates, the prompt cannot be
            built, the reply is empty or not valid JSON, or any unexpected
            error occurs during the LLM call or parsing.
        """
        if not all_edges:
            return []

        final_prompt = self._build_prompt(chunk_text, all_edges)
        if final_prompt is None:
            return []

        try:
            # LLM call with background priority so that it does not compete
            # with higher-priority traffic.
            response_json = await self.llm.generate_raw_response(
                prompt=final_prompt,
                force_json=True,
                max_retries=5,
                base_delay=5.0,
                priority="background"
            )

            # NOTE(review): whether the service can return None is not
            # visible here; treat it like an empty reply instead of letting
            # the slice below raise.
            if response_json is None:
                logger.warning("⚠️ [SemanticAnalyzer] Leere Antwort vom LLM erhalten.")
                return []

            logger.debug(f"📥 [SemanticAnalyzer] Raw Response (Preview): {response_json[:200]}...")

            # Strip Markdown code fences the model may wrap around the JSON.
            clean_json = response_json.replace("```json", "").replace("```", "").strip()
            if not clean_json:
                logger.warning("⚠️ [SemanticAnalyzer] Leere Antwort vom LLM erhalten.")
                return []

            try:
                data = json.loads(clean_json)
            except json.JSONDecodeError as json_err:
                logger.error(f"❌ [SemanticAnalyzer] JSON Decode Error: {json_err}")
                return []

            valid_edges = []
            for candidate in self._collect_raw_candidates(data):
                # Candidates are validated as-is. Coercing them with str()
                # would turn nested lists/dicts into strings such as
                # "['a:b']" that happen to satisfy the edge format.
                if self._is_valid_edge_string(candidate):
                    valid_edges.append(candidate)
                else:
                    logger.debug(f" [SemanticAnalyzer] Invalid edge format rejected: '{candidate}'")

            if valid_edges:
                logger.info(f"✅ [SemanticAnalyzer] Success. {len(valid_edges)} Kanten zugewiesen.")
            else:
                logger.debug(" [SemanticAnalyzer] Keine spezifischen Kanten erkannt (Empty Result).")
            return valid_edges

        except Exception as e:
            # Deliberate best-effort boundary: edge allocation must never
            # abort the surrounding processing.
            logger.error(f"💥 [SemanticAnalyzer] Kritischer Fehler: {e}", exc_info=True)
            return []

    async def close(self):
        """Close the underlying LLM service, if one is attached."""
        if self.llm:
            await self.llm.close()
# Module-wide analyzer instance, created lazily on first request.
_analyzer_instance = None


def get_semantic_analyzer():
    """Return the shared SemanticAnalyzer, constructing it on the first call."""
    global _analyzer_instance
    if _analyzer_instance is not None:
        return _analyzer_instance
    _analyzer_instance = SemanticAnalyzer()
    return _analyzer_instance