mindnet/app/services/semantic_analyzer.py
2025-12-23 21:57:50 +01:00

172 lines
6.6 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FILE: app/services/semantic_analyzer.py
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
WP-20 Fix: Volle Kompatibilität mit der gehärteten LLMService (v3.3.2) Kaskade.
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
VERSION: 2.2.2
STATUS: Active
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging
LAST_ANALYSIS: 2025-12-23
"""
import json
import logging
from typing import List, Optional
from dataclasses import dataclass
# Importe
from app.services.llm_service import LLMService
# WP-22: Registry für Vokabular-Erzwingung
from app.services.edge_registry import registry as edge_registry
logger = logging.getLogger(__name__)
class SemanticAnalyzer:
def __init__(self):
self.llm = LLMService()
def _is_valid_edge_string(self, edge_str: str) -> bool:
"""
Prüft, ob ein String eine valide Kante im Format 'kind:target' ist.
Verhindert, dass LLM-Geschwätz als Kante durchrutscht.
"""
if not isinstance(edge_str, str) or ":" not in edge_str:
return False
parts = edge_str.split(":", 1)
kind = parts[0].strip()
target = parts[1].strip()
# Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten.
if " " in kind:
return False
# Regel 2: Plausible Länge für den Typ
if len(kind) > 40 or len(kind) < 2:
return False
# Regel 3: Target darf nicht leer sein
if not target:
return False
return True
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
"""
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind.
"""
if not all_edges:
return []
# 1. Prompt laden via get_prompt (handelt die Provider-Kaskade automatisch ab)
prompt_template = self.llm.get_prompt("edge_allocation_template")
# Sicherheits-Check für die Format-Methode
if not prompt_template or isinstance(prompt_template, dict):
logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' konnte nicht als String geladen werden. Nutze Not-Fallback.")
prompt_template = (
"TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
"TEXT: {chunk_text}\n"
"KANDIDATEN: {edge_list}\n"
"OUTPUT: JSON Liste von Strings [\"kind:target\"]."
)
# 2. Daten für Template vorbereiten (WP-22 Integration)
# Wir laden die validen Typen, um sie dem LLM als Leitplanken zu geben
edge_registry.ensure_latest()
valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
edges_str = "\n".join([f"- {e}" for e in all_edges])
# LOG: Request Info
logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")
# 3. Prompt füllen (FIX: valid_types hinzugefügt, um FormatError zu beheben)
try:
final_prompt = prompt_template.format(
chunk_text=chunk_text[:3500],
edge_list=edges_str,
valid_types=valid_types_str
)
except Exception as format_err:
logger.error(f"❌ [SemanticAnalyzer] Format Error im Prompt-Template (Fehlender Parameter): {format_err}")
return []
try:
# 4. LLM Call mit Traffic Control (Background Priority)
response_json = await self.llm.generate_raw_response(
prompt=final_prompt,
force_json=True,
max_retries=5,
base_delay=5.0,
priority="background"
)
# LOG: Raw Response Preview
logger.debug(f"📥 [SemanticAnalyzer] Raw Response (Preview): {response_json[:200]}...")
# 5. Parsing & Cleaning
clean_json = response_json.replace("```json", "").replace("```", "").strip()
if not clean_json:
logger.warning("⚠️ [SemanticAnalyzer] Leere Antwort vom LLM erhalten.")
return []
try:
data = json.loads(clean_json)
except json.JSONDecodeError as json_err:
logger.error(f"❌ [SemanticAnalyzer] JSON Decode Error: {json_err}")
return []
valid_edges = []
# 6. Robuste Validierung (List vs Dict)
raw_candidates = []
if isinstance(data, list):
raw_candidates = data
elif isinstance(data, dict):
logger.info(f" [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur.")
for key, val in data.items():
if key.lower() in ["edges", "results", "kanten", "matches"] and isinstance(val, list):
raw_candidates.extend(val)
elif isinstance(val, str):
raw_candidates.append(f"{key}:{val}")
elif isinstance(val, list):
for target in val:
if isinstance(target, str):
raw_candidates.append(f"{key}:{target}")
# 7. Strict Validation Loop
for e in raw_candidates:
e_str = str(e)
if self._is_valid_edge_string(e_str):
valid_edges.append(e_str)
else:
logger.debug(f" [SemanticAnalyzer] Invalid edge format rejected: '{e_str}'")
final_result = [e for e in valid_edges if ":" in e]
if final_result:
logger.info(f"✅ [SemanticAnalyzer] Success. {len(final_result)} Kanten zugewiesen.")
else:
logger.debug(" [SemanticAnalyzer] Keine spezifischen Kanten erkannt (Empty Result).")
return final_result
except Exception as e:
logger.error(f"💥 [SemanticAnalyzer] Kritischer Fehler: {e}", exc_info=True)
return []
async def close(self):
if self.llm:
await self.llm.close()
# Singleton Helper
_analyzer_instance = None
def get_semantic_analyzer():
global _analyzer_instance
if _analyzer_instance is None:
_analyzer_instance = SemanticAnalyzer()
return _analyzer_instance