mindnet/app/services/semantic_analyzer.py

178 lines
7.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
FILE: app/services/semantic_analyzer.py
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
FIX: Finale DoD-Härtung, Entfernung von Hardcoded Limits und optimiertes Error-Handling.
VERSION: 2.2.4
STATUS: Active
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging
"""
import json
import logging
import re
from typing import List, Optional
from dataclasses import dataclass
# Importe
from app.services.llm_service import LLMService
# WP-22: Registry für Vokabular-Erzwingung
from app.services.edge_registry import registry as edge_registry
logger = logging.getLogger(__name__)
class SemanticAnalyzer:
def __init__(self):
self.llm = LLMService()
def _is_valid_edge_string(self, edge_str: str) -> bool:
"""
Prüft, ob ein String eine valide Kante im Format 'kind:target' ist.
Verhindert, dass LLM-Geschwätz als Kante durchrutscht.
"""
if not isinstance(edge_str, str) or ":" not in edge_str:
return False
parts = edge_str.split(":", 1)
kind = parts[0].strip()
target = parts[1].strip()
# Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten.
if " " in kind:
return False
# Regel 2: Plausible Länge für den Typ (Vermeidet Sätze als Typ)
if len(kind) > 40 or len(kind) < 2:
return False
# Regel 3: Target darf nicht leer sein
if not target:
return False
return True
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
"""
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind.
WP-20: Nutzt primär den konfigurierten Provider (z.B. OpenRouter).
"""
if not all_edges:
return []
# 1. Bestimmung des Providers und Modells (Dynamisch über Settings)
provider = self.llm.settings.MINDNET_LLM_PROVIDER
model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL
# 2. Prompt laden (Provider-spezifisch)
prompt_template = self.llm.get_prompt("edge_allocation_template", provider)
if not prompt_template or not isinstance(prompt_template, str):
logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.")
prompt_template = (
"TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
"TEXT: {chunk_text}\n"
"KANDIDATEN: {edge_list}\n"
"OUTPUT: JSON Liste von Strings [\"kind:target\"]."
)
# 3. Daten für Template vorbereiten (Vokabular-Check)
edge_registry.ensure_latest()
valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
edges_str = "\n".join([f"- {e}" for e in all_edges])
logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")
# 4. Prompt füllen mit Format-Check (Kein Shortcut)
try:
# Wir begrenzen den Text auf eine vernünftige Länge für das Kontextfenster (ca. 10k Tokens max)
final_prompt = prompt_template.format(
chunk_text=chunk_text[:6000],
edge_list=edges_str,
valid_types=valid_types_str
)
except Exception as format_err:
logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}")
return []
try:
# 5. LLM Call mit Background Priority & Semaphore Control
response_json = await self.llm.generate_raw_response(
prompt=final_prompt,
force_json=True,
max_retries=3,
base_delay=2.0,
priority="background",
provider=provider,
model_override=model
)
# 6. Bulletproof JSON Extraction (Analog zur Ingestion)
# Entfernt Markdown-Code-Blöcke falls vorhanden
match = re.search(r"```(?:json)?\s*(.*?)\s*```", response_json, re.DOTALL)
clean_json = match.group(1) if match else response_json
clean_json = clean_json.strip()
if not clean_json:
return []
try:
data = json.loads(clean_json)
except json.JSONDecodeError:
# Letzter Rettungsversuch: Suche nach dem ersten '[' und letzten ']'
start = clean_json.find('[')
end = clean_json.rfind(']') + 1
if start != -1 and end != 0:
try:
data = json.loads(clean_json[start:end])
except:
logger.error("❌ [SemanticAnalyzer] JSON Recovery failed.")
return []
else:
return []
# 7. Robuste Normalisierung (List vs Dict Recovery)
raw_candidates = []
if isinstance(data, list):
raw_candidates = data
elif isinstance(data, dict):
logger.info(f" [SemanticAnalyzer] LLM returned dict, trying recovery.")
for key in ["edges", "results", "kanten", "matches"]:
if key in data and isinstance(data[key], list):
raw_candidates.extend(data[key])
break
# Falls immer noch leer, nutze Schlüssel-Wert Paare als Behelf
if not raw_candidates:
for k, v in data.items():
if isinstance(v, str): raw_candidates.append(f"{k}:{v}")
elif isinstance(v, list): [raw_candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)]
# 8. Strikte Validierung gegen Kanten-Format
valid_edges = []
for e in raw_candidates:
e_str = str(e).strip()
if self._is_valid_edge_string(e_str):
valid_edges.append(e_str)
else:
logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'")
if valid_edges:
logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.")
return valid_edges
except Exception as e:
logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True)
return []
async def close(self):
if self.llm:
await self.llm.close()
# Singleton Instanziierung
_analyzer_instance = None
def get_semantic_analyzer():
global _analyzer_instance
if _analyzer_instance is None:
_analyzer_instance = SemanticAnalyzer()
return _analyzer_instance