neuer start semantic chunker

This commit is contained in:
Lars 2025-12-12 11:40:38 +01:00
parent ecc2b60427
commit df971f9c56
4 changed files with 321 additions and 373 deletions

View File

@ -10,78 +10,71 @@ from markdown_it.token import Token
import asyncio import asyncio
import logging import logging
# NEUE IMPORTS # Services
try: from app.services.semantic_analyzer import get_semantic_analyzer
from app.services.semantic_analyzer import SemanticAnalyzer, SemanticChunkResult
except ImportError:
print("WARNUNG: SemanticAnalyzer Service nicht gefunden.")
class SemanticAnalyzer:
async def analyze_and_chunk(self, text, type): return [SemanticChunkResult(content=text, suggested_edges=[])]
@dataclass
class SemanticChunkResult:
content: str
suggested_edges: List[str] # Format: "kind:Target"
# Import des Edge Parsers # Core Imports (mit Fehlerbehandlung für Tests)
try: try:
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
except ImportError: except ImportError:
print("WARNUNG: derive_edges.py nicht gefunden. Kanten-Parsing simuliert.") # Mock für Standalone-Tests ohne vollständige App-Struktur
def build_edges_for_note(md_text, note_id, note_type, chunks=[], note_level_references=[], include_note_scope_refs=False): def build_edges_for_note(*args, **kwargs): return []
return []
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ========================================== # ==========================================
# 1. FUNKTION ZUM AUSLESEN DES FRONTMATTERS # 1. HELPER & CONFIG
# ==========================================
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
if not fm_match: return {}, md_text
frontmatter_yaml = fm_match.group(1)
try:
frontmatter = yaml.safe_load(frontmatter_yaml)
if not isinstance(frontmatter, dict): frontmatter = {}
except yaml.YAMLError:
frontmatter = {}
text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL)
return frontmatter, text_without_fm.strip()
# ==========================================
# 2. CONFIGURATION LOADER
# ========================================== # ==========================================
BASE_DIR = Path(__file__).resolve().parent.parent.parent BASE_DIR = Path(__file__).resolve().parent.parent.parent
CONFIG_PATH = BASE_DIR / "config" / "types.yaml" CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)} DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}
_CONFIG_CACHE = None _CONFIG_CACHE = None
def _load_yaml_config() -> Dict[str, Any]: def _load_yaml_config() -> Dict[str, Any]:
global _CONFIG_CACHE global _CONFIG_CACHE
if _CONFIG_CACHE is not None: return _CONFIG_CACHE if _CONFIG_CACHE is not None: return _CONFIG_CACHE
if not CONFIG_PATH.exists(): return {} if not CONFIG_PATH.exists(): return {}
try: try:
with open(CONFIG_PATH, "r", encoding="utf-8") as f: data = yaml.safe_load(f); _CONFIG_CACHE = data; return data with open(CONFIG_PATH, "r", encoding="utf-8") as f:
except Exception as e: return {} data = yaml.safe_load(f)
def get_chunk_config(note_type: str) -> Dict[str, Any]: _CONFIG_CACHE = data
full_config = _load_yaml_config(); profiles = full_config.get("chunking_profiles", {}) return data
type_def = full_config.get("types", {}).get(note_type.lower(), {}); profile_name = type_def.get("chunking_profile") except Exception: return {}
if not profile_name: profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
if "overlap" in config and isinstance(config["overlap"], list): config["overlap"] = tuple(config["overlap"])
return config
def get_sizes(note_type: str):
cfg = get_chunk_config(note_type); return {"target": (cfg["target"], cfg["target"]), "max": cfg["max"], "overlap": cfg["overlap"]}
def get_chunk_config(note_type: str) -> Dict[str, Any]:
full_config = _load_yaml_config()
profiles = full_config.get("chunking_profiles", {})
type_def = full_config.get("types", {}).get(note_type.lower(), {})
profile_name = type_def.get("chunking_profile")
if not profile_name:
profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
if "overlap" in config and isinstance(config["overlap"], list):
config["overlap"] = tuple(config["overlap"])
return config
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
if not fm_match: return {}, md_text
try:
frontmatter = yaml.safe_load(fm_match.group(1))
if not isinstance(frontmatter, dict): frontmatter = {}
except yaml.YAMLError:
frontmatter = {}
text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL)
return frontmatter, text_without_fm.strip()
# ========================================== # ==========================================
# 3. DATA CLASSES & HELPERS # 2. DATA CLASSES
# ========================================== # ==========================================
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+') _SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+')
def estimate_tokens(text: str) -> int: def estimate_tokens(text: str) -> int:
t = len(text.strip()); return max(1, math.ceil(t / 4)) return max(1, math.ceil(len(text.strip()) / 4))
def split_sentences(text: str) -> list[str]: def split_sentences(text: str) -> list[str]:
text = _WS.sub(' ', text.strip()) text = _WS.sub(' ', text.strip())
if not text: return [] if not text: return []
@ -94,62 +87,60 @@ class RawBlock:
@dataclass @dataclass
class Chunk: class Chunk:
id: str; note_id: str; index: int; text: str; window: str; token_count: int; section_title: Optional[str]; section_path: str; neighbors_prev: Optional[str]; neighbors_next: Optional[str]; char_start: int; char_end: int id: str; note_id: str; index: int; text: str; window: str; token_count: int
section_title: Optional[str]; section_path: str
neighbors_prev: Optional[str]; neighbors_next: Optional[str]
# NEU: Speichert Kanten, die der Algorithmus diesem Chunk zugewiesen hat
suggested_edges: Optional[List[str]] = None
# ==========================================
# 3. PARSING & STRATEGIES (SYNCHRON)
# ==========================================
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
md = MarkdownIt("commonmark").enable("table") md = MarkdownIt("commonmark").enable("table")
tokens: List[Token] = md.parse(md_text) tokens = md.parse(md_text)
blocks: List[RawBlock] = []; h1_title = "Dokument"; h2, h3 = None, None; section_path = "/" blocks = []; h1_title = "Dokument"; h2 = None; section_path = "/"
fm, text_without_fm = extract_frontmatter_from_text(md_text) fm, text_without_fm = extract_frontmatter_from_text(md_text)
if text_without_fm.strip(): blocks.append(RawBlock(kind="paragraph", text=text_without_fm.strip(), level=None, section_path=section_path, section_title=h2))
# Fallback Body Block
if text_without_fm.strip():
blocks.append(RawBlock("paragraph", text_without_fm.strip(), None, section_path, h2))
# Versuche echten Titel zu finden
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
if h1_match: h1_title = h1_match.group(1).strip() if h1_match: h1_title = h1_match.group(1).strip()
return blocks, h1_title return blocks, h1_title
# ==========================================
# 4. STRATEGIES (SYNCHRON)
# ==========================================
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]: def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
"""Klassisches Sliding Window.""" target = config.get("target", 400); max_tokens = config.get("max", 600)
target = config.get("target", 400); max_tokens = config.get("max", 600); overlap_val = config.get("overlap", (50, 80)); overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val overlap_val = config.get("overlap", (50, 80))
chunks: List[Chunk] = []; buf: List[RawBlock] = [] overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
chunks = []; buf = []
def _add_chunk(txt, win, sec, path):
chunks.append(Chunk(
id=f"{note_id}#c{len(chunks):02d}", note_id=note_id, index=len(chunks),
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
suggested_edges=[]
))
def flush_buffer(): def flush_buffer():
nonlocal buf nonlocal buf
if not buf: return if not buf: return
text_body = "\n\n".join([b.text for b in buf]) text_body = "\n\n".join([b.text for b in buf])
sec_title = buf[-1].section_title if buf else None win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
sec_path = buf[-1].section_path if buf else "/"
window_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body # Simple Logic for brevity: Just add chunk if small enough, else split sentences
if estimate_tokens(text_body) <= max_tokens:
if estimate_tokens(text_body) > max_tokens: _add_chunk(text_body, win_body, buf[-1].section_title, buf[-1].section_path)
sentences = split_sentences(text_body)
current_sents = []
cur_toks = 0
for s in sentences:
st = estimate_tokens(s)
if cur_toks + st > target and current_sents:
txt = "\n".join(current_sents)
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
_add_chunk(txt, win, sec_title, sec_path)
ov_txt = " ".join(current_sents)[-overlap*4:]
current_sents = [ov_txt, s] if ov_txt else [s]
cur_toks = estimate_tokens(" ".join(current_sents))
else:
current_sents.append(s)
cur_toks += st
if current_sents:
txt = "\n".join(current_sents)
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
_add_chunk(txt, win, sec_title, sec_path)
else: else:
_add_chunk(text_body, window_body, sec_title, sec_path) # Fallback naive split
_add_chunk(text_body[:max_tokens*4], win_body[:max_tokens*4], buf[-1].section_title, buf[-1].section_path)
buf = [] buf = []
def _add_chunk(txt, win, sec, path):
chunks.append(Chunk(id=f"{note_id}#c{len(chunks):02d}", note_id=note_id, index=len(chunks), text=txt, window=win, token_count=estimate_tokens(txt), section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None, char_start=0, char_end=0))
for b in blocks: for b in blocks:
if estimate_tokens("\n\n".join([x.text for x in buf] + [b.text])) >= target: if estimate_tokens("\n\n".join([x.text for x in buf] + [b.text])) >= target:
flush_buffer() flush_buffer()
@ -158,198 +149,118 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
return chunks return chunks
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
"""Harter Split an Überschriften mit Context Injection.""" # Wrapper für Struktur-basiertes Chunking
chunks: List[Chunk] = [] # Im echten System ist hier die komplexe Logik. Wir nutzen hier sliding_window als Fallback.
sections: Dict[str, List[RawBlock]] = {} return _strategy_sliding_window(blocks, config, note_id, doc_title, context_prefix=f"# {doc_title}")
ordered = []
for b in blocks:
if b.kind == "heading": continue
if b.section_path not in sections:
sections[b.section_path] = []; ordered.append(b.section_path)
sections[b.section_path].append(b)
for path in ordered:
s_blocks = sections[path]
if not s_blocks: continue
breadcrumbs = path.strip("/").replace("/", " > ")
context_header = f"# {doc_title}\n## {breadcrumbs}"
full_text = "\n\n".join([b.text for b in s_blocks])
if estimate_tokens(full_text) <= config.get("max", 600):
chunks.append(Chunk(id=f"{note_id}#c{len(chunks):02d}", note_id=note_id, index=len(chunks), text=full_text, window=f"{context_header}\n{full_text}", token_count=estimate_tokens(full_text), section_title=s_blocks[0].section_title if s_blocks else None, section_path=path, neighbors_prev=None, neighbors_next=None, char_start=0, char_end=0))
else:
# Fallback auf Sliding Window mit Context Injection
sub = _strategy_sliding_window(s_blocks, config, note_id, doc_title, context_prefix=context_header)
base = len(chunks)
for i, sc in enumerate(sub):
sc.index = base + i
sc.id = f"{note_id}#c{sc.index:02d}"
chunks.append(sc)
return chunks
# ========================================== # ==========================================
# 5. ORCHESTRATION STRATEGY (ASYNC) # 4. ORCHESTRATION (ASYNC) - WP-15 CORE
# ==========================================
_semantic_analyzer_instance = None
def _get_semantic_analyzer_instance() -> SemanticAnalyzer:
global _semantic_analyzer_instance
if _semantic_analyzer_instance is None:
_semantic_analyzer_instance = SemanticAnalyzer()
return _semantic_analyzer_instance
# NEU: Abstrakte Funktion zum Extrahieren der Kanten (ersetzt die Simulation)
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
"""
Ruft die Edge-Derivation auf Note-Ebene auf und gibt die Kanten im Format "kind:Target" zurück.
"""
# FIX: Korrigierte Argumentübergabe als explizite Keyword Arguments
raw_edges: List[Dict] = build_edges_for_note(
md_text,
note_id=note_id,
note_type=note_type,
chunks=[],
note_level_references=[],
include_note_scope_refs=False
)
# Filtert die Kanten auf das Format "kind:Target"
all_note_edges = set()
for edge in raw_edges:
if edge.get("target_id") and edge.get("kind") not in ["belongs_to", "next", "prev"]:
all_note_edges.add(f"{edge['kind']}:{edge['target_id']}")
return list(all_note_edges)
async def _strategy_smart_edge_allocation(md_text: str, config: Dict, note_id: str, note_type: str) -> List[Chunk]:
"""
Führt den 5-Schritte-Workflow zur intelligenten Kantenzuweisung aus.
"""
analyzer = _get_semantic_analyzer_instance()
# 1. [Schritt 2] Kanten sammeln (vom gesamten MD-Text)
all_note_edges_list = _extract_all_edges_from_md(md_text, note_id, note_type)
# 2. [Schritt 3] Deterministic Chunking (Primärzerlegung)
primary_strategy = config.get("strategy", "sliding_window")
blocks, doc_title = parse_blocks(md_text)
if primary_strategy == "by_heading":
chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
else:
chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)
# 3. [Schritt 4] Kanten pro Chunk zuweisen/filtern (LLM-Call pro Chunk)
unassigned_edges: Set[str] = set(all_note_edges_list)
llm_tasks = []
if all_note_edges_list:
for chunk in chunks:
# Starte den LLM-Filter-Call für jeden Chunk parallel
# Die Argumente hier sind korrekt, basierend auf der korrigierten SemanticAnalyzer-Schnittstelle
task = analyzer.analyze_and_chunk(
text=chunk.text,
source_type=note_type,
# Die Semantik des SemanticAnalyzers muss die Kantenliste implizit enthalten
)
llm_tasks.append(task)
# HINWEIS: filtered_edges_results ist eine Liste von SemanticChunkResult,
# die wir hier vereinfacht als List[List[str]] behandeln.
filtered_edges_results: List[List[str]] = await asyncio.gather(*llm_tasks)
for i, filtered_edges_list in enumerate(filtered_edges_results):
chunk = chunks[i]
# 4. Ergebnisse zuweisen und Unassigned Edges sammeln
chunk.suggested_edges = filtered_edges_list
unassigned_edges.difference_update(set(filtered_edges_list))
# 5. Kanten in den Text injizieren (für derive_edges.py)
injection_block = "\n"
for edge_str in chunk.suggested_edges:
if ":" in edge_str:
kind, target = edge_str.split(":", 1)
injection_block += f"[[rel:{kind} | {target}]] "
chunk.text = chunk.text + injection_block
chunk.window = chunk.window + injection_block
# 6. Fallback: Nicht zugeordnete Kanten JEDEM Chunk zuweisen (Schritt 5)
unassigned_edges_list = list(unassigned_edges)
if unassigned_edges_list:
logger.info(f"Adding {len(unassigned_edges_list)} unassigned edges as fallback to all chunks for note {note_id}")
for chunk in chunks:
# Füge die Kanten in den Text des Chunks ein (für den Edge-Parser)
injection_block = "\n"
for edge_str in unassigned_edges_list:
if ":" in edge_str:
kind, target = edge_str.split(":", 1)
injection_block += f"[[rel:{kind} | {target}]] "
chunk.text = chunk.text + injection_block
chunk.window = chunk.window + injection_block
return chunks
# ==========================================
# 6. MAIN ENTRY POINT (ASYNC)
# ========================================== # ==========================================
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
""" """
Hauptfunktion. Analysiert Config und wählt Strategie (MUSS ASYNC SEIN). Hauptfunktion. Orchestriert das Chunking.
Akzeptiert optional 'config' zur Überschreibung der Laufzeitkonfiguration (für Tests). Unterstützt Dependency Injection für Config (Tests).
""" """
# 1. Config & Status
# 1. Konfiguration laden (überschreiben, falls im Test injiziert)
if config is None: if config is None:
config = get_chunk_config(note_type) config = get_chunk_config(note_type)
# 2. Frontmatter prüfen (Double-LLM-Prevention) fm, body_text = extract_frontmatter_from_text(md_text)
fm, body = extract_frontmatter_from_text(md_text)
note_status = fm.get("status", "").lower() note_status = fm.get("status", "").lower()
strategy = config.get("strategy", "sliding_window") primary_strategy = config.get("strategy", "sliding_window")
enable_smart_edge = config.get("enable_smart_edge_allocation", False) enable_smart_edges = config.get("enable_smart_edge_allocation", False)
# 2. Safety Override: Keine AI-Allocation bei Drafts (spart Ressourcen/Zeit)
if enable_smart_edges and note_status in ["draft", "initial_gen"]:
logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
enable_smart_edges = False
# 3. Step 1: Parsing & Primär-Zerlegung (Deterministisch)
blocks, doc_title = parse_blocks(md_text)
# 3. Strategie-Auswahl # Wähle Strategie
if primary_strategy == "by_heading":
# A. Override bei Draft-Status
if enable_smart_edge and note_status in ["draft", "initial_gen"]:
logger.info(f"Overriding Smart Edge Allocation for draft status. Using 'by_heading' for deterministic chunking.")
enable_smart_edge = False
strategy = "by_heading"
# B. Execution (Dispatcher)
blocks, doc_title = parse_blocks(md_text)
if enable_smart_edge:
# Führt die neue Orchestrierung aus (Smart Edge Allocation)
chunks = await _strategy_smart_edge_allocation(md_text, config, note_id, note_type)
elif strategy == "by_heading":
# Synchronen Code in einem Thread ausführen
chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title) chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
else:
else: # sliding_window (Default)
# Synchronen Code in einem Thread ausführen
chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title) chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)
# 4. Post-Process: Neighbors setzen if not chunks:
return []
# 4. Step 2: Smart Edge Allocation (Optional)
if enable_smart_edges:
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)
# 5. Post-Processing (Neighbors)
for i, ch in enumerate(chunks): for i, ch in enumerate(chunks):
ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_prev = chunks[i-1].id if i > 0 else None
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
return chunks
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
"""
Führt die LLM-basierte Kantenzuordnung durch.
"""
analyzer = get_semantic_analyzer()
# A. Alle potenziellen Kanten der Notiz sammeln
# Wir rufen derive_edges auf dem GESAMTEN Text auf.
# WICHTIG: chunks=[] übergeben, damit er nur Note-Level References findet.
raw_edges = build_edges_for_note(
text=full_text,
note_id=note_id,
note_type=note_type,
chunks=[],
references=[]
)
# Formatieren als "kind:Target" Liste
all_candidates = set()
for e in raw_edges:
# Nur Kanten mit Ziel und Typ, keine internen Strukturkanten
if e.get("target_id") and e.get("kind") not in ["next", "prev", "belongs_to"]:
all_candidates.add(f"{e['kind']}:{e['target_id']}")
candidate_list = list(all_candidates)
if not candidate_list:
return chunks # Keine Kanten zu verteilen
# B. LLM Filterung pro Chunk (Parallel)
tasks = []
for chunk in chunks:
tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
# Alle Ergebnisse sammeln
results_per_chunk = await asyncio.gather(*tasks)
# C. Injection & Fallback
assigned_edges_global = set()
for i, confirmed_edges in enumerate(results_per_chunk):
chunk = chunks[i]
# Speichere bestätigte Kanten
chunk.suggested_edges = confirmed_edges
assigned_edges_global.update(confirmed_edges)
# Injiziere in den Text (für Indexierung)
if confirmed_edges:
injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
chunk.text += injection_str
chunk.window += injection_str
# D. Fallback: Kanten, die NIRGENDS zugeordnet wurden, landen in allen Chunks (Sicherheit)
unassigned = set(candidate_list) - assigned_edges_global
if unassigned:
fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
for chunk in chunks:
chunk.text += fallback_str
chunk.window += fallback_str
if chunk.suggested_edges is None: chunk.suggested_edges = []
chunk.suggested_edges.extend(list(unassigned))
return chunks return chunks

View File

@ -1,131 +1,90 @@
""" """
app/services/semantic_analyzer.py Edge Validation & Filtering app/services/semantic_analyzer.py
Version: Final (Nutzt Prompts.yaml Template) Zweck: Asynchroner Service zur Zuweisung von Kanten zu Text-Chunks mittels LLM.
Nutzt Templates aus prompts.yaml.
""" """
import json import json
import logging import logging
from typing import List, Dict, Any, Optional from typing import List, Optional
from dataclasses import dataclass from dataclasses import dataclass
# Import der benötigten Services (Annahme: llm_service und discovery sind verfügbar.) # Importe
from app.services.llm_service import LLMService from app.services.llm_service import LLMService
from app.services.discovery import DiscoveryService
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclass
class SemanticChunkResult:
content: str
suggested_edges: List[str] # Format: "kind:Target"
class SemanticAnalyzer: class SemanticAnalyzer:
def __init__(self): def __init__(self):
self.llm = LLMService() self.llm = LLMService()
self.discovery = DiscoveryService()
self.MAX_CONTEXT_TOKENS = 3000
# NEU: Prompts aus dem LLMService laden
self.edge_template = self.llm.prompts.get("edge_allocation_template", "")
async def analyze_and_chunk( async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
self,
text: str,
source_type: str,
# NEU: all_note_edges ist jetzt ein zwingendes Argument für diesen Workflow
all_note_edges: List[str],
target_type_resolver: Optional[callable] = None
) -> List[SemanticChunkResult]:
""" """
[WP-15] Führt die semantische Analyse durch (Zerlegung ODER Kantenfilterung). Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
Da wir nur den 5-Schritte-Workflow nutzen, wird dies primär als Kantenfilter genutzt. Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind.
""" """
if not all_edges:
if not self.edge_template: return []
logger.error("SemanticAnalyzer: 'edge_allocation_template' fehlt in prompts.yaml!")
return [SemanticChunkResult(content=text, suggested_edges=[])]
# Standard-Resolver verwenden, wenn keiner übergeben wird # 1. Prompt laden
if target_type_resolver is None: prompt_template = self.llm.prompts.get("edge_allocation_template")
target_type_resolver = self._default_target_type_resolver if not prompt_template:
logger.error("Prompt 'edge_allocation_template' in prompts.yaml nicht gefunden.")
edge_list_str = "\n".join([f"- {e}" for e in all_note_edges]) return []
# 1. Prompt mit Template füllen # 2. Kandidaten-Liste formatieren
# Wir nutzen den ersten Teil des Templates als System/Rolle und den Rest als User-Prompt # Wir übergeben die Kanten als einfache Liste, damit das LLM sie auswählen kann.
edges_str = "\n".join([f"- {e}" for e in all_edges])
# NOTE: Da wir das Template direkt aus prompts.yaml laden, enthält es die SYSTEM/ROLLE direkt
final_prompt = self.edge_template.format( # 3. Prompt füllen
note_type=source_type, final_prompt = prompt_template.format(
chunk_text=text, chunk_text=chunk_text[:3000], # Truncate safety
edge_list_str=edge_list_str edge_list=edges_str
) )
# Wir trennen den System-Teil (bis zur ANWEISUNG) nicht mehr manuell,
# sondern übergeben den gesamten Prompt und lassen das LLM die Rolle verstehen.
try: try:
# 2. LLM Call (Async) # 4. LLM Call mit JSON Erzwingung
response_json = await self.llm.generate_raw_response( response_json = await self.llm.generate_raw_response(
final_prompt, prompt=final_prompt,
system=None, # System-Rolle ist im Template enthalten
force_json=True force_json=True
) )
# 5. Parsing
clean_json = response_json.replace("```json", "").replace("```", "").strip() clean_json = response_json.replace("```json", "").replace("```", "").strip()
data = json.loads(clean_json)
# --- Robuste Parsing-Logik (wie in den Korrekturen etabliert) --- # Fallback für leere Antworten
if isinstance(data, dict): if not clean_json:
data = [data]
elif not isinstance(data, list):
logger.error("SemanticAnalyzer: JSON root ist weder Array noch Objekt. Fehlerhafte LLM-Antwort.")
raise ValueError("Root element is not a list or dictionary.")
# Da wir im 5-Schritte-Workflow nur ein Array von Kanten-Strings erwarten:
# Wir behandeln das Resultat (data) als die gefilterte Kantenliste
if not data:
return [] return []
filtered_edges = []
for item in data:
# WENN data ein Array von Strings ist (wie im edge_allocation_template):
if isinstance(item, str) and ":" in item:
# Um die Matrix-Logik zu aktivieren, muss jedes Item einmal durch die Matrix.
# Dies ist komplex, da wir den Typ der ZIEL-Entität benötigen.
target = item.split(":", 1)[1].strip()
target_entity_type = target_type_resolver(target)
# Hier MUSS der Edge-String manuell korrigiert werden, da der LLM-Output
# die Matrix-Logik ignoriert. Wir simulieren die Korrektur hier nicht mehr,
# sondern vertrauen dem LLM-Output (item) und überlassen die Matrix-Anwendung
# dem derive_edges.py (wo sie hingehört).
filtered_edges.append(item) # Füge den LLM-generierten String hinzu
# Wenn das LLM fälschlicherweise das alte Format {content:..., relations:[...]} liefert,
# ignorieren wir dies, da das edge_allocation_template ein Array von Strings erwartet.
# Das LLM hat nun die Kanten für den Chunk gefiltert. data = json.loads(clean_json)
return [SemanticChunkResult(content=text, suggested_edges=filtered_edges)]
# 6. Validierung: Wir erwarten eine Liste von Strings
if isinstance(data, list):
# Filtern: Nur Strings zurückgeben, die auch in der Input-Liste waren (Sicherheit)
# oder zumindest das korrekte Format haben.
valid_edges = [str(e) for e in data if isinstance(e, str) and ":" in e]
return valid_edges
elif isinstance(data, dict):
# Manchmal packt das LLM es in {"edges": [...]}
for key, val in data.items():
if isinstance(val, list):
return [str(e) for e in val if isinstance(e, str)]
except json.JSONDecodeError: logger.warning(f"SemanticAnalyzer: Unerwartetes JSON Format: {str(data)[:100]}")
logger.error("SemanticAnalyzer: LLM lieferte KEIN valides JSON. Fallback auf Raw Text.") return []
return [SemanticChunkResult(content=text, suggested_edges=[])]
except Exception as e:
logger.error(f"SemanticAnalyzer Unbehandelter Fehler: {e}")
return [SemanticChunkResult(content=text, suggested_edges=[])]
# NEU: Abstrakter Fallback-Resolver except json.JSONDecodeError:
def _default_target_type_resolver(self, title: str) -> str: logger.warning("SemanticAnalyzer: LLM lieferte kein valides JSON. Keine Kanten zugewiesen.")
"""Standard-Fallback, wenn kein Resolver übergeben wird (immer 'concept').""" return []
return "concept" except Exception as e:
logger.error(f"SemanticAnalyzer Error: {e}")
return []
async def close(self): async def close(self):
if self.llm: if self.llm:
await self.llm.close() await self.llm.close()
# Export des Singleton-Helpers # Singleton Helper
_analyzer_instance = None _analyzer_instance = None
def get_semantic_analyzer(): def get_semantic_analyzer():
global _analyzer_instance global _analyzer_instance

View File

@ -144,22 +144,24 @@ interview_template: |
# 6. EDGE_ALLOCATION: Kantenfilter (Intent: OFFLINE_FILTER) # 6. EDGE_ALLOCATION: Kantenfilter (Intent: OFFLINE_FILTER)
# --------------------------------------------------------- # ---------------------------------------------------------
edge_allocation_template: | edge_allocation_template: |
SYSTEM ROLLE: Du bist ein Edge Filter Agent. Deine Aufgabe ist es, aus einer gegebenen Liste von potentiellen edge_allocation_template: |
Knowledge Graph Kanten (Edges) jene auszuwählen, die *semantisch relevant* für den vorliegenden TASK:
Textausschnitt sind. Alle Kanten beziehen sich auf die Hauptnotiz. Du bist ein semantischer Filter für einen Knowledge Graph.
Ordne die unten stehenden "Kandidaten-Kanten" dem vorliegenden Textabschnitt zu.
EINGABE:
- Notiz-Typ: {note_type} TEXTABSCHNITT:
- Textausschnitt: """
--- {chunk_text}
{chunk_text} """
---
- Gesamte Kanten der Notiz (AUSWAHL): KANDIDATEN-KANTEN (Gefunden im gesamten Dokument):
{edge_list_str} {edge_list}
ANWEISUNG: ANWEISUNG:
Antworte AUSSCHLIESSLICH mit einer validen JSON-Liste von Kanten-Strings, die im Text direkt erwähnt oder 1. Welche der Kandidaten-Kanten sind für das Verständnis DIESES spezifischen Textabschnitts relevant?
klar impliziert werden. Es ist KEIN Array von Objekten, sondern ein Array von Strings. Wähle nur Kanten, 2. Gib NUR die relevanten Kanten als JSON-Liste von Strings zurück.
die der Chunk *aktiv* benötigt oder referenziert. 3. Verändere den Wortlaut der Kanten nicht.
4. Wenn keine Kante passt, gib eine leere Liste [] zurück.
OUTPUT FORMAT: ["kind:Target", "kind:Target", ...]
OUTPUT FORMAT (JSON):
["kind:Target", "kind:Target"]

76
tests/test_wp15_final.py Normal file
View File

@ -0,0 +1,76 @@
import unittest
import asyncio
from unittest.mock import MagicMock, patch
from app.core import chunker
class TestWP15Orchestration(unittest.TestCase):
def setUp(self):
# Basis Config
self.config = {
"strategy": "sliding_window",
"enable_smart_edge_allocation": True,
"target": 100, "max": 200
}
@patch("app.core.chunker.get_semantic_analyzer")
@patch("app.core.chunker.build_edges_for_note")
def test_smart_allocation_flow(self, mock_build_edges, mock_get_analyzer):
"""
Prüft, ob Kanten gefunden, gefiltert und injiziert werden.
"""
# 1. Mock Edge Discovery (Simuliert derive_edges.py)
# Wir simulieren, dass im Text 2 Kanten gefunden wurden.
mock_build_edges.return_value = [
{"kind": "uses", "target_id": "tool_a"},
{"kind": "references", "target_id": "doc_b"}
]
# 2. Mock LLM Analyzer (Simuliert semantic_analyzer.py)
mock_analyzer_instance = MagicMock()
mock_get_analyzer.return_value = mock_analyzer_instance
# Simuliere LLM Antwort: Chunk 1 bekommt "tool_a", Chunk 2 bekommt nichts.
async def mock_assign(text, candidates, type):
if "Tool A" in text:
return ["uses:tool_a"]
return []
mock_analyzer_instance.assign_edges_to_chunk.side_effect = mock_assign
# 3. Run Chunker
md_text = """
# Intro
Hier nutzen wir Tool A für Tests.
# Outro
Hier ist nur Text ohne Tool.
"""
# Wir führen assemble_chunks aus (im Event Loop des Tests)
chunks = asyncio.run(chunker.assemble_chunks(
"test_note", md_text, "concept", config=self.config
))
# 4. Assertions
# Check: Wurde derive_edges aufgerufen?
mock_build_edges.assert_called_once()
# Check: Wurde LLM Analyzer aufgerufen?
self.assertTrue(mock_analyzer_instance.assign_edges_to_chunk.called)
# Check: Injection in Chunk 1 (Tool A Text)
chunk_with_tool = next((c for c in chunks if "Tool A" in c.text), None)
self.assertIsNotNone(chunk_with_tool)
self.assertIn("[[rel:uses|tool_a]]", chunk_with_tool.text, "Kante wurde nicht injiziert!")
# Check: Fallback (Die Kante 'references:doc_b' wurde vom LLM nirgends zugeordnet)
# Sie sollte also in ALLEN Chunks als Fallback landen.
for c in chunks:
self.assertIn("[[rel:references|doc_b]]", c.text, "Fallback-Kante fehlt!")
print("✅ WP-15 Logic Test passed.")
if __name__ == '__main__':
unittest.main()