WP15b - Initial

This commit is contained in:
Lars 2025-12-26 21:52:08 +01:00
parent d1a065fec8
commit f6b2375d65
6 changed files with 441 additions and 455 deletions

View File

@ -1,13 +1,16 @@
""" """
FILE: app/core/chunker.py FILE: app/core/chunker.py
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings).
Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer. WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung.
FIX V3: Support für mehrzeilige Callouts und Section-Propagation. Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung.
VERSION: 3.1.0 (Full Compatibility Merge) Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net).
VERSION: 3.2.0
STATUS: Active
DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging
""" """
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple, Any, Set from typing import List, Dict, Optional, Tuple, Any, Set
import re import re
import math import math
@ -17,15 +20,18 @@ import asyncio
import logging import logging
# Services # Services
from app.services.semantic_analyzer import get_semantic_analyzer # In WP-15b wird die KI-Validierung in die ingestion.py verlagert.
# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen.
try:
from app.services.semantic_analyzer import get_semantic_analyzer
except ImportError:
def get_semantic_analyzer(): return None
# Core Imports # Core Imports
# Wir importieren build_edges_for_note nur, um kompatibel zur Signatur zu bleiben
# oder für den Fallback.
try: try:
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
except ImportError: except ImportError:
# Mock für Tests # Fallback für Standalone-Betrieb oder Tests
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return [] def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -54,7 +60,7 @@ def _load_yaml_config() -> Dict[str, Any]:
def get_chunk_config(note_type: str) -> Dict[str, Any]: def get_chunk_config(note_type: str) -> Dict[str, Any]:
""" """
Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml. Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml.
Dies sichert die Kompatibilität zu WP-15 (Profile). Sichert die Kompatibilität zu WP-15 Profilen.
""" """
full_config = _load_yaml_config() full_config = _load_yaml_config()
profiles = full_config.get("chunking_profiles", {}) profiles = full_config.get("chunking_profiles", {})
@ -75,6 +81,7 @@ def get_chunk_config(note_type: str) -> Dict[str, Any]:
return config return config
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
"""Trennt YAML-Frontmatter vom eigentlichen Text."""
fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL) fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
if not fm_match: return {}, md_text if not fm_match: return {}, md_text
try: try:
@ -89,12 +96,15 @@ def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
# 2. DATA CLASSES & TEXT TOOLS # 2. DATA CLASSES & TEXT TOOLS
# ========================================== # ==========================================
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+') _SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
_WS = re.compile(r'\s+')
def estimate_tokens(text: str) -> int: def estimate_tokens(text: str) -> int:
"""Grobe Schätzung der Token-Anzahl (4 Zeichen pro Token)."""
return max(1, math.ceil(len(text.strip()) / 4)) return max(1, math.ceil(len(text.strip()) / 4))
def split_sentences(text: str) -> list[str]: def split_sentences(text: str) -> list[str]:
"""Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion."""
text = _WS.sub(' ', text.strip()) text = _WS.sub(' ', text.strip())
if not text: return [] if not text: return []
parts = _SENT_SPLIT.split(text) parts = _SENT_SPLIT.split(text)
@ -102,13 +112,26 @@ def split_sentences(text: str) -> list[str]:
@dataclass @dataclass
class RawBlock: class RawBlock:
kind: str; text: str; level: Optional[int]; section_path: str; section_title: Optional[str] kind: str
text: str
level: Optional[int]
section_path: str
section_title: Optional[str]
@dataclass @dataclass
class Chunk: class Chunk:
id: str; note_id: str; index: int; text: str; window: str; token_count: int id: str
section_title: Optional[str]; section_path: str note_id: str
neighbors_prev: Optional[str]; neighbors_next: Optional[str] index: int
text: str
window: str
token_count: int
section_title: Optional[str]
section_path: str
neighbors_prev: Optional[str]
neighbors_next: Optional[str]
# WP-15b: Liste von Kandidaten für die semantische Validierung
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
suggested_edges: Optional[List[str]] = None suggested_edges: Optional[List[str]] = None
# ========================================== # ==========================================
@ -118,7 +141,7 @@ class Chunk:
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
""" """
Zerlegt Text in logische Blöcke (Absätze, Header). Zerlegt Text in logische Blöcke (Absätze, Header).
Wichtig für die Strategie 'by_heading'. Wichtig für die Strategie 'by_heading' und die Edge-Inheritance.
""" """
blocks = [] blocks = []
h1_title = "Dokument" h1_title = "Dokument"
@ -165,14 +188,15 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]: def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
""" """
Die Standard-Strategie aus WP-15. Standard-Strategie aus WP-15.
Fasst Blöcke zusammen und schneidet bei 'target' Tokens (mit Satz-Rücksicht). Fasst Blöcke zusammen und schneidet bei 'target' Tokens.
""" """
target = config.get("target", 400) target = config.get("target", 400)
max_tokens = config.get("max", 600) max_tokens = config.get("max", 600)
overlap_val = config.get("overlap", (50, 80)) overlap_val = config.get("overlap", (50, 80))
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
chunks = []; buf = [] chunks = []
buf = []
def _create_chunk(txt, win, sec, path): def _create_chunk(txt, win, sec, path):
idx = len(chunks) idx = len(chunks)
@ -180,7 +204,7 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt), text=txt, window=win, token_count=estimate_tokens(txt),
section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None, section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
suggested_edges=[] candidate_pool=[]
)) ))
def flush_buffer(): def flush_buffer():
@ -190,14 +214,11 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
text_body = "\n\n".join([b.text for b in buf]) text_body = "\n\n".join([b.text for b in buf])
sec_title = buf[-1].section_title if buf else None sec_title = buf[-1].section_title if buf else None
sec_path = buf[-1].section_path if buf else "/" sec_path = buf[-1].section_path if buf else "/"
# Context Prefix (z.B. H1) voranstellen für Embedding-Qualität
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
if estimate_tokens(text_body) <= max_tokens: if estimate_tokens(text_body) <= max_tokens:
_create_chunk(text_body, win_body, sec_title, sec_path) _create_chunk(text_body, win_body, sec_title, sec_path)
else: else:
# Zu groß -> Satzweiser Split
sentences = split_sentences(text_body) sentences = split_sentences(text_body)
current_chunk_sents = [] current_chunk_sents = []
current_len = 0 current_len = 0
@ -209,15 +230,13 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
_create_chunk(c_txt, c_win, sec_title, sec_path) _create_chunk(c_txt, c_win, sec_title, sec_path)
# Overlap für nächsten Chunk
overlap_sents = [] overlap_sents = []
ov_len = 0 ov_len = 0
for s in reversed(current_chunk_sents): for s in reversed(current_chunk_sents):
if ov_len + estimate_tokens(s) < overlap: if ov_len + estimate_tokens(s) < overlap:
overlap_sents.insert(0, s) overlap_sents.insert(0, s)
ov_len += estimate_tokens(s) ov_len += estimate_tokens(s)
else: else: break
break
current_chunk_sents = list(overlap_sents) current_chunk_sents = list(overlap_sents)
current_chunk_sents.append(sent) current_chunk_sents.append(sent)
@ -226,12 +245,10 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
current_chunk_sents.append(sent) current_chunk_sents.append(sent)
current_len += sent_len current_len += sent_len
# Rest
if current_chunk_sents: if current_chunk_sents:
c_txt = " ".join(current_chunk_sents) c_txt = " ".join(current_chunk_sents)
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
_create_chunk(c_txt, c_win, sec_title, sec_path) _create_chunk(c_txt, c_win, sec_title, sec_path)
buf = [] buf = []
for b in blocks: for b in blocks:
@ -248,132 +265,137 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
""" """
Strategie für strukturierte Daten (Profile, Werte). Hybrid-Strategie v2.9 (Strict/Soft/Safety-Net).
Nutzt sliding_window, forciert aber Schnitte an Headings (via parse_blocks Vorarbeit).
""" """
return _strategy_sliding_window(blocks, config, note_id, doc_title, context_prefix=f"# {doc_title}") strict = config.get("strict_heading_split", False)
target = config.get("target", 400)
max_tokens = config.get("max", 600)
split_level = config.get("split_level", 2)
chunks = []
current_buf = []
current_tokens = 0
def _flush(sec_title, sec_path):
nonlocal current_buf, current_tokens
if not current_buf: return
txt = "\n\n".join(current_buf)
win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt
idx = len(chunks)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=sec_title, section_path=sec_path,
neighbors_prev=None, neighbors_next=None,
candidate_pool=[]
))
current_buf = []
current_tokens = 0
for b in blocks:
if b.kind == "heading":
# Hierarchie-Check: Split bei Überschriften oberhalb des Split-Levels
if b.level < split_level:
_flush(b.section_title, b.section_path)
elif b.level == split_level:
if strict or current_tokens >= target:
_flush(b.section_title, b.section_path)
continue
block_tokens = estimate_tokens(b.text)
if current_tokens + block_tokens > max_tokens and current_buf:
_flush(b.section_title, b.section_path)
current_buf.append(b.text)
current_tokens += block_tokens
if current_buf:
last = blocks[-1] if blocks else None
_flush(last.section_title if last else None, last.section_path if last else "/")
return chunks
# ========================================== # ==========================================
# 4. ROBUST EDGE PARSING & PROPAGATION (NEU) # 4. ROBUST EDGE PARSING & PROPAGATION
# ========================================== # ==========================================
def _parse_edges_robust(text: str) -> Set[str]: def _parse_edges_robust(text: str) -> Set[str]:
""" """
NEU: Findet Kanten im Text, auch wenn sie mehrzeilig oder 'kaputt' formatiert sind. Findet Kanten im Text (Wikilinks, Inlines, Callouts).
Erkennt: Fix V3: Support für mehrzeilige Callouts.
> [!edge] type
> [[Link]]
Returns: Set von Strings "kind:target"
""" """
found_edges = set() found_edges = set()
# A. Inline [[rel:type|target]] (Standard) # A. Inline [[rel:type|target]]
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text) inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
for kind, target in inlines: for kind, target in inlines:
k = kind.strip() k = kind.strip().lower()
t = target.strip() t = target.strip()
if k and t: found_edges.add(f"{k}:{t}") if k and t: found_edges.add(f"{k}:{t}")
# B. Multiline Callouts Parsing (Der Fix für dein Problem) # B. Multiline Callouts Parsing (WP-15 Fix)
lines = text.split('\n') lines = text.split('\n')
current_edge_type = None current_edge_type = None
for line in lines: for line in lines:
stripped = line.strip() stripped = line.strip()
# 1. Start Blockquote: > [!edge] type
# (Erlaubt optionalen Doppelpunkt)
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped) callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
if callout_match: if callout_match:
current_edge_type = callout_match.group(1).strip() current_edge_type = callout_match.group(1).strip().lower()
# Check: Sind Links noch in der GLEICHEN Zeile?
links = re.findall(r'\[\[([^\]]+)\]\]', stripped) links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
for l in links: for l in links:
if "rel:" not in l: if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
found_edges.add(f"{current_edge_type}:{l}")
continue continue
# 2. Continuation Line: > [[Target]]
# Wenn wir noch im 'edge mode' sind und die Zeile ein Zitat ist
if current_edge_type and stripped.startswith('>'): if current_edge_type and stripped.startswith('>'):
links = re.findall(r'\[\[([^\]]+)\]\]', stripped) links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
for l in links: for l in links:
if "rel:" not in l: if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
found_edges.add(f"{current_edge_type}:{l}")
# 3. End of Blockquote (kein '>') -> Reset Type
elif not stripped.startswith('>'): elif not stripped.startswith('>'):
current_edge_type = None current_edge_type = None
return found_edges return found_edges
def _propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]:
""" """
NEU: Verteilt Kanten innerhalb einer Sektion. WP-15b: Implementiert Edge-Inheritance.
Löst das Problem: Callout steht oben im Kapitel, gilt aber für alle Chunks darunter. Kanten aus Überschriften werden an untergeordnete Chunks vererbt.
""" """
# Step 1: Sammeln pro Sektion section_inheritance: Dict[str, Set[str]] = {}
section_map = {} # path -> set(kind:target)
# 1. Sammeln aus den Heading-Blöcken
for b in blocks:
if b.kind == "heading":
edges = _parse_edges_robust(b.text)
if edges:
if b.section_path not in section_inheritance:
section_inheritance[b.section_path] = set()
section_inheritance[b.section_path].update(edges)
# 2. Injektion in den Candidate-Pool
for ch in chunks: for ch in chunks:
# Root-Level "/" ignorieren wir meist, da zu global inherited = section_inheritance.get(ch.section_path, set())
if not ch.section_path or ch.section_path == "/": continue for e_str in inherited:
kind, target = e_str.split(':', 1)
edges = _parse_edges_robust(ch.text) ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"})
if edges:
if ch.section_path not in section_map:
section_map[ch.section_path] = set()
section_map[ch.section_path].update(edges)
# Step 2: Injizieren (Broadcasting)
for ch in chunks:
if ch.section_path in section_map:
edges_to_add = section_map[ch.section_path]
if not edges_to_add: continue
injections = []
for e_str in edges_to_add:
kind, target = e_str.split(':', 1)
# Check: Kante schon im Text?
token = f"[[rel:{kind}|{target}]]"
if token not in ch.text:
injections.append(token)
if injections:
# Wir schreiben die Kanten "hart" in den Text.
# Damit findet sie derive_edges.py später garantiert.
block = "\n\n\n" + " ".join(injections)
ch.text += block
# Auch ins Window schreiben für Embedding-Kontext
ch.window += block
return chunks return chunks
# ========================================== # ==========================================
# 5. ORCHESTRATION (ASYNC) # 5. ORCHESTRATION (WP-15b)
# ========================================== # ==========================================
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
""" """
Hauptfunktion. Verbindet Parsing, Splitting und Edge-Allocation. Hauptfunktion zur Chunk-Generierung.
Baut den Candidate-Pool für die semantische Validierung auf.
""" """
# 1. Config laden (WP-15 Kompatibilität)
if config is None: if config is None:
config = get_chunk_config(note_type) config = get_chunk_config(note_type)
fm, body_text = extract_frontmatter_from_text(md_text) fm, body_text = extract_frontmatter_from_text(md_text)
note_status = fm.get("status", "").lower()
primary_strategy = config.get("strategy", "sliding_window") primary_strategy = config.get("strategy", "sliding_window")
enable_smart_edges = config.get("enable_smart_edge_allocation", False)
# Drafts skippen LLM um Kosten/Zeit zu sparen # 1. Parsing & Splitting
if enable_smart_edges and note_status in ["draft", "initial_gen"]:
logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
enable_smart_edges = False
# 2. Parsing & Splitting
blocks, doc_title = parse_blocks(md_text) blocks, doc_title = parse_blocks(md_text)
if primary_strategy == "by_heading": if primary_strategy == "by_heading":
@ -381,94 +403,45 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
else: else:
chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title) chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)
if not chunks: if not chunks: return []
return []
# 3. NEU: Propagation VOR Smart Edge Allocation # 2. WP-15b: Candidate Pool Vorbereitung
# Das repariert die fehlenden Kanten aus deinen Callouts.
chunks = _propagate_section_edges(chunks) # A. Edge Inheritance (Sektions-Propagation)
chunks = _propagate_section_edges(chunks, blocks)
# B. Explicit Edges (Direkt im Chunk-Text enthalten)
for ch in chunks:
explicit = _parse_edges_robust(ch.text)
for e_str in explicit:
kind, target = e_str.split(':', 1)
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})
# 4. Smart Edges (LLM) # C. Global "Unassigned Pool" Detection (Safety Net)
if enable_smart_edges: # Sucht nach einer Sektion "Unzugeordnete Kanten" im Body
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type) unassigned_pool = set()
pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
if pool_match:
unassigned_pool = _parse_edges_robust(pool_match.group(1))
for ch in chunks:
for e_str in unassigned_pool:
kind, target = e_str.split(':', 1)
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
# 5. Linking # D. De-Duplikation des Pools
for ch in chunks:
seen = set()
unique_pool = []
for cand in ch.candidate_pool:
key = (cand["kind"], cand["to"])
if key not in seen:
seen.add(key)
unique_pool.append(cand)
ch.candidate_pool = unique_pool
# 3. Nachbarschafts-Verkettung (Struktur-Kanten)
for i, ch in enumerate(chunks): for i, ch in enumerate(chunks):
ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_prev = chunks[i-1].id if i > 0 else None
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
return chunks
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
"""
Hilfsfunktion: Sammelt ALLE Kanten für den LLM-Kandidaten-Pool.
"""
# A. Via derive_edges (Standard)
dummy_chunk = {
"chunk_id": f"{note_id}#full",
"text": md_text,
"content": md_text,
"window": md_text,
"type": note_type
}
# Signatur-Anpassung beachten (WP-15 Fix)
raw_edges = build_edges_for_note(
note_id,
[dummy_chunk],
note_level_references=None,
include_note_scope_refs=False
)
all_candidates = set()
for e in raw_edges:
kind = e.get("kind")
target = e.get("target_id")
if target and kind not in ["belongs_to", "next", "prev", "backlink"]:
all_candidates.add(f"{kind}:{target}")
# B. Via Robust Parser (NEU) - fängt die multiline Callouts
robust_edges = _parse_edges_robust(md_text)
all_candidates.update(robust_edges)
return list(all_candidates)
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
"""
Der LLM-Schritt (WP-15). Filtert irrelevante Kanten.
"""
analyzer = get_semantic_analyzer()
candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)
if not candidate_list:
return chunks
tasks = []
for chunk in chunks:
tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
results_per_chunk = await asyncio.gather(*tasks)
assigned_edges_global = set()
for i, confirmed_edges in enumerate(results_per_chunk):
chunk = chunks[i]
chunk.suggested_edges = confirmed_edges
assigned_edges_global.update(confirmed_edges)
if confirmed_edges:
# Wir schreiben auch Smart Edges hart in den Text
injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
chunk.text += injection_str
chunk.window += injection_str
# Fallback für Kanten, die das LLM nirgendwo zugeordnet hat
# (Damit nichts verloren geht -> Safety Fallback)
unassigned = set(candidate_list) - assigned_edges_global
if unassigned:
fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
for chunk in chunks:
chunk.text += fallback_str
chunk.window += fallback_str
if chunk.suggested_edges is None: chunk.suggested_edges = []
chunk.suggested_edges.extend(list(unassigned))
return chunks return chunks

View File

@ -1,17 +1,20 @@
""" """
FILE: app/core/derive_edges.py FILE: app/core/derive_edges.py
DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts. DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts.
VERSION: 2.0.0 WP-15b: Integration des Candidate-Pools und Provenance-Priorisierung.
Sichert die Graph-Integrität durch confidence-basiertes De-Duplicating.
VERSION: 2.1.0
STATUS: Active STATUS: Active
DEPENDENCIES: re, os, yaml, typing DEPENDENCIES: re, os, yaml, typing, hashlib
EXTERNAL_CONFIG: config/types.yaml EXTERNAL_CONFIG: config/types.yaml
LAST_ANALYSIS: 2025-12-15 LAST_ANALYSIS: 2025-12-26
""" """
from __future__ import annotations from __future__ import annotations
import os import os
import re import re
import hashlib
from typing import Iterable, List, Optional, Tuple, Set, Dict from typing import Iterable, List, Optional, Tuple, Set, Dict
try: try:
@ -20,17 +23,18 @@ except Exception: # pragma: no cover
yaml = None yaml = None
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# Utilities # 1. Utilities & ID Generation
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
def _get(d: dict, *keys, default=None): def _get(d: dict, *keys, default=None):
"""Sicherer Zugriff auf verschachtelte Dictionary-Keys."""
for k in keys: for k in keys:
if isinstance(d, dict) and k in d and d[k] is not None: if isinstance(d, dict) and k in d and d[k] is not None:
return d[k] return d[k]
return default return default
def _chunk_text_for_refs(chunk: dict) -> str: def _chunk_text_for_refs(chunk: dict) -> str:
# bevorzugt 'window' → dann 'text' → 'content' → 'raw' """Extrahiert den relevanten Text für die Referenzsuche (bevorzugt Window)."""
return ( return (
_get(chunk, "window") _get(chunk, "window")
or _get(chunk, "text") or _get(chunk, "text")
@ -40,6 +44,7 @@ def _chunk_text_for_refs(chunk: dict) -> str:
) )
def _dedupe_seq(seq: Iterable[str]) -> List[str]: def _dedupe_seq(seq: Iterable[str]) -> List[str]:
"""Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge."""
seen: Set[str] = set() seen: Set[str] = set()
out: List[str] = [] out: List[str] = []
for s in seq: for s in seq:
@ -49,9 +54,10 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]:
return out return out
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
"""Konstruiert ein valides Kanten-Payload-Objekt für Qdrant."""
pl = { pl = {
"kind": kind, "kind": kind,
"relation": kind, # Alias (v2) "relation": kind, # Alias für Abwärtskompatibilität (v2)
"scope": scope, # "chunk" | "note" "scope": scope, # "chunk" | "note"
"source_id": source_id, "source_id": source_id,
"target_id": target_id, "target_id": target_id,
@ -62,25 +68,38 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
return pl return pl
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
"""Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s."""
base = f"{kind}:{s}->{t}#{scope}" base = f"{kind}:{s}->{t}#{scope}"
if rule_id: if rule_id:
base += f"|{rule_id}" base += f"|{rule_id}"
try: try:
import hashlib
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
except Exception: # pragma: no cover except Exception: # pragma: no cover
return base return base
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# Typen-Registry (types.yaml) # 2. Konfiguration & Provenance-Skala
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# WP-15b: Prioritäten-Ranking für die De-Duplizierung
PROVENANCE_PRIORITY = {
"explicit:wikilink": 1.00,
"inline:rel": 0.95,
"callout:edge": 0.90,
"semantic_ai": 0.90, # Validierte KI-Kanten
"structure:belongs_to": 1.00,
"structure:order": 0.95, # next/prev
"explicit:note_scope": 1.00,
"derived:backlink": 0.90,
"edge_defaults": 0.70 # Heuristik (types.yaml)
}
def _env(n: str, default: Optional[str] = None) -> str: def _env(n: str, default: Optional[str] = None) -> str:
v = os.getenv(n) v = os.getenv(n)
return v if v is not None else (default or "") return v if v is not None else (default or "")
def _load_types_registry() -> dict: def _load_types_registry() -> dict:
"""Lädt die YAML-Registry aus MINDNET_TYPES_FILE oder ./config/types.yaml""" """Lädt die YAML-Registry zur Ermittlung von Standard-Kanten."""
p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") p = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
if not os.path.isfile(p) or yaml is None: if not os.path.isfile(p) or yaml is None:
return {} return {}
@ -97,13 +116,7 @@ def _get_types_map(reg: dict) -> dict:
return reg if isinstance(reg, dict) else {} return reg if isinstance(reg, dict) else {}
def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]: def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
""" """Liefert die edge_defaults-Liste für den gegebenen Notiztyp."""
Liefert die edge_defaults-Liste für den gegebenen Notiztyp.
Fallback-Reihenfolge:
1) reg['types'][note_type]['edge_defaults']
2) reg['defaults']['edge_defaults'] (oder 'default'/'global')
3) []
"""
types_map = _get_types_map(reg) types_map = _get_types_map(reg)
if note_type and isinstance(types_map, dict): if note_type and isinstance(types_map, dict):
t = types_map.get(note_type) t = types_map.get(note_type)
@ -116,29 +129,19 @@ def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
return [] return []
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# Parser für Links / Relationen # 3. Parser für Links / Relationen (Core Logik v2.0.0)
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# Normale Wikilinks (Fallback) # Normale Wikilinks (Fallback)
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") _WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
# Getypte Inline-Relationen: # Getypte Inline-Relationen
# [[rel:KIND | Target]]
# [[rel:KIND Target]]
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
# rel: KIND [[Target]] (reines Textmuster)
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
""" """Extrahiert [[rel:KIND|Target]] und entfernt sie zur Vermeidung von Dubletten."""
Gibt Liste (kind, target) zurück und den Text mit entfernten getypten Relation-Links,
damit die generische Wikilink-Erkennung sie nicht doppelt zählt.
Unterstützt drei Varianten:
- [[rel:KIND | Target]]
- [[rel:KIND Target]]
- rel: KIND [[Target]]
"""
pairs: List[Tuple[str,str]] = [] pairs: List[Tuple[str,str]] = []
def _collect(m): def _collect(m):
k = (m.group("kind") or "").strip().lower() k = (m.group("kind") or "").strip().lower()
@ -152,17 +155,13 @@ def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
text = _REL_TEXT.sub(_collect, text) text = _REL_TEXT.sub(_collect, text)
return pairs, text return pairs, text
# Obsidian Callout Parser # Obsidian Callout Parser für mehrzeilige Blöcke
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) _CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE) _REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") _WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
""" """Verarbeitet [!edge]-Callouts und entfernt diese aus dem Textfluss."""
Findet [!edge]-Callouts und extrahiert (kind, target). Entfernt den gesamten
Callout-Block aus dem Text (damit Wikilinks daraus nicht zusätzlich als
"references" gezählt werden).
"""
if not text: if not text:
return [], text return [], text
@ -205,21 +204,20 @@ def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
t = raw.strip() t = raw.strip()
if t: if t:
out_pairs.append((kind, t)) out_pairs.append((kind, t))
# Callout wird NICHT in keep_lines übernommen
continue continue
remainder = "\n".join(keep_lines) remainder = "\n".join(keep_lines)
return out_pairs, remainder return out_pairs, remainder
def _extract_wikilinks(text: str) -> List[str]: def _extract_wikilinks(text: str) -> List[str]:
"""Extrahiert Standard-Wikilinks aus dem verbleibenden Text."""
ids: List[str] = [] ids: List[str] = []
for m in _WIKILINK_RE.finditer(text or ""): for m in _WIKILINK_RE.finditer(text or ""):
ids.append(m.group(1).strip()) ids.append(m.group(1).strip())
return ids return ids
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
# Hauptfunktion # 4. Hauptfunktion (build_edges_for_note)
# --------------------------------------------------------------------------- # # --------------------------------------------------------------------------- #
def build_edges_for_note( def build_edges_for_note(
@ -229,24 +227,13 @@ def build_edges_for_note(
include_note_scope_refs: bool = False, include_note_scope_refs: bool = False,
) -> List[dict]: ) -> List[dict]:
""" """
Erzeugt Kanten für eine Note. Erzeugt und aggregiert alle Kanten für eine Note inklusive WP-15b Candidate-Processing.
Setzt Provenance-Ranking zur Graph-Stabilisierung ein.
- belongs_to: für jeden Chunk (chunk -> note)
- next / prev: zwischen aufeinanderfolgenden Chunks
- references: pro Chunk aus window/text (via Wikilinks)
- typed inline relations: [[rel:KIND | Target]] / [[rel:KIND Target]] / rel: KIND [[Target]]
- Obsidian Callouts: > [!edge] KIND: [[Target]] [[Target2]]
- optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references
- typenbasierte Default-Kanten (edge_defaults) je gefundener Referenz
""" """
edges: List[dict] = [] edges: List[dict] = []
note_type = _get(chunks[0], "type") if chunks else "concept"
# Note-Typ (aus erstem Chunk erwartet) # 1) Struktur-Kanten: belongs_to (Chunk -> Note)
note_type = None
if chunks:
note_type = _get(chunks[0], "type")
# 1) belongs_to
for ch in chunks: for ch in chunks:
cid = _get(ch, "chunk_id", "id") cid = _get(ch, "chunk_id", "id")
if not cid: if not cid:
@ -254,12 +241,12 @@ def build_edges_for_note(
edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, { edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
"chunk_id": cid, "chunk_id": cid,
"edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"), "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"),
"provenance": "rule", "provenance": "structure",
"rule_id": "structure:belongs_to", "rule_id": "structure:belongs_to",
"confidence": 1.0, "confidence": PROVENANCE_PRIORITY["structure:belongs_to"],
})) }))
# 2) next / prev # 2) Struktur-Kanten: next / prev (Sequenz)
for i in range(len(chunks) - 1): for i in range(len(chunks) - 1):
a, b = chunks[i], chunks[i + 1] a, b = chunks[i], chunks[i + 1]
a_id = _get(a, "chunk_id", "id") a_id = _get(a, "chunk_id", "id")
@ -269,19 +256,19 @@ def build_edges_for_note(
edges.append(_edge("next", "chunk", a_id, b_id, note_id, { edges.append(_edge("next", "chunk", a_id, b_id, note_id, {
"chunk_id": a_id, "chunk_id": a_id,
"edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"), "edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"),
"provenance": "rule", "provenance": "structure",
"rule_id": "structure:order", "rule_id": "structure:order",
"confidence": 0.95, "confidence": PROVENANCE_PRIORITY["structure:order"],
})) }))
edges.append(_edge("prev", "chunk", b_id, a_id, note_id, { edges.append(_edge("prev", "chunk", b_id, a_id, note_id, {
"chunk_id": b_id, "chunk_id": b_id,
"edge_id": _mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"), "edge_id": _mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"),
"provenance": "rule", "provenance": "structure",
"rule_id": "structure:order", "rule_id": "structure:order",
"confidence": 0.95, "confidence": PROVENANCE_PRIORITY["structure:order"],
})) }))
# 3) references + typed inline + callouts + defaults (chunk-scope) # 3) Inhaltliche Kanten (Refs, Inlines, Callouts, Candidates)
reg = _load_types_registry() reg = _load_types_registry()
defaults = _edge_defaults_for(note_type, reg) defaults = _edge_defaults_for(note_type, reg)
refs_all: List[str] = [] refs_all: List[str] = []
@ -292,51 +279,49 @@ def build_edges_for_note(
continue continue
raw = _chunk_text_for_refs(ch) raw = _chunk_text_for_refs(ch)
# 3a) typed inline relations # 3a) Typed Inline Relations
typed, remainder = _extract_typed_relations(raw) typed, remainder = _extract_typed_relations(raw)
for kind, target in typed: for kind, target in typed:
kind = kind.strip().lower() k = kind.strip().lower()
if not kind or not target: if not k or not target: continue
continue edges.append(_edge(k, "chunk", cid, target, note_id, {
edges.append(_edge(kind, "chunk", cid, target, note_id, {
"chunk_id": cid, "chunk_id": cid,
"edge_id": _mk_edge_id(kind, cid, target, "chunk", "inline:rel"), "edge_id": _mk_edge_id(k, cid, target, "chunk", "inline:rel"),
"provenance": "explicit", "provenance": "explicit",
"rule_id": "inline:rel", "rule_id": "inline:rel",
"confidence": 0.95, "confidence": PROVENANCE_PRIORITY["inline:rel"],
})) }))
if kind in {"related_to", "similar_to"}:
edges.append(_edge(kind, "chunk", target, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(kind, target, cid, "chunk", "inline:rel"),
"provenance": "explicit",
"rule_id": "inline:rel",
"confidence": 0.95,
}))
# 3b) callouts # 3b) WP-15b Candidate Pool Integration (KI-validierte Kanten)
# Verarbeitet Kanten, die bereits in der Ingestion semantisch geprüft wurden.
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
for cand in pool:
target = cand.get("to")
kind = cand.get("kind", "related_to")
prov = cand.get("provenance", "semantic_ai")
if not target: continue
edges.append(_edge(kind, "chunk", cid, target, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(kind, cid, target, "chunk", f"candidate:{prov}"),
"provenance": prov,
"rule_id": f"candidate:{prov}",
"confidence": PROVENANCE_PRIORITY.get(prov, 0.90),
}))
# 3c) Obsidian Callouts
call_pairs, remainder2 = _extract_callout_relations(remainder) call_pairs, remainder2 = _extract_callout_relations(remainder)
for kind, target in call_pairs: for kind, target in call_pairs:
k = (kind or "").strip().lower() k = (kind or "").strip().lower()
if not k or not target: if not k or not target: continue
continue
edges.append(_edge(k, "chunk", cid, target, note_id, { edges.append(_edge(k, "chunk", cid, target, note_id, {
"chunk_id": cid, "chunk_id": cid,
"edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"), "edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"),
"provenance": "explicit", "provenance": "explicit",
"rule_id": "callout:edge", "rule_id": "callout:edge",
"confidence": 0.95, "confidence": PROVENANCE_PRIORITY["callout:edge"],
})) }))
if k in {"related_to", "similar_to"}:
edges.append(_edge(k, "chunk", target, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(k, target, cid, "chunk", "callout:edge"),
"provenance": "explicit",
"rule_id": "callout:edge",
"confidence": 0.95,
}))
# 3c) generische Wikilinks → references (+ defaults je Ref) # 3d) Standard-Wikilinks -> references (+ defaults)
refs = _extract_wikilinks(remainder2) refs = _extract_wikilinks(remainder2)
for r in refs: for r in refs:
edges.append(_edge("references", "chunk", cid, r, note_id, { edges.append(_edge("references", "chunk", cid, r, note_id, {
@ -345,76 +330,65 @@ def build_edges_for_note(
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"),
"provenance": "explicit", "provenance": "explicit",
"rule_id": "explicit:wikilink", "rule_id": "explicit:wikilink",
"confidence": 1.0, "confidence": PROVENANCE_PRIORITY["explicit:wikilink"],
})) }))
# Regelbasierte Kanten aus types.yaml anhängen
for rel in defaults: for rel in defaults:
if rel == "references": if rel == "references": continue
continue
edges.append(_edge(rel, "chunk", cid, r, note_id, { edges.append(_edge(rel, "chunk", cid, r, note_id, {
"chunk_id": cid, "chunk_id": cid,
"edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"), "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule", "provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}", "rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7, "confidence": PROVENANCE_PRIORITY["edge_defaults"],
})) }))
if rel in {"related_to", "similar_to"}:
edges.append(_edge(rel, "chunk", r, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(rel, r, cid, "chunk", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
}))
refs_all.extend(refs) refs_all.extend(refs)
# 4) optional note-scope refs/backlinks (+ defaults) # 4) Optionale Note-Scope Referenzen & Backlinks
if include_note_scope_refs: if include_note_scope_refs:
refs_note = list(refs_all or []) refs_note = list(refs_all or [])
if note_level_references: if note_level_references:
refs_note.extend([r for r in note_level_references if isinstance(r, str) and r]) refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])
refs_note = _dedupe_seq(refs_note) refs_note = _dedupe_seq(refs_note)
for r in refs_note: for r in refs_note:
edges.append(_edge("references", "note", note_id, r, note_id, { edges.append(_edge("references", "note", note_id, r, note_id, {
"edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"), "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
"provenance": "explicit", "provenance": "explicit",
"rule_id": "explicit:note_scope", "rule_id": "explicit:note_scope",
"confidence": 1.0, "confidence": PROVENANCE_PRIORITY["explicit:note_scope"],
})) }))
# Backlink-Erzeugung zur Graphen-Stärkung
edges.append(_edge("backlink", "note", r, note_id, note_id, { edges.append(_edge("backlink", "note", r, note_id, note_id, {
"edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"), "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"),
"provenance": "rule", "provenance": "rule",
"rule_id": "derived:backlink", "rule_id": "derived:backlink",
"confidence": 0.9, "confidence": PROVENANCE_PRIORITY["derived:backlink"],
})) }))
for rel in defaults: for rel in defaults:
if rel == "references": if rel == "references": continue
continue
edges.append(_edge(rel, "note", note_id, r, note_id, { edges.append(_edge(rel, "note", note_id, r, note_id, {
"edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"), "edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule", "provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}", "rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7, "confidence": PROVENANCE_PRIORITY["edge_defaults"],
})) }))
if rel in {"related_to", "similar_to"}:
edges.append(_edge(rel, "note", r, note_id, note_id, {
"edge_id": _mk_edge_id(rel, r, note_id, "note", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
}))
# 5) De-Dupe (source_id, target_id, relation, rule_id) # 5) WP-15b: Confidence-basierte De-Duplizierung
seen: Set[Tuple[str,str,str,str]] = set() # Wenn dieselbe Relation mehrfach existiert, gewinnt die mit der höchsten Confidence.
out: List[dict] = [] unique_map: Dict[Tuple[str, str, str], dict] = {}
for e in edges: for e in edges:
s = str(e.get("source_id") or "") s, t = str(e.get("source_id")), str(e.get("target_id"))
t = str(e.get("target_id") or "")
rel = str(e.get("relation") or e.get("kind") or "edge") rel = str(e.get("relation") or e.get("kind") or "edge")
rule = str(e.get("rule_id") or "") key = (s, t, rel)
key = (s, t, rel, rule)
if key in seen: if key not in unique_map:
continue unique_map[key] = e
seen.add(key) else:
out.append(e) # Vergleich der Vertrauenswürdigkeit (Provenance Ranking)
return out if e.get("confidence", 0) > unique_map[key].get("confidence", 0):
unique_map[key] = e
return list(unique_map.values())

View File

@ -3,12 +3,12 @@ FILE: app/core/ingestion.py
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen. DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free). WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free).
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash. WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation.
JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust FIX: Beibehaltung der Deep Fallback Logic (v2.11.14) zur JSON-Recovery.
bei umfangreichen Protokollen zu verhindern. VERSION: 2.12.0
VERSION: 2.11.14
STATUS: Active STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker,
app.services.llm_service, app.services.edge_registry
""" """
import os import os
import json import json
@ -21,9 +21,11 @@ from typing import Dict, List, Optional, Tuple, Any
# Core Module Imports # Core Module Imports
from app.core.parser import ( from app.core.parser import (
read_markdown, read_markdown,
pre_scan_markdown,
normalize_frontmatter, normalize_frontmatter,
validate_required_frontmatter, validate_required_frontmatter,
extract_edges_with_context, extract_edges_with_context,
NoteContext
) )
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks, get_chunk_config from app.core.chunker import assemble_chunks, get_chunk_config
@ -49,7 +51,7 @@ from app.services.llm_service import LLMService
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# --- Global Helpers --- # --- Global Helpers (Full Compatibility v2.11.14) ---
def extract_json_from_response(text: str) -> Any: def extract_json_from_response(text: str) -> Any:
""" """
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama). Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama).
@ -115,6 +117,7 @@ class IngestionService:
self.llm = LLMService() self.llm = LLMService()
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
try: try:
ensure_collections(self.client, self.prefix, self.dim) ensure_collections(self.client, self.prefix, self.dim)
@ -122,6 +125,54 @@ class IngestionService:
except Exception as e: except Exception as e:
logger.warning(f"DB init warning: {e}") logger.warning(f"DB init warning: {e}")
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
"""
WP-15b: Implementiert den Two-Pass Ingestion Workflow.
Pass 1: Pre-Scan baut Kontext-Cache auf.
Pass 2: Processing führt semantische Validierung durch.
"""
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Batch Cache...")
for path in file_paths:
ctx = pre_scan_markdown(path)
if ctx:
self.batch_cache[ctx.note_id] = ctx
logger.info(f"🚀 [Pass 2] Processing {len(file_paths)} files...")
results = []
for path in file_paths:
res = await self.process_file(path, vault_root, apply=True)
results.append(res)
return results
async def _validate_candidate(self, chunk_text: str, edge: Dict) -> bool:
"""
WP-15b: Validiert einen Kanten-Kandidaten semantisch gegen das Ziel.
Nutzt den Cache aus Pass 1, um dem LLM Kontext der Ziel-Note zu geben.
"""
target_id = edge.get("to")
target_ctx = self.batch_cache.get(target_id)
# Falls Zielnotiz nicht im aktuellen Batch ist: 'explicit' durchlassen (Hard-Link Integrity)
if not target_ctx:
return True
provider = self.settings.MINDNET_LLM_PROVIDER
template = self.llm.get_prompt("edge_validation", provider)
try:
prompt = template.format(
chunk_text=chunk_text[:1500],
target_title=target_ctx.title,
target_summary=target_ctx.summary,
edge_kind=edge.get("kind", "related_to")
)
response = await self.llm.generate_raw_response(prompt, priority="background")
return "YES" in response.upper()
except Exception as e:
logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}")
return True # Fallback: Im Zweifel Link behalten
def _resolve_note_type(self, requested: Optional[str]) -> str: def _resolve_note_type(self, requested: Optional[str]) -> str:
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
types = self.registry.get("types", {}) types = self.registry.get("types", {})
@ -138,109 +189,12 @@ class IngestionService:
return cfg return cfg
return get_chunk_config(note_type) return get_chunk_config(note_type)
async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
"""
KI-Extraktion mit Deep-Fallback Logik.
Erzwingt den lokalen Ollama-Sprung, wenn die Cloud-Antwort keine verwertbaren
Kanten liefert (häufig bei Policy Violations auf OpenRouter).
"""
provider = self.settings.MINDNET_LLM_PROVIDER
model = self.settings.OPENROUTER_MODEL if provider == "openrouter" else self.settings.GEMINI_MODEL
logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}")
edge_registry.ensure_latest()
valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
template = self.llm.get_prompt("edge_extraction", provider)
try:
try:
# Wir begrenzen den Kontext auf 6000 Zeichen (ca. 1500 Token)
prompt = template.format(
text=text[:6000],
note_id=note_id,
valid_types=valid_types_str
)
except KeyError as ke:
logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt).")
return []
# 1. Versuch: Anfrage an den primären Cloud-Provider
response_json = await self.llm.generate_raw_response(
prompt=prompt, priority="background", force_json=True,
provider=provider, model_override=model
)
# Initiales Parsing
raw_data = extract_json_from_response(response_json)
# 2. Dictionary Recovery (Versuche Liste aus Dict zu extrahieren)
candidates = []
if isinstance(raw_data, list):
candidates = raw_data
elif isinstance(raw_data, dict):
logger.info(f" [Ingestion] LLM returned dict, checking for embedded lists in {note_id}")
for k in ["edges", "links", "results", "kanten", "matches", "edge_list"]:
if k in raw_data and isinstance(raw_data[k], list):
candidates = raw_data[k]
break
# Wenn immer noch keine Liste gefunden, versuche Key-Value Paare (Dict Recovery)
if not candidates:
for k, v in raw_data.items():
if isinstance(v, str): candidates.append(f"{k}:{v}")
elif isinstance(v, list): [candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)]
# 3. DEEP FALLBACK: Wenn nach allen Recovery-Versuchen die Liste leer ist UND wir in der Cloud waren
# Triggert den Fallback bei "Data Policy Violations" (leere oder Fehler-JSONs).
if not candidates and provider != "ollama" and self.settings.LLM_FALLBACK_ENABLED:
logger.warning(
f"🛑 [Ingestion] Cloud-Antwort für {note_id} lieferte keine verwertbaren Kanten. "
f"Mögliche Policy Violation oder Refusal. Erzwinge LOKALEN FALLBACK via Ollama..."
)
response_json_local = await self.llm.generate_raw_response(
prompt=prompt, priority="background", force_json=True, provider="ollama"
)
raw_data_local = extract_json_from_response(response_json_local)
# Wiederhole Recovery für lokale Antwort
if isinstance(raw_data_local, list):
candidates = raw_data_local
elif isinstance(raw_data_local, dict):
for k in ["edges", "links", "results"]:
if k in raw_data_local and isinstance(raw_data_local[k], list):
candidates = raw_data_local[k]; break
if not candidates:
logger.warning(f"⚠️ [Ingestion] Auch nach Fallback keine extrahierbaren Kanten für {note_id}")
return []
processed = []
for item in candidates:
if isinstance(item, dict) and "to" in item:
item["provenance"] = "semantic_ai"
item["line"] = f"ai-{provider}"
processed.append(item)
elif isinstance(item, str) and ":" in item:
parts = item.split(":", 1)
processed.append({
"to": parts[1].strip(),
"kind": parts[0].strip(),
"provenance": "semantic_ai",
"line": f"ai-{provider}"
})
return processed
except Exception as e:
logger.warning(f"⚠️ [Ingestion] Smart Edge Allocation failed for {note_id}: {e}")
return []
async def process_file( async def process_file(
self, file_path: str, vault_root: str, self, file_path: str, vault_root: str,
force_replace: bool = False, apply: bool = False, purge_before: bool = False, force_replace: bool = False, apply: bool = False, purge_before: bool = False,
note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical" note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical"
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Transformiert eine Markdown-Datei in den Graphen (Notes, Chunks, Edges).""" """Transformiert eine Markdown-Datei in den Graphen."""
result = {"path": file_path, "status": "skipped", "changed": False, "error": None} result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
# 1. Parse & Lifecycle Gate # 1. Parse & Lifecycle Gate
@ -252,12 +206,12 @@ class IngestionService:
except Exception as e: except Exception as e:
return {**result, "error": f"Validation failed: {str(e)}"} return {**result, "error": f"Validation failed: {str(e)}"}
# WP-22: Filter für Systemdateien und Entwürfe # Lifecycle Filter (WP-22)
status = fm.get("status", "draft").lower().strip() status = fm.get("status", "draft").lower().strip()
if status in ["system", "template", "archive", "hidden"]: if status in ["system", "template", "archive", "hidden"]:
return {**result, "status": "skipped", "reason": f"lifecycle_{status}"} return {**result, "status": "skipped", "reason": f"lifecycle_{status}"}
# 2. Config Resolution & Payload Construction # 2. Config Resolution & Payload
note_type = self._resolve_note_type(fm.get("type")) note_type = self._resolve_note_type(fm.get("type"))
fm["type"] = note_type fm["type"] = note_type
@ -267,15 +221,13 @@ class IngestionService:
except Exception as e: except Exception as e:
return {**result, "error": f"Payload failed: {str(e)}"} return {**result, "error": f"Payload failed: {str(e)}"}
# 3. Change Detection (Strikte DoD Umsetzung) # 3. Change Detection (v2.11.14 Logic)
old_payload = None if force_replace else self._fetch_note_payload(note_id) old_payload = None if force_replace else self._fetch_note_payload(note_id)
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
old_hash = (old_payload or {}).get("hashes", {}).get(check_key) old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
new_hash = note_pl.get("hashes", {}).get(check_key) new_hash = note_pl.get("hashes", {}).get(check_key)
# Prüfung auf fehlende Artefakte in Qdrant
chunks_missing, edges_missing = self._artifacts_missing(note_id) chunks_missing, edges_missing = self._artifacts_missing(note_id)
should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing
if not should_write: if not should_write:
@ -284,40 +236,42 @@ class IngestionService:
if not apply: if not apply:
return {**result, "status": "dry-run", "changed": True, "note_id": note_id} return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
# 4. Processing (Chunking, Embedding, AI Edges) # 4. Processing (Chunking, Embedding, Validated Edges)
try: try:
body_text = getattr(parsed, "body", "") or "" body_text = getattr(parsed, "body", "") or ""
edge_registry.ensure_latest() edge_registry.ensure_latest()
# Profil-gesteuertes Chunking # Chunker Resolution
profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard" profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
chunk_cfg = self._get_chunk_config_by_profile(profile, note_type) chunk_cfg = self._get_chunk_config_by_profile(profile, note_type)
chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg) chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg)
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
# Vektorisierung # Embeddings
vecs = [] vecs = []
if chunk_pls: if chunk_pls:
texts = [c.get("window") or c.get("text") or "" for c in chunk_pls] texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]
vecs = await self.embedder.embed_documents(texts) vecs = await self.embedder.embed_documents(texts)
# Kanten-Extraktion # Kanten-Extraktion & WP-15b Validierung
edges = [] edges = []
context = {"file": file_path, "note_id": note_id} context = {"file": file_path, "note_id": note_id}
# A. Explizite Kanten (User / Wikilinks) # A. Explizite Kandidaten (Wikilinks)
for e in extract_edges_with_context(parsed): raw_candidates = extract_edges_with_context(parsed)
e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")}) for cand in raw_candidates:
edges.append(e) # Semantische Prüfung gegen Pass 1 Cache
if await self._validate_candidate(body_text, cand):
cand["kind"] = edge_registry.resolve(
edge_type=cand["kind"],
provenance="explicit",
context={**context, "line": cand.get("line")}
)
edges.append(cand)
else:
logger.info(f"🚫 WP-15b: Candidate rejected: {cand['kind']} -> {cand['to']}")
# B. KI Kanten (Turbo Mode mit v2.11.14 Fallback) # B. System Kanten (Struktur)
ai_edges = await self._perform_smart_edge_allocation(body_text, note_id)
for e in ai_edges:
valid_kind = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
e["kind"] = valid_kind
edges.append(e)
# C. System Kanten (Struktur)
try: try:
sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs) sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
except: except:

View File

@ -2,10 +2,11 @@
FILE: app/core/parser.py FILE: app/core/parser.py
DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body. DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry. WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
VERSION: 1.8.0 WP-15b: Implementierung NoteContext und pre_scan_markdown für Pass 1 Ingestion.
VERSION: 1.9.0
STATUS: Active STATUS: Active
DEPENDENCIES: yaml, re, dataclasses, json, io, os DEPENDENCIES: yaml, re, dataclasses, json, io, os
LAST_ANALYSIS: 2025-12-23 LAST_ANALYSIS: 2025-12-26
""" """
from __future__ import annotations from __future__ import annotations
@ -32,6 +33,15 @@ class ParsedNote:
body: str body: str
path: str path: str
@dataclass
class NoteContext:
"""Metadaten-Container für den flüchtigen LocalBatchCache (Pass 1)."""
note_id: str
title: str
type: str
summary: str
tags: List[str]
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
# Frontmatter-Erkennung # Frontmatter-Erkennung
@ -152,6 +162,32 @@ def read_markdown(path: str) -> Optional[ParsedNote]:
return ParsedNote(frontmatter=fm or {}, body=body or "", path=path) return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
def pre_scan_markdown(path: str) -> Optional[NoteContext]:
"""
WP-15b: Schneller Scan für den LocalBatchCache (Pass 1).
Extrahiert nur Identität und Kurz-Kontext zur semantischen Validierung.
"""
parsed = read_markdown(path)
if not parsed:
return None
fm = parsed.frontmatter
# ID-Findung: Frontmatter ID oder Dateiname als Fallback
note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
# Erstelle Kurz-Zusammenfassung (erste 500 Zeichen des Body, bereinigt)
clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip()
summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body
return NoteContext(
note_id=note_id,
title=str(fm.get("title", note_id)),
type=str(fm.get("type", "concept")),
summary=summary,
tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else []
)
def validate_required_frontmatter(fm: Dict[str, Any], def validate_required_frontmatter(fm: Dict[str, Any],
required: Tuple[str, ...] = ("id", "title")) -> None: required: Tuple[str, ...] = ("id", "title")) -> None:
""" """

View File

@ -1,11 +1,14 @@
""" """
FILE: app/services/edge_registry.py FILE: app/services/edge_registry.py
DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload. DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload.
WP-15b: Erweiterte Provenance-Prüfung für die Candidate-Validation.
Sichert die Graph-Integrität durch strikte Trennung von System- und Inhaltskanten.
WP-22: Fix für absolute Pfade außerhalb des Vaults (Prod-Dictionary). WP-22: Fix für absolute Pfade außerhalb des Vaults (Prod-Dictionary).
WP-20: Synchronisation mit zentralen Settings (v0.6.2). WP-20: Synchronisation mit zentralen Settings (v0.6.2).
VERSION: 0.7.5 VERSION: 0.8.0
STATUS: Active STATUS: Active
DEPENDENCIES: re, os, json, logging, time, app.config DEPENDENCIES: re, os, json, logging, time, app.config
LAST_ANALYSIS: 2025-12-26
""" """
import re import re
import os import os
@ -19,7 +22,12 @@ from app.config import get_settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class EdgeRegistry: class EdgeRegistry:
"""
Zentraler Verwalter für das Kanten-Vokabular.
Implementiert das Singleton-Pattern für konsistente Validierung über alle Services.
"""
_instance = None _instance = None
# System-Kanten, die nicht durch User oder KI gesetzt werden dürfen
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"} FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
@ -51,7 +59,7 @@ class EdgeRegistry:
def ensure_latest(self): def ensure_latest(self):
""" """
Prüft den Zeitstempel der Vokabular-Datei und lädt bei Bedarf neu. Prüft den Zeitstempel der Vokabular-Datei und lädt bei Bedarf neu.
Verhindert den AttributeError in der Ingestion-Pipeline. Verhindert Inkonsistenzen bei Laufzeit-Updates des Dictionaries.
""" """
if not os.path.exists(self.full_vocab_path): if not os.path.exists(self.full_vocab_path):
logger.error(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!") logger.error(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!")
@ -66,7 +74,10 @@ class EdgeRegistry:
logger.error(f"!!! [EDGE-REGISTRY] Error checking file time: {e}") logger.error(f"!!! [EDGE-REGISTRY] Error checking file time: {e}")
def _load_vocabulary(self): def _load_vocabulary(self):
"""Parst das Markdown-Wörterbuch und baut die Canonical-Map auf.""" """
Parst das Markdown-Wörterbuch und baut die Canonical-Map auf.
Erkennt Tabellen-Strukturen und extrahiert fettgedruckte System-Typen.
"""
self.canonical_map.clear() self.canonical_map.clear()
self.valid_types.clear() self.valid_types.clear()
@ -101,8 +112,8 @@ class EdgeRegistry:
def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str: def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str:
""" """
Validiert einen Kanten-Typ gegen das Vokabular. WP-15b: Validiert einen Kanten-Typ gegen das Vokabular und prüft Berechtigungen.
Loggt unbekannte Typen für die spätere manuelle Pflege. Sichert, dass nur strukturelle Prozesse System-Kanten setzen dürfen.
""" """
self.ensure_latest() self.ensure_latest()
if not edge_type: if not edge_type:
@ -112,20 +123,23 @@ class EdgeRegistry:
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_") clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
ctx = context or {} ctx = context or {}
# System-Kanten dürfen nicht manuell vergeben werden # WP-15b: System-Kanten dürfen weder manuell noch durch KI/Vererbung gesetzt werden.
if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES: # Nur Provenienz 'structure' (interne Prozesse) ist autorisiert.
self._log_issue(clean_type, "forbidden_system_usage", ctx) # Wir blockieren hier alle Provenienzen außer 'structure'.
restricted_provenance = ["explicit", "semantic_ai", "inherited", "global_pool", "rule"]
if provenance in restricted_provenance and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
self._log_issue(clean_type, f"forbidden_usage_by_{provenance}", ctx)
return "related_to" return "related_to"
# System-Kanten sind nur bei struktureller Provenienz erlaubt # System-Kanten sind NUR bei struktureller Provenienz erlaubt
if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES: if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
return clean_type return clean_type
# Mapping auf kanonischen Namen # Mapping auf kanonischen Namen (Alias-Auflösung)
if clean_type in self.canonical_map: if clean_type in self.canonical_map:
return self.canonical_map[clean_type] return self.canonical_map[clean_type]
# Fallback und Logging # Fallback und Logging unbekannter Typen für Admin-Review
self._log_issue(clean_type, "unknown_type", ctx) self._log_issue(clean_type, "unknown_type", ctx)
return clean_type return clean_type
@ -139,12 +153,13 @@ class EdgeRegistry:
"error": error_kind, "error": error_kind,
"file": ctx.get("file", "unknown"), "file": ctx.get("file", "unknown"),
"line": ctx.get("line", "unknown"), "line": ctx.get("line", "unknown"),
"note_id": ctx.get("note_id", "unknown") "note_id": ctx.get("note_id", "unknown"),
"provenance": ctx.get("provenance", "unknown")
} }
with open(self.unknown_log_path, "a", encoding="utf-8") as f: with open(self.unknown_log_path, "a", encoding="utf-8") as f:
f.write(json.dumps(entry) + "\n") f.write(json.dumps(entry) + "\n")
except Exception: except Exception:
pass pass
# Singleton Export # Singleton Export für systemweiten Zugriff
registry = EdgeRegistry() registry = EdgeRegistry()

View File

@ -1,6 +1,7 @@
# config/prompts.yaml — Final V2.5.5 (OpenRouter Hardening) # config/prompts.yaml — Final V2.6.0 (WP-15b Candidate-Validation)
# WP-20: Optimierte Cloud-Templates zur Unterdrückung von Modell-Geschwätz. # WP-20: Optimierte Cloud-Templates zur Unterdrückung von Modell-Geschwätz.
# FIX: Explizite Verbote für Einleitungstexte zur Vermeidung von JSON-Parsing-Fehlern. # FIX: Explizite Verbote für Einleitungstexte zur Vermeidung von JSON-Parsing-Fehlern.
# WP-15b: Integration der binären edge_validation für den Two-Pass Workflow.
# OLLAMA: UNVERÄNDERT laut Benutzeranweisung. # OLLAMA: UNVERÄNDERT laut Benutzeranweisung.
system_prompt: | system_prompt: |
@ -215,7 +216,7 @@ edge_extraction:
4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten. 4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten.
BEISPIEL: BEISPIEL:
[[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]] [[ {{"to": "Ziel-Konzept", \"kind\": \"beziehungs_typ\"}} ]]
TEXT: TEXT:
""" """
@ -227,13 +228,46 @@ edge_extraction:
Analysiere '{note_id}'. Extrahiere semantische Beziehungen. Analysiere '{note_id}'. Extrahiere semantische Beziehungen.
ERLAUBTE TYPEN: {valid_types} ERLAUBTE TYPEN: {valid_types}
TEXT: {text} TEXT: {text}
OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: []. OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: [].
openrouter: | openrouter: |
TASK: Extrahiere semantische Relationen für '{note_id}'. TASK: Extrahiere semantische Relationen für '{note_id}'.
ERLAUBTE TYPEN: {valid_types} ERLAUBTE TYPEN: {valid_types}
TEXT: {text} TEXT: {text}
ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten. ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten.
FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]] FORMAT: [[{{"to\":\"Ziel-Begriff\",\"kind\":\"typ\"}}]]
STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen. STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen.
Wenn keine Relationen existieren, antworte NUR mit: [] Wenn keine Relationen existieren, antworte NUR mit: []
OUTPUT: OUTPUT:
# ---------------------------------------------------------
# 8. WP-15b: EDGE VALIDATION (Intent: VALIDATE)
# ---------------------------------------------------------
edge_validation:
gemini: |
Bewerte die semantische Validität dieser Verbindung im Wissensgraph.
KONTEXT DER QUELLE (Chunk):
"{chunk_text}"
ZIEL-NOTIZ: "{target_title}"
ZIEL-BESCHREIBUNG (Zusammenfassung):
"{target_summary}"
GEPLANTE RELATION: "{edge_kind}"
FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel?
REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk.
openrouter: |
Verify semantic relation for graph construction.
Source Context: {chunk_text}
Target Note: {target_title}
Target Summary: {target_summary}
Proposed Relation: {edge_kind}
Instruction: Does the source context support this relation to the target?
Result: Respond ONLY with 'YES' or 'NO'.
ollama: |
Bewerte die semantische Korrektheit dieser Verbindung.
QUELLE: {chunk_text}
ZIEL: {target_title} ({target_summary})
BEZIEHUNG: {edge_kind}
Ist diese Verbindung valide? Antworte NUR mit YES oder NO.