WP15b - Initial

This commit is contained in:
Lars 2025-12-26 21:52:08 +01:00
parent d1a065fec8
commit f6b2375d65
6 changed files with 441 additions and 455 deletions

View File

@ -1,13 +1,16 @@
"""
FILE: app/core/chunker.py
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings).
Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer.
FIX V3: Support für mehrzeilige Callouts und Section-Propagation.
VERSION: 3.1.0 (Full Compatibility Merge)
WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung.
Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung.
Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net).
VERSION: 3.2.0
STATUS: Active
DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging
"""
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple, Any, Set
import re
import math
@ -17,15 +20,18 @@ import asyncio
import logging
# Services
from app.services.semantic_analyzer import get_semantic_analyzer
# In WP-15b wird die KI-Validierung in die ingestion.py verlagert.
# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen.
try:
from app.services.semantic_analyzer import get_semantic_analyzer
except ImportError:
def get_semantic_analyzer(): return None
# Core Imports
# Wir importieren build_edges_for_note nur, um kompatibel zur Signatur zu bleiben
# oder für den Fallback.
try:
from app.core.derive_edges import build_edges_for_note
except ImportError:
# Mock für Tests
# Fallback für Standalone-Betrieb oder Tests
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
logger = logging.getLogger(__name__)
@ -54,7 +60,7 @@ def _load_yaml_config() -> Dict[str, Any]:
def get_chunk_config(note_type: str) -> Dict[str, Any]:
"""
Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml.
Dies sichert die Kompatibilität zu WP-15 (Profile).
Sichert die Kompatibilität zu WP-15 Profilen.
"""
full_config = _load_yaml_config()
profiles = full_config.get("chunking_profiles", {})
@ -75,6 +81,7 @@ def get_chunk_config(note_type: str) -> Dict[str, Any]:
return config
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
"""Trennt YAML-Frontmatter vom eigentlichen Text."""
fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
if not fm_match: return {}, md_text
try:
@ -89,12 +96,15 @@ def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
# 2. DATA CLASSES & TEXT TOOLS
# ==========================================
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+')
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
_WS = re.compile(r'\s+')
def estimate_tokens(text: str) -> int:
    """Rough token-count estimate: about one token per 4 characters, minimum 1."""
    char_count = len(text.strip())
    return max(math.ceil(char_count / 4), 1)
def split_sentences(text: str) -> list[str]:
"""Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion."""
text = _WS.sub(' ', text.strip())
if not text: return []
parts = _SENT_SPLIT.split(text)
@ -102,13 +112,26 @@ def split_sentences(text: str) -> list[str]:
@dataclass
class RawBlock:
    """A logical source block produced by parse_blocks (heading or body text)."""
    kind: str                     # "heading" for headings; other values are body blocks — TODO confirm full set
    text: str                     # raw markdown text of the block
    level: Optional[int]          # heading level; None for non-heading blocks
    section_path: str             # hierarchical section path the block belongs to
    section_title: Optional[str]  # title of the enclosing section, if any
@dataclass
class Chunk:
    """One embeddable chunk of a note, plus its WP-15b edge-candidate pool."""
    id: str                        # "{note_id}#cNN"
    note_id: str
    index: int                     # 0-based position within the note
    text: str                      # chunk body text
    window: str                    # text with context prefix, used for embedding
    token_count: int               # estimated tokens (see estimate_tokens)
    section_title: Optional[str]
    section_path: str
    neighbors_prev: Optional[str]  # id of previous chunk, set during linking
    neighbors_next: Optional[str]  # id of next chunk, set during linking
    # WP-15b: edge candidates for semantic validation; dicts with
    # "kind", "to" and "provenance" keys.
    candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
    suggested_edges: Optional[List[str]] = None
# ==========================================
@ -118,7 +141,7 @@ class Chunk:
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
"""
Zerlegt Text in logische Blöcke (Absätze, Header).
Wichtig für die Strategie 'by_heading'.
Wichtig für die Strategie 'by_heading' und die Edge-Inheritance.
"""
blocks = []
h1_title = "Dokument"
@ -165,14 +188,15 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
"""
Die Standard-Strategie aus WP-15.
Fasst Blöcke zusammen und schneidet bei 'target' Tokens (mit Satz-Rücksicht).
Standard-Strategie aus WP-15.
Fasst Blöcke zusammen und schneidet bei 'target' Tokens.
"""
target = config.get("target", 400)
max_tokens = config.get("max", 600)
overlap_val = config.get("overlap", (50, 80))
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
chunks = []; buf = []
chunks = []
buf = []
def _create_chunk(txt, win, sec, path):
idx = len(chunks)
@ -180,7 +204,7 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
suggested_edges=[]
candidate_pool=[]
))
def flush_buffer():
@ -190,14 +214,11 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
text_body = "\n\n".join([b.text for b in buf])
sec_title = buf[-1].section_title if buf else None
sec_path = buf[-1].section_path if buf else "/"
# Context Prefix (z.B. H1) voranstellen für Embedding-Qualität
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
if estimate_tokens(text_body) <= max_tokens:
_create_chunk(text_body, win_body, sec_title, sec_path)
else:
# Zu groß -> Satzweiser Split
sentences = split_sentences(text_body)
current_chunk_sents = []
current_len = 0
@ -209,15 +230,13 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
_create_chunk(c_txt, c_win, sec_title, sec_path)
# Overlap für nächsten Chunk
overlap_sents = []
ov_len = 0
for s in reversed(current_chunk_sents):
if ov_len + estimate_tokens(s) < overlap:
overlap_sents.insert(0, s)
ov_len += estimate_tokens(s)
else:
break
else: break
current_chunk_sents = list(overlap_sents)
current_chunk_sents.append(sent)
@ -226,12 +245,10 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
current_chunk_sents.append(sent)
current_len += sent_len
# Rest
if current_chunk_sents:
c_txt = " ".join(current_chunk_sents)
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
_create_chunk(c_txt, c_win, sec_title, sec_path)
buf = []
for b in blocks:
@ -248,132 +265,137 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """
    Chunking strategy for structured notes (profiles, value tables).

    Hybrid v2.9 behaviour (strict / soft / safety net):
      - headings above ``split_level`` always flush the current buffer,
      - headings at ``split_level`` flush only when ``strict_heading_split``
        is set or the buffer already reached ``target`` tokens,
      - any block that would push the buffer past ``max`` tokens forces a
        flush first (safety net).

    Heading blocks themselves are never appended to the chunk text; they only
    label the emitted chunks.
    NOTE(review): a flush triggered by a NEW heading labels the previous
    buffer with the new heading's section title/path — confirm intended.
    """
    strict = config.get("strict_heading_split", False)
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    split_level = config.get("split_level", 2)
    chunks: List[Chunk] = []
    current_buf: List[str] = []
    current_tokens = 0

    def _flush(sec_title, sec_path):
        # Emit the buffered paragraphs as one Chunk and reset the buffer.
        nonlocal current_buf, current_tokens
        if not current_buf: return
        txt = "\n\n".join(current_buf)
        win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec_title, section_path=sec_path,
            neighbors_prev=None, neighbors_next=None,
            candidate_pool=[]
        ))
        current_buf = []
        current_tokens = 0

    for b in blocks:
        if b.kind == "heading":
            # Hierarchy check: split on headings at or above the split level.
            if b.level < split_level:
                _flush(b.section_title, b.section_path)
            elif b.level == split_level:
                if strict or current_tokens >= target:
                    _flush(b.section_title, b.section_path)
            continue
        block_tokens = estimate_tokens(b.text)
        # Safety net: never grow a chunk beyond max_tokens.
        if current_tokens + block_tokens > max_tokens and current_buf:
            _flush(b.section_title, b.section_path)
        current_buf.append(b.text)
        current_tokens += block_tokens

    # Flush the trailing buffer, labelled with the last seen block's section.
    if current_buf:
        last = blocks[-1] if blocks else None
        _flush(last.section_title if last else None, last.section_path if last else "/")
    return chunks
# ==========================================
# 4. ROBUST EDGE PARSING & PROPAGATION (NEU)
# 4. ROBUST EDGE PARSING & PROPAGATION
# ==========================================
def _parse_edges_robust(text: str) -> Set[str]:
"""
NEU: Findet Kanten im Text, auch wenn sie mehrzeilig oder 'kaputt' formatiert sind.
Erkennt:
> [!edge] type
> [[Link]]
Returns: Set von Strings "kind:target"
Findet Kanten im Text (Wikilinks, Inlines, Callouts).
Fix V3: Support für mehrzeilige Callouts.
"""
found_edges = set()
# A. Inline [[rel:type|target]] (Standard)
# A. Inline [[rel:type|target]]
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
for kind, target in inlines:
k = kind.strip()
k = kind.strip().lower()
t = target.strip()
if k and t: found_edges.add(f"{k}:{t}")
# B. Multiline Callouts Parsing (Der Fix für dein Problem)
# B. Multiline Callouts Parsing (WP-15 Fix)
lines = text.split('\n')
current_edge_type = None
for line in lines:
stripped = line.strip()
# 1. Start Blockquote: > [!edge] type
# (Erlaubt optionalen Doppelpunkt)
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
if callout_match:
current_edge_type = callout_match.group(1).strip()
# Check: Sind Links noch in der GLEICHEN Zeile?
current_edge_type = callout_match.group(1).strip().lower()
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
for l in links:
if "rel:" not in l:
found_edges.add(f"{current_edge_type}:{l}")
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
continue
# 2. Continuation Line: > [[Target]]
# Wenn wir noch im 'edge mode' sind und die Zeile ein Zitat ist
if current_edge_type and stripped.startswith('>'):
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
for l in links:
if "rel:" not in l:
found_edges.add(f"{current_edge_type}:{l}")
# 3. End of Blockquote (kein '>') -> Reset Type
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
elif not stripped.startswith('>'):
current_edge_type = None
return found_edges
def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]:
    """
    WP-15b edge inheritance: edges declared on a heading block are inherited
    by every chunk whose section_path matches that heading's section.

    Inherited edges are appended to each chunk's candidate_pool with
    provenance "inherited" (they are no longer injected into the chunk text).
    Chunks at the root path "/" are skipped — root-level edges are too global.
    """
    # 1. Collect "kind:target" strings per section path from heading blocks.
    section_inheritance: Dict[str, Set[str]] = {}
    for b in blocks:
        if b.kind == "heading":
            edges = _parse_edges_robust(b.text)
            if edges:
                section_inheritance.setdefault(b.section_path, set()).update(edges)

    # 2. Inject inherited edges into the candidate pool of matching chunks.
    for ch in chunks:
        if not ch.section_path or ch.section_path == "/":
            continue
        for e_str in section_inheritance.get(ch.section_path, set()):
            kind, target = e_str.split(':', 1)
            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"})
    return chunks
# ==========================================
# 5. ORCHESTRATION (ASYNC)
# 5. ORCHESTRATION (WP-15b)
# ==========================================
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
"""
Hauptfunktion. Verbindet Parsing, Splitting und Edge-Allocation.
Hauptfunktion zur Chunk-Generierung.
Baut den Candidate-Pool für die semantische Validierung auf.
"""
# 1. Config laden (WP-15 Kompatibilität)
if config is None:
config = get_chunk_config(note_type)
fm, body_text = extract_frontmatter_from_text(md_text)
note_status = fm.get("status", "").lower()
primary_strategy = config.get("strategy", "sliding_window")
enable_smart_edges = config.get("enable_smart_edge_allocation", False)
# Drafts skippen LLM um Kosten/Zeit zu sparen
if enable_smart_edges and note_status in ["draft", "initial_gen"]:
logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
enable_smart_edges = False
# 2. Parsing & Splitting
# 1. Parsing & Splitting
blocks, doc_title = parse_blocks(md_text)
if primary_strategy == "by_heading":
@ -381,94 +403,45 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
else:
chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)
if not chunks:
return []
if not chunks: return []
# 3. NEU: Propagation VOR Smart Edge Allocation
# Das repariert die fehlenden Kanten aus deinen Callouts.
chunks = _propagate_section_edges(chunks)
# 2. WP-15b: Candidate Pool Vorbereitung
# A. Edge Inheritance (Sektions-Propagation)
chunks = _propagate_section_edges(chunks, blocks)
# B. Explicit Edges (Direkt im Chunk-Text enthalten)
for ch in chunks:
explicit = _parse_edges_robust(ch.text)
for e_str in explicit:
kind, target = e_str.split(':', 1)
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})
# 4. Smart Edges (LLM)
if enable_smart_edges:
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)
# C. Global "Unassigned Pool" Detection (Safety Net)
# Sucht nach einer Sektion "Unzugeordnete Kanten" im Body
unassigned_pool = set()
pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
if pool_match:
unassigned_pool = _parse_edges_robust(pool_match.group(1))
for ch in chunks:
for e_str in unassigned_pool:
kind, target = e_str.split(':', 1)
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
# 5. Linking
# D. De-Duplikation des Pools
for ch in chunks:
seen = set()
unique_pool = []
for cand in ch.candidate_pool:
key = (cand["kind"], cand["to"])
if key not in seen:
seen.add(key)
unique_pool.append(cand)
ch.candidate_pool = unique_pool
# 3. Nachbarschafts-Verkettung (Struktur-Kanten)
for i, ch in enumerate(chunks):
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
return chunks
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
    """
    Collect ALL edge candidates ("kind:target" strings) from a full note text
    for the LLM candidate pool.

    Combines two sources: the derive_edges pipeline (run over the whole note
    wrapped as a single pseudo-chunk) and the robust parser, which also
    catches multi-line [!edge] callouts. Structural edges
    (belongs_to/next/prev/backlink) are excluded from the candidates.
    """
    # A. Via derive_edges (standard pipeline): wrap the full note as one chunk.
    dummy_chunk = {
        "chunk_id": f"{note_id}#full",
        "text": md_text,
        "content": md_text,
        "window": md_text,
        "type": note_type
    }
    # Keyword arguments match the WP-15 signature fix of build_edges_for_note.
    raw_edges = build_edges_for_note(
        note_id,
        [dummy_chunk],
        note_level_references=None,
        include_note_scope_refs=False
    )
    all_candidates = set()
    for e in raw_edges:
        kind = e.get("kind")
        target = e.get("target_id")
        # Skip purely structural edges — only content edges become candidates.
        if target and kind not in ["belongs_to", "next", "prev", "backlink"]:
            all_candidates.add(f"{kind}:{target}")
    # B. Via the robust parser — also catches the multi-line callouts.
    robust_edges = _parse_edges_robust(md_text)
    all_candidates.update(robust_edges)
    return list(all_candidates)
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
    """
    WP-15 LLM step: ask the semantic analyzer, per chunk, which of the note's
    edge candidates are relevant to that chunk.

    Confirmed edges are stored in ``suggested_edges`` AND written into the
    chunk text/window as [[rel:kind|target]] tokens so derive_edges finds
    them later. Candidates the LLM assigned to no chunk are broadcast to
    every chunk as a safety fallback so no edge is lost.
    """
    analyzer = get_semantic_analyzer()
    candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)
    if not candidate_list:
        return chunks
    # One analyzer call per chunk, run concurrently.
    tasks = []
    for chunk in chunks:
        tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
    results_per_chunk = await asyncio.gather(*tasks)
    assigned_edges_global = set()
    for i, confirmed_edges in enumerate(results_per_chunk):
        chunk = chunks[i]
        chunk.suggested_edges = confirmed_edges
        assigned_edges_global.update(confirmed_edges)
        if confirmed_edges:
            # Persist confirmed edges as hard [[rel:...]] tokens in the text
            # (and in the window, for embedding context).
            injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
            chunk.text += injection_str
            chunk.window += injection_str
    # Safety fallback: edges the LLM assigned nowhere are added to ALL chunks
    # so nothing gets lost.
    unassigned = set(candidate_list) - assigned_edges_global
    if unassigned:
        fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
        for chunk in chunks:
            chunk.text += fallback_str
            chunk.window += fallback_str
            if chunk.suggested_edges is None: chunk.suggested_edges = []
            chunk.suggested_edges.extend(list(unassigned))
    return chunks

View File

@ -1,17 +1,20 @@
"""
FILE: app/core/derive_edges.py
DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts.
VERSION: 2.0.0
WP-15b: Integration des Candidate-Pools und Provenance-Priorisierung.
Sichert die Graph-Integrität durch confidence-basiertes De-Duplicating.
VERSION: 2.1.0
STATUS: Active
DEPENDENCIES: re, os, yaml, typing
DEPENDENCIES: re, os, yaml, typing, hashlib
EXTERNAL_CONFIG: config/types.yaml
LAST_ANALYSIS: 2025-12-15
LAST_ANALYSIS: 2025-12-26
"""
from __future__ import annotations
import os
import re
import hashlib
from typing import Iterable, List, Optional, Tuple, Set, Dict
try:
@ -20,17 +23,18 @@ except Exception: # pragma: no cover
yaml = None
# --------------------------------------------------------------------------- #
# Utilities
# 1. Utilities & ID Generation
# --------------------------------------------------------------------------- #
def _get(d: dict, *keys, default=None):
    """Return d[k] for the first key in *keys* present with a non-None value.

    Note: the keys are ALTERNATIVES tried on the same (top-level) dict — this
    does NOT descend into nested dicts. Falls back to *default*.
    """
    for k in keys:
        if isinstance(d, dict) and k in d and d[k] is not None:
            return d[k]
    return default
def _chunk_text_for_refs(chunk: dict) -> str:
# bevorzugt 'window' → dann 'text' → 'content' → 'raw'
"""Extrahiert den relevanten Text für die Referenzsuche (bevorzugt Window)."""
return (
_get(chunk, "window")
or _get(chunk, "text")
@ -40,6 +44,7 @@ def _chunk_text_for_refs(chunk: dict) -> str:
)
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
"""Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge."""
seen: Set[str] = set()
out: List[str] = []
for s in seq:
@ -49,9 +54,10 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]:
return out
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
"""Konstruiert ein valides Kanten-Payload-Objekt für Qdrant."""
pl = {
"kind": kind,
"relation": kind, # Alias (v2)
"relation": kind, # Alias für Abwärtskompatibilität (v2)
"scope": scope, # "chunk" | "note"
"source_id": source_id,
"target_id": target_id,
@ -62,25 +68,38 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
return pl
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
"""Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s."""
base = f"{kind}:{s}->{t}#{scope}"
if rule_id:
base += f"|{rule_id}"
try:
import hashlib
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
except Exception: # pragma: no cover
return base
# --------------------------------------------------------------------------- #
# Typen-Registry (types.yaml)
# 2. Konfiguration & Provenance-Skala
# --------------------------------------------------------------------------- #
# WP-15b: priority ranking used for de-duplication — when the same edge is
# found via several routes, the highest-ranked provenance wins. The values
# double as the default confidence score per rule id.
PROVENANCE_PRIORITY = {
    "explicit:wikilink": 1.00,
    "inline:rel": 0.95,
    "callout:edge": 0.90,
    "semantic_ai": 0.90,  # AI-validated edges
    "structure:belongs_to": 1.00,
    "structure:order": 0.95,  # next/prev
    "explicit:note_scope": 1.00,
    "derived:backlink": 0.90,
    "edge_defaults": 0.70  # heuristic defaults from types.yaml
}
def _env(n: str, default: Optional[str] = None) -> str:
v = os.getenv(n)
return v if v is not None else (default or "")
def _load_types_registry() -> dict:
"""Lädt die YAML-Registry aus MINDNET_TYPES_FILE oder ./config/types.yaml"""
"""Lädt die YAML-Registry zur Ermittlung von Standard-Kanten."""
p = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
if not os.path.isfile(p) or yaml is None:
return {}
@ -97,13 +116,7 @@ def _get_types_map(reg: dict) -> dict:
return reg if isinstance(reg, dict) else {}
def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
"""
Liefert die edge_defaults-Liste für den gegebenen Notiztyp.
Fallback-Reihenfolge:
1) reg['types'][note_type]['edge_defaults']
2) reg['defaults']['edge_defaults'] (oder 'default'/'global')
3) []
"""
"""Liefert die edge_defaults-Liste für den gegebenen Notiztyp."""
types_map = _get_types_map(reg)
if note_type and isinstance(types_map, dict):
t = types_map.get(note_type)
@ -116,29 +129,19 @@ def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
return []
# --------------------------------------------------------------------------- #
# Parser für Links / Relationen
# 3. Parser für Links / Relationen (Core Logik v2.0.0)
# --------------------------------------------------------------------------- #
# Plain wikilinks (fallback). Captures the text AFTER an optional "...|"
# prefix — NOTE(review): for Obsidian-style [[Target|Alias]] links this
# captures the alias, not the target; confirm the intended convention.
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")

# Typed inline relations:
#   [[rel:KIND | Target]]
#   [[rel:KIND Target]]
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
# Pure-text pattern: rel: KIND [[Target]]
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
"""
Gibt Liste (kind, target) zurück und den Text mit entfernten getypten Relation-Links,
damit die generische Wikilink-Erkennung sie nicht doppelt zählt.
Unterstützt drei Varianten:
- [[rel:KIND | Target]]
- [[rel:KIND Target]]
- rel: KIND [[Target]]
"""
"""Extrahiert [[rel:KIND|Target]] und entfernt sie zur Vermeidung von Dubletten."""
pairs: List[Tuple[str,str]] = []
def _collect(m):
k = (m.group("kind") or "").strip().lower()
@ -152,17 +155,13 @@ def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
text = _REL_TEXT.sub(_collect, text)
return pairs, text
# Obsidian Callout Parser
# Obsidian Callout Parser für mehrzeilige Blöcke
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
"""
Findet [!edge]-Callouts und extrahiert (kind, target). Entfernt den gesamten
Callout-Block aus dem Text (damit Wikilinks daraus nicht zusätzlich als
"references" gezählt werden).
"""
"""Verarbeitet [!edge]-Callouts und entfernt diese aus dem Textfluss."""
if not text:
return [], text
@ -205,21 +204,20 @@ def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
t = raw.strip()
if t:
out_pairs.append((kind, t))
# Callout wird NICHT in keep_lines übernommen
continue
remainder = "\n".join(keep_lines)
return out_pairs, remainder
def _extract_wikilinks(text: str) -> List[str]:
    """Extract plain wikilink captures from the remaining (callout-free) text.

    Returns the stripped capture of every _WIKILINK_RE match, in order.
    """
    # Comprehension instead of the manual append loop (same behaviour).
    return [m.group(1).strip() for m in _WIKILINK_RE.finditer(text or "")]
# --------------------------------------------------------------------------- #
# Hauptfunktion
# 4. Hauptfunktion (build_edges_for_note)
# --------------------------------------------------------------------------- #
def build_edges_for_note(
@ -229,24 +227,13 @@ def build_edges_for_note(
include_note_scope_refs: bool = False,
) -> List[dict]:
"""
Erzeugt Kanten für eine Note.
- belongs_to: für jeden Chunk (chunk -> note)
- next / prev: zwischen aufeinanderfolgenden Chunks
- references: pro Chunk aus window/text (via Wikilinks)
- typed inline relations: [[rel:KIND | Target]] / [[rel:KIND Target]] / rel: KIND [[Target]]
- Obsidian Callouts: > [!edge] KIND: [[Target]] [[Target2]]
- optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references
- typenbasierte Default-Kanten (edge_defaults) je gefundener Referenz
Erzeugt und aggregiert alle Kanten für eine Note inklusive WP-15b Candidate-Processing.
Setzt Provenance-Ranking zur Graph-Stabilisierung ein.
"""
edges: List[dict] = []
note_type = _get(chunks[0], "type") if chunks else "concept"
# Note-Typ (aus erstem Chunk erwartet)
note_type = None
if chunks:
note_type = _get(chunks[0], "type")
# 1) belongs_to
# 1) Struktur-Kanten: belongs_to (Chunk -> Note)
for ch in chunks:
cid = _get(ch, "chunk_id", "id")
if not cid:
@ -254,12 +241,12 @@ def build_edges_for_note(
edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"),
"provenance": "rule",
"provenance": "structure",
"rule_id": "structure:belongs_to",
"confidence": 1.0,
"confidence": PROVENANCE_PRIORITY["structure:belongs_to"],
}))
# 2) next / prev
# 2) Struktur-Kanten: next / prev (Sequenz)
for i in range(len(chunks) - 1):
a, b = chunks[i], chunks[i + 1]
a_id = _get(a, "chunk_id", "id")
@ -269,19 +256,19 @@ def build_edges_for_note(
edges.append(_edge("next", "chunk", a_id, b_id, note_id, {
"chunk_id": a_id,
"edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"),
"provenance": "rule",
"provenance": "structure",
"rule_id": "structure:order",
"confidence": 0.95,
"confidence": PROVENANCE_PRIORITY["structure:order"],
}))
edges.append(_edge("prev", "chunk", b_id, a_id, note_id, {
"chunk_id": b_id,
"edge_id": _mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"),
"provenance": "rule",
"provenance": "structure",
"rule_id": "structure:order",
"confidence": 0.95,
"confidence": PROVENANCE_PRIORITY["structure:order"],
}))
# 3) references + typed inline + callouts + defaults (chunk-scope)
# 3) Inhaltliche Kanten (Refs, Inlines, Callouts, Candidates)
reg = _load_types_registry()
defaults = _edge_defaults_for(note_type, reg)
refs_all: List[str] = []
@ -292,51 +279,49 @@ def build_edges_for_note(
continue
raw = _chunk_text_for_refs(ch)
# 3a) typed inline relations
# 3a) Typed Inline Relations
typed, remainder = _extract_typed_relations(raw)
for kind, target in typed:
kind = kind.strip().lower()
if not kind or not target:
continue
edges.append(_edge(kind, "chunk", cid, target, note_id, {
k = kind.strip().lower()
if not k or not target: continue
edges.append(_edge(k, "chunk", cid, target, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(kind, cid, target, "chunk", "inline:rel"),
"edge_id": _mk_edge_id(k, cid, target, "chunk", "inline:rel"),
"provenance": "explicit",
"rule_id": "inline:rel",
"confidence": 0.95,
"confidence": PROVENANCE_PRIORITY["inline:rel"],
}))
if kind in {"related_to", "similar_to"}:
edges.append(_edge(kind, "chunk", target, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(kind, target, cid, "chunk", "inline:rel"),
"provenance": "explicit",
"rule_id": "inline:rel",
"confidence": 0.95,
}))
# 3b) callouts
# 3b) WP-15b Candidate Pool Integration (KI-validierte Kanten)
# Verarbeitet Kanten, die bereits in der Ingestion semantisch geprüft wurden.
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
for cand in pool:
target = cand.get("to")
kind = cand.get("kind", "related_to")
prov = cand.get("provenance", "semantic_ai")
if not target: continue
edges.append(_edge(kind, "chunk", cid, target, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(kind, cid, target, "chunk", f"candidate:{prov}"),
"provenance": prov,
"rule_id": f"candidate:{prov}",
"confidence": PROVENANCE_PRIORITY.get(prov, 0.90),
}))
# 3c) Obsidian Callouts
call_pairs, remainder2 = _extract_callout_relations(remainder)
for kind, target in call_pairs:
k = (kind or "").strip().lower()
if not k or not target:
continue
if not k or not target: continue
edges.append(_edge(k, "chunk", cid, target, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"),
"provenance": "explicit",
"rule_id": "callout:edge",
"confidence": 0.95,
"confidence": PROVENANCE_PRIORITY["callout:edge"],
}))
if k in {"related_to", "similar_to"}:
edges.append(_edge(k, "chunk", target, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(k, target, cid, "chunk", "callout:edge"),
"provenance": "explicit",
"rule_id": "callout:edge",
"confidence": 0.95,
}))
# 3c) generische Wikilinks → references (+ defaults je Ref)
# 3d) Standard-Wikilinks -> references (+ defaults)
refs = _extract_wikilinks(remainder2)
for r in refs:
edges.append(_edge("references", "chunk", cid, r, note_id, {
@ -345,76 +330,65 @@ def build_edges_for_note(
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"),
"provenance": "explicit",
"rule_id": "explicit:wikilink",
"confidence": 1.0,
"confidence": PROVENANCE_PRIORITY["explicit:wikilink"],
}))
# Regelbasierte Kanten aus types.yaml anhängen
for rel in defaults:
if rel == "references":
continue
if rel == "references": continue
edges.append(_edge(rel, "chunk", cid, r, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
"confidence": PROVENANCE_PRIORITY["edge_defaults"],
}))
if rel in {"related_to", "similar_to"}:
edges.append(_edge(rel, "chunk", r, cid, note_id, {
"chunk_id": cid,
"edge_id": _mk_edge_id(rel, r, cid, "chunk", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
}))
refs_all.extend(refs)
# 4) optional note-scope refs/backlinks (+ defaults)
# 4) Optionale Note-Scope Referenzen & Backlinks
if include_note_scope_refs:
refs_note = list(refs_all or [])
if note_level_references:
refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])
refs_note = _dedupe_seq(refs_note)
for r in refs_note:
edges.append(_edge("references", "note", note_id, r, note_id, {
"edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
"provenance": "explicit",
"rule_id": "explicit:note_scope",
"confidence": 1.0,
"confidence": PROVENANCE_PRIORITY["explicit:note_scope"],
}))
# Backlink-Erzeugung zur Graphen-Stärkung
edges.append(_edge("backlink", "note", r, note_id, note_id, {
"edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"),
"provenance": "rule",
"rule_id": "derived:backlink",
"confidence": 0.9,
"confidence": PROVENANCE_PRIORITY["derived:backlink"],
}))
for rel in defaults:
if rel == "references":
continue
if rel == "references": continue
edges.append(_edge(rel, "note", note_id, r, note_id, {
"edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
"confidence": PROVENANCE_PRIORITY["edge_defaults"],
}))
if rel in {"related_to", "similar_to"}:
edges.append(_edge(rel, "note", r, note_id, note_id, {
"edge_id": _mk_edge_id(rel, r, note_id, "note", f"edge_defaults:{note_type}:{rel}"),
"provenance": "rule",
"rule_id": f"edge_defaults:{note_type}:{rel}",
"confidence": 0.7,
}))
# 5) De-Dupe (source_id, target_id, relation, rule_id)
seen: Set[Tuple[str,str,str,str]] = set()
out: List[dict] = []
# 5) WP-15b: Confidence-basierte De-Duplizierung
# Wenn dieselbe Relation mehrfach existiert, gewinnt die mit der höchsten Confidence.
unique_map: Dict[Tuple[str, str, str], dict] = {}
for e in edges:
s = str(e.get("source_id") or "")
t = str(e.get("target_id") or "")
s, t = str(e.get("source_id")), str(e.get("target_id"))
rel = str(e.get("relation") or e.get("kind") or "edge")
rule = str(e.get("rule_id") or "")
key = (s, t, rel, rule)
if key in seen:
continue
seen.add(key)
out.append(e)
return out
key = (s, t, rel)
if key not in unique_map:
unique_map[key] = e
else:
# Vergleich der Vertrauenswürdigkeit (Provenance Ranking)
if e.get("confidence", 0) > unique_map[key].get("confidence", 0):
unique_map[key] = e
return list(unique_map.values())

View File

@ -3,12 +3,12 @@ FILE: app/core/ingestion.py
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free).
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen
JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust
bei umfangreichen Protokollen zu verhindern.
VERSION: 2.11.14
WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation.
FIX: Beibehaltung der Deep Fallback Logic (v2.11.14) zur JSON-Recovery.
VERSION: 2.12.0
STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker,
app.services.llm_service, app.services.edge_registry
"""
import os
import json
@ -21,9 +21,11 @@ from typing import Dict, List, Optional, Tuple, Any
# Core Module Imports
from app.core.parser import (
read_markdown,
pre_scan_markdown,
normalize_frontmatter,
validate_required_frontmatter,
extract_edges_with_context,
NoteContext
)
from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks, get_chunk_config
@ -49,7 +51,7 @@ from app.services.llm_service import LLMService
logger = logging.getLogger(__name__)
# --- Global Helpers ---
# --- Global Helpers (Full Compatibility v2.11.14) ---
def extract_json_from_response(text: str) -> Any:
"""
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama).
@ -115,6 +117,7 @@ class IngestionService:
self.llm = LLMService()
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
try:
ensure_collections(self.client, self.prefix, self.dim)
@ -122,6 +125,54 @@ class IngestionService:
except Exception as e:
logger.warning(f"DB init warning: {e}")
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
    """Run the WP-15b two-pass ingestion workflow over a batch of files.

    Pass 1 pre-scans every file and fills the local batch cache with
    lightweight ``NoteContext`` objects; Pass 2 then processes each file
    with ``apply=True`` so edge candidates can be validated against the
    cached target contexts.

    Args:
        file_paths: Markdown files to ingest.
        vault_root: Root directory of the vault (passed through to
            ``process_file``).

    Returns:
        One result dict per input file, in input order.
    """
    logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Batch Cache...")
    for scanned in (pre_scan_markdown(p) for p in file_paths):
        if scanned:
            self.batch_cache[scanned.note_id] = scanned
    logger.info(f"🚀 [Pass 2] Processing {len(file_paths)} files...")
    return [await self.process_file(p, vault_root, apply=True) for p in file_paths]
async def _validate_candidate(self, chunk_text: str, edge: Dict) -> bool:
"""
WP-15b: Validiert einen Kanten-Kandidaten semantisch gegen das Ziel.
Nutzt den Cache aus Pass 1, um dem LLM Kontext der Ziel-Note zu geben.
"""
target_id = edge.get("to")
target_ctx = self.batch_cache.get(target_id)
# Falls Zielnotiz nicht im aktuellen Batch ist: 'explicit' durchlassen (Hard-Link Integrity)
if not target_ctx:
return True
provider = self.settings.MINDNET_LLM_PROVIDER
template = self.llm.get_prompt("edge_validation", provider)
try:
prompt = template.format(
chunk_text=chunk_text[:1500],
target_title=target_ctx.title,
target_summary=target_ctx.summary,
edge_kind=edge.get("kind", "related_to")
)
response = await self.llm.generate_raw_response(prompt, priority="background")
return "YES" in response.upper()
except Exception as e:
logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}")
return True # Fallback: Im Zweifel Link behalten
def _resolve_note_type(self, requested: Optional[str]) -> str:
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
types = self.registry.get("types", {})
@ -138,109 +189,12 @@ class IngestionService:
return cfg
return get_chunk_config(note_type)
async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
    """
    AI edge extraction with deep-fallback logic.

    Forces the jump to local Ollama when the cloud response yields no
    usable edges (commonly caused by policy violations on OpenRouter).

    Args:
        text: Raw note body; only the first 6000 chars are sent to the LLM.
        note_id: Identifier of the note the edges originate from.

    Returns:
        A list of edge dicts (``to``/``kind``/``provenance``/``line``);
        empty on template errors or when no edges could be recovered.
    """
    provider = self.settings.MINDNET_LLM_PROVIDER
    model = self.settings.OPENROUTER_MODEL if provider == "openrouter" else self.settings.GEMINI_MODEL
    logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}")
    edge_registry.ensure_latest()
    valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
    template = self.llm.get_prompt("edge_extraction", provider)
    try:
        try:
            # Cap the context at 6000 characters (~1500 tokens).
            prompt = template.format(
                text=text[:6000],
                note_id=note_id,
                valid_types=valid_types_str
            )
        except KeyError as ke:
            logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt).")
            return []
        # First attempt: query the primary cloud provider.
        response_json = await self.llm.generate_raw_response(
            prompt=prompt, priority="background", force_json=True,
            provider=provider, model_override=model
        )
        # Initial parsing of the (possibly noisy) LLM output.
        raw_data = extract_json_from_response(response_json)
        # Dictionary recovery: try to extract a list from a dict response.
        candidates = []
        if isinstance(raw_data, list):
            candidates = raw_data
        elif isinstance(raw_data, dict):
            logger.info(f" [Ingestion] LLM returned dict, checking for embedded lists in {note_id}")
            for k in ["edges", "links", "results", "kanten", "matches", "edge_list"]:
                if k in raw_data and isinstance(raw_data[k], list):
                    candidates = raw_data[k]
                    break
            # If still no list was found, try key/value pairs (dict recovery):
            # string values become "key:value" candidates.
            if not candidates:
                for k, v in raw_data.items():
                    if isinstance(v, str): candidates.append(f"{k}:{v}")
                    elif isinstance(v, list): [candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)]
        # DEEP FALLBACK: if after all recovery attempts the list is empty AND we
        # were on a cloud provider, force the local Ollama retry. This triggers
        # on "data policy violations" (empty or error-shaped JSON responses).
        if not candidates and provider != "ollama" and self.settings.LLM_FALLBACK_ENABLED:
            logger.warning(
                f"🛑 [Ingestion] Cloud-Antwort für {note_id} lieferte keine verwertbaren Kanten. "
                f"Mögliche Policy Violation oder Refusal. Erzwinge LOKALEN FALLBACK via Ollama..."
            )
            response_json_local = await self.llm.generate_raw_response(
                prompt=prompt, priority="background", force_json=True, provider="ollama"
            )
            raw_data_local = extract_json_from_response(response_json_local)
            # Repeat the recovery steps for the local answer.
            if isinstance(raw_data_local, list):
                candidates = raw_data_local
            elif isinstance(raw_data_local, dict):
                for k in ["edges", "links", "results"]:
                    if k in raw_data_local and isinstance(raw_data_local[k], list):
                        candidates = raw_data_local[k]; break
        if not candidates:
            logger.warning(f"⚠️ [Ingestion] Auch nach Fallback keine extrahierbaren Kanten für {note_id}")
            return []
        processed = []
        for item in candidates:
            if isinstance(item, dict) and "to" in item:
                # Dict candidates only need provenance/source tagging.
                item["provenance"] = "semantic_ai"
                item["line"] = f"ai-{provider}"
                processed.append(item)
            elif isinstance(item, str) and ":" in item:
                # "kind:target" strings produced by the dict-recovery path.
                parts = item.split(":", 1)
                processed.append({
                    "to": parts[1].strip(),
                    "kind": parts[0].strip(),
                    "provenance": "semantic_ai",
                    "line": f"ai-{provider}"
                })
        return processed
    except Exception as e:
        logger.warning(f"⚠️ [Ingestion] Smart Edge Allocation failed for {note_id}: {e}")
        return []
async def process_file(
self, file_path: str, vault_root: str,
force_replace: bool = False, apply: bool = False, purge_before: bool = False,
note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical"
) -> Dict[str, Any]:
"""Transformiert eine Markdown-Datei in den Graphen (Notes, Chunks, Edges)."""
"""Transformiert eine Markdown-Datei in den Graphen."""
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
# 1. Parse & Lifecycle Gate
@ -252,12 +206,12 @@ class IngestionService:
except Exception as e:
return {**result, "error": f"Validation failed: {str(e)}"}
# WP-22: Filter für Systemdateien und Entwürfe
# Lifecycle Filter (WP-22)
status = fm.get("status", "draft").lower().strip()
if status in ["system", "template", "archive", "hidden"]:
return {**result, "status": "skipped", "reason": f"lifecycle_{status}"}
# 2. Config Resolution & Payload Construction
# 2. Config Resolution & Payload
note_type = self._resolve_note_type(fm.get("type"))
fm["type"] = note_type
@ -267,15 +221,13 @@ class IngestionService:
except Exception as e:
return {**result, "error": f"Payload failed: {str(e)}"}
# 3. Change Detection (Strikte DoD Umsetzung)
# 3. Change Detection (v2.11.14 Logic)
old_payload = None if force_replace else self._fetch_note_payload(note_id)
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
new_hash = note_pl.get("hashes", {}).get(check_key)
# Prüfung auf fehlende Artefakte in Qdrant
chunks_missing, edges_missing = self._artifacts_missing(note_id)
should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing
if not should_write:
@ -284,40 +236,42 @@ class IngestionService:
if not apply:
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
# 4. Processing (Chunking, Embedding, AI Edges)
# 4. Processing (Chunking, Embedding, Validated Edges)
try:
body_text = getattr(parsed, "body", "") or ""
edge_registry.ensure_latest()
# Profil-gesteuertes Chunking
# Chunker Resolution
profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
chunk_cfg = self._get_chunk_config_by_profile(profile, note_type)
chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg)
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
# Vektorisierung
# Embeddings
vecs = []
if chunk_pls:
texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]
vecs = await self.embedder.embed_documents(texts)
# Kanten-Extraktion
# Kanten-Extraktion & WP-15b Validierung
edges = []
context = {"file": file_path, "note_id": note_id}
# A. Explizite Kanten (User / Wikilinks)
for e in extract_edges_with_context(parsed):
e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")})
edges.append(e)
# A. Explizite Kandidaten (Wikilinks)
raw_candidates = extract_edges_with_context(parsed)
for cand in raw_candidates:
# Semantische Prüfung gegen Pass 1 Cache
if await self._validate_candidate(body_text, cand):
cand["kind"] = edge_registry.resolve(
edge_type=cand["kind"],
provenance="explicit",
context={**context, "line": cand.get("line")}
)
edges.append(cand)
else:
logger.info(f"🚫 WP-15b: Candidate rejected: {cand['kind']} -> {cand['to']}")
# B. KI Kanten (Turbo Mode mit v2.11.14 Fallback)
ai_edges = await self._perform_smart_edge_allocation(body_text, note_id)
for e in ai_edges:
valid_kind = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
e["kind"] = valid_kind
edges.append(e)
# C. System Kanten (Struktur)
# B. System Kanten (Struktur)
try:
sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
except:

View File

@ -2,10 +2,11 @@
FILE: app/core/parser.py
DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
VERSION: 1.8.0
WP-15b: Implementierung NoteContext und pre_scan_markdown für Pass 1 Ingestion.
VERSION: 1.9.0
STATUS: Active
DEPENDENCIES: yaml, re, dataclasses, json, io, os
LAST_ANALYSIS: 2025-12-23
LAST_ANALYSIS: 2025-12-26
"""
from __future__ import annotations
@ -32,6 +33,15 @@ class ParsedNote:
body: str
path: str
@dataclass
class NoteContext:
    """Metadata container for the transient LocalBatchCache (Pass 1).

    Holds just enough identity and short-form context about a note so that
    Pass 2 can semantically validate edge candidates without re-parsing the
    target file.
    """
    note_id: str      # Frontmatter id, or the file stem as fallback
    title: str        # Note title (falls back to note_id when absent)
    type: str         # Note type, e.g. "concept"
    summary: str      # Cleaned body excerpt, max ~500 chars
    tags: List[str]   # Frontmatter tags; empty list when absent or invalid
# ---------------------------------------------------------------------
# Frontmatter-Erkennung
@ -152,6 +162,32 @@ def read_markdown(path: str) -> Optional[ParsedNote]:
return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
def pre_scan_markdown(path: str) -> Optional[NoteContext]:
    """
    WP-15b: fast scan for the LocalBatchCache (Pass 1).

    Extracts only identity and a short context excerpt, which Pass 2 uses
    for the semantic validation of edge candidates.

    Args:
        path: Filesystem path to a Markdown note.

    Returns:
        A NoteContext, or None when the file cannot be read.
    """
    parsed = read_markdown(path)
    if not parsed:
        return None
    fm = parsed.frontmatter
    # ID resolution: frontmatter id, or the file stem as fallback.
    note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
    # Build a short summary (first ~500 chars of the body, markup stripped).
    clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip()
    summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body
    # Frontmatter keys may exist with a null/empty value (e.g. "title:" with
    # nothing after it); using `or` fallbacks here prevents the literal
    # string "None" from leaking into the cache, which `fm.get(key, default)`
    # alone would not catch.
    tags = fm.get("tags")
    return NoteContext(
        note_id=note_id,
        title=str(fm.get("title") or note_id),
        type=str(fm.get("type") or "concept"),
        summary=summary,
        tags=tags if isinstance(tags, list) else []
    )
def validate_required_frontmatter(fm: Dict[str, Any],
required: Tuple[str, ...] = ("id", "title")) -> None:
"""

View File

@ -1,11 +1,14 @@
"""
FILE: app/services/edge_registry.py
DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload.
WP-15b: Erweiterte Provenance-Prüfung für die Candidate-Validation.
Sichert die Graph-Integrität durch strikte Trennung von System- und Inhaltskanten.
WP-22: Fix für absolute Pfade außerhalb des Vaults (Prod-Dictionary).
WP-20: Synchronisation mit zentralen Settings (v0.6.2).
VERSION: 0.7.5
VERSION: 0.8.0
STATUS: Active
DEPENDENCIES: re, os, json, logging, time, app.config
LAST_ANALYSIS: 2025-12-26
"""
import re
import os
@ -19,7 +22,12 @@ from app.config import get_settings
logger = logging.getLogger(__name__)
class EdgeRegistry:
"""
Zentraler Verwalter für das Kanten-Vokabular.
Implementiert das Singleton-Pattern für konsistente Validierung über alle Services.
"""
_instance = None
# System-Kanten, die nicht durch User oder KI gesetzt werden dürfen
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
def __new__(cls, *args, **kwargs):
@ -51,7 +59,7 @@ class EdgeRegistry:
def ensure_latest(self):
"""
Prüft den Zeitstempel der Vokabular-Datei und lädt bei Bedarf neu.
Verhindert den AttributeError in der Ingestion-Pipeline.
Verhindert Inkonsistenzen bei Laufzeit-Updates des Dictionaries.
"""
if not os.path.exists(self.full_vocab_path):
logger.error(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!")
@ -66,7 +74,10 @@ class EdgeRegistry:
logger.error(f"!!! [EDGE-REGISTRY] Error checking file time: {e}")
def _load_vocabulary(self):
"""Parst das Markdown-Wörterbuch und baut die Canonical-Map auf."""
"""
Parst das Markdown-Wörterbuch und baut die Canonical-Map auf.
Erkennt Tabellen-Strukturen und extrahiert fettgedruckte System-Typen.
"""
self.canonical_map.clear()
self.valid_types.clear()
@ -101,8 +112,8 @@ class EdgeRegistry:
def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str:
"""
Validiert einen Kanten-Typ gegen das Vokabular.
Loggt unbekannte Typen für die spätere manuelle Pflege.
WP-15b: Validiert einen Kanten-Typ gegen das Vokabular und prüft Berechtigungen.
Sichert, dass nur strukturelle Prozesse System-Kanten setzen dürfen.
"""
self.ensure_latest()
if not edge_type:
@ -112,20 +123,23 @@ class EdgeRegistry:
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
ctx = context or {}
# System-Kanten dürfen nicht manuell vergeben werden
if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
self._log_issue(clean_type, "forbidden_system_usage", ctx)
# WP-15b: System-Kanten dürfen weder manuell noch durch KI/Vererbung gesetzt werden.
# Nur Provenienz 'structure' (interne Prozesse) ist autorisiert.
# Wir blockieren hier alle Provenienzen außer 'structure'.
restricted_provenance = ["explicit", "semantic_ai", "inherited", "global_pool", "rule"]
if provenance in restricted_provenance and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
self._log_issue(clean_type, f"forbidden_usage_by_{provenance}", ctx)
return "related_to"
# System-Kanten sind nur bei struktureller Provenienz erlaubt
# System-Kanten sind NUR bei struktureller Provenienz erlaubt
if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
return clean_type
# Mapping auf kanonischen Namen
# Mapping auf kanonischen Namen (Alias-Auflösung)
if clean_type in self.canonical_map:
return self.canonical_map[clean_type]
# Fallback und Logging
# Fallback und Logging unbekannter Typen für Admin-Review
self._log_issue(clean_type, "unknown_type", ctx)
return clean_type
@ -139,12 +153,13 @@ class EdgeRegistry:
"error": error_kind,
"file": ctx.get("file", "unknown"),
"line": ctx.get("line", "unknown"),
"note_id": ctx.get("note_id", "unknown")
"note_id": ctx.get("note_id", "unknown"),
"provenance": ctx.get("provenance", "unknown")
}
with open(self.unknown_log_path, "a", encoding="utf-8") as f:
f.write(json.dumps(entry) + "\n")
except Exception:
pass
# Singleton Export
# Singleton Export für systemweiten Zugriff
registry = EdgeRegistry()

View File

@ -1,6 +1,7 @@
# config/prompts.yaml — Final V2.5.5 (OpenRouter Hardening)
# config/prompts.yaml — Final V2.6.0 (WP-15b Candidate-Validation)
# WP-20: Optimierte Cloud-Templates zur Unterdrückung von Modell-Geschwätz.
# FIX: Explizite Verbote für Einleitungstexte zur Vermeidung von JSON-Parsing-Fehlern.
# WP-15b: Integration der binären edge_validation für den Two-Pass Workflow.
# OLLAMA: UNVERÄNDERT laut Benutzeranweisung.
system_prompt: |
@ -215,7 +216,7 @@ edge_extraction:
4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten.
BEISPIEL:
[[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]]
[[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]]
TEXT:
"""
@ -227,13 +228,46 @@ edge_extraction:
Analysiere '{note_id}'. Extrahiere semantische Beziehungen.
ERLAUBTE TYPEN: {valid_types}
TEXT: {text}
OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: [].
OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: [].
openrouter: |
TASK: Extrahiere semantische Relationen für '{note_id}'.
ERLAUBTE TYPEN: {valid_types}
TEXT: {text}
ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten.
FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]]
FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]]
STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen.
Wenn keine Relationen existieren, antworte NUR mit: []
OUTPUT:
OUTPUT:
# ---------------------------------------------------------
# 8. WP-15b: EDGE VALIDATION (Intent: VALIDATE)
# ---------------------------------------------------------
edge_validation:
gemini: |
Bewerte die semantische Validität dieser Verbindung im Wissensgraph.
KONTEXT DER QUELLE (Chunk):
"{chunk_text}"
ZIEL-NOTIZ: "{target_title}"
ZIEL-BESCHREIBUNG (Zusammenfassung):
"{target_summary}"
GEPLANTE RELATION: "{edge_kind}"
FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel?
REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk.
openrouter: |
Verify semantic relation for graph construction.
Source Context: {chunk_text}
Target Note: {target_title}
Target Summary: {target_summary}
Proposed Relation: {edge_kind}
Instruction: Does the source context support this relation to the target?
Result: Respond ONLY with 'YES' or 'NO'.
ollama: |
Bewerte die semantische Korrektheit dieser Verbindung.
QUELLE: {chunk_text}
ZIEL: {target_title} ({target_summary})
BEZIEHUNG: {edge_kind}
Ist diese Verbindung valide? Antworte NUR mit YES oder NO.