From f6b2375d659332bb1c404ede5aa0f679c5c8a457 Mon Sep 17 00:00:00 2001 From: Lars Date: Fri, 26 Dec 2025 21:52:08 +0100 Subject: [PATCH 01/23] WP15b - Initial --- app/core/chunker.py | 341 ++++++++++++++++------------------ app/core/derive_edges.py | 226 ++++++++++------------ app/core/ingestion.py | 204 ++++++++------------ app/core/parser.py | 40 +++- app/services/edge_registry.py | 41 ++-- config/prompts.yaml | 44 ++++- 6 files changed, 441 insertions(+), 455 deletions(-) diff --git a/app/core/chunker.py b/app/core/chunker.py index 07b5f47..c77a43c 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -1,13 +1,16 @@ """ FILE: app/core/chunker.py DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). - Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer. - FIX V3: Support für mehrzeilige Callouts und Section-Propagation. -VERSION: 3.1.0 (Full Compatibility Merge) + WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung. + Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung. + Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net). +VERSION: 3.2.0 +STATUS: Active +DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging """ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Dict, Optional, Tuple, Any, Set import re import math @@ -17,15 +20,18 @@ import asyncio import logging # Services -from app.services.semantic_analyzer import get_semantic_analyzer +# In WP-15b wird die KI-Validierung in die ingestion.py verlagert. +# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen. +try: + from app.services.semantic_analyzer import get_semantic_analyzer +except ImportError: + def get_semantic_analyzer(): return None # Core Imports -# Wir importieren build_edges_for_note nur, um kompatibel zur Signatur zu bleiben -# oder für den Fallback. 
try: from app.core.derive_edges import build_edges_for_note except ImportError: - # Mock für Tests + # Fallback für Standalone-Betrieb oder Tests def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return [] logger = logging.getLogger(__name__) @@ -54,7 +60,7 @@ def _load_yaml_config() -> Dict[str, Any]: def get_chunk_config(note_type: str) -> Dict[str, Any]: """ Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml. - Dies sichert die Kompatibilität zu WP-15 (Profile). + Sichert die Kompatibilität zu WP-15 Profilen. """ full_config = _load_yaml_config() profiles = full_config.get("chunking_profiles", {}) @@ -75,6 +81,7 @@ def get_chunk_config(note_type: str) -> Dict[str, Any]: return config def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: + """Trennt YAML-Frontmatter vom eigentlichen Text.""" fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL) if not fm_match: return {}, md_text try: @@ -89,12 +96,15 @@ def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: # 2. 
DATA CLASSES & TEXT TOOLS # ========================================== -_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+') +_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])') +_WS = re.compile(r'\s+') def estimate_tokens(text: str) -> int: + """Grobe Schätzung der Token-Anzahl (4 Zeichen pro Token).""" return max(1, math.ceil(len(text.strip()) / 4)) def split_sentences(text: str) -> list[str]: + """Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion.""" text = _WS.sub(' ', text.strip()) if not text: return [] parts = _SENT_SPLIT.split(text) @@ -102,13 +112,26 @@ def split_sentences(text: str) -> list[str]: @dataclass class RawBlock: - kind: str; text: str; level: Optional[int]; section_path: str; section_title: Optional[str] + kind: str + text: str + level: Optional[int] + section_path: str + section_title: Optional[str] @dataclass class Chunk: - id: str; note_id: str; index: int; text: str; window: str; token_count: int - section_title: Optional[str]; section_path: str - neighbors_prev: Optional[str]; neighbors_next: Optional[str] + id: str + note_id: str + index: int + text: str + window: str + token_count: int + section_title: Optional[str] + section_path: str + neighbors_prev: Optional[str] + neighbors_next: Optional[str] + # WP-15b: Liste von Kandidaten für die semantische Validierung + candidate_pool: List[Dict[str, Any]] = field(default_factory=list) suggested_edges: Optional[List[str]] = None # ========================================== @@ -118,7 +141,7 @@ class Chunk: def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: """ Zerlegt Text in logische Blöcke (Absätze, Header). - Wichtig für die Strategie 'by_heading'. + Wichtig für die Strategie 'by_heading' und die Edge-Inheritance. 
""" blocks = [] h1_title = "Dokument" @@ -165,14 +188,15 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]: """ - Die Standard-Strategie aus WP-15. - Fasst Blöcke zusammen und schneidet bei 'target' Tokens (mit Satz-Rücksicht). + Standard-Strategie aus WP-15. + Fasst Blöcke zusammen und schneidet bei 'target' Tokens. """ target = config.get("target", 400) max_tokens = config.get("max", 600) overlap_val = config.get("overlap", (50, 80)) overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val - chunks = []; buf = [] + chunks = [] + buf = [] def _create_chunk(txt, win, sec, path): idx = len(chunks) @@ -180,7 +204,7 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None, - suggested_edges=[] + candidate_pool=[] )) def flush_buffer(): @@ -190,14 +214,11 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not text_body = "\n\n".join([b.text for b in buf]) sec_title = buf[-1].section_title if buf else None sec_path = buf[-1].section_path if buf else "/" - - # Context Prefix (z.B. 
H1) voranstellen für Embedding-Qualität win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body if estimate_tokens(text_body) <= max_tokens: _create_chunk(text_body, win_body, sec_title, sec_path) else: - # Zu groß -> Satzweiser Split sentences = split_sentences(text_body) current_chunk_sents = [] current_len = 0 @@ -209,15 +230,13 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt _create_chunk(c_txt, c_win, sec_title, sec_path) - # Overlap für nächsten Chunk overlap_sents = [] ov_len = 0 for s in reversed(current_chunk_sents): if ov_len + estimate_tokens(s) < overlap: overlap_sents.insert(0, s) ov_len += estimate_tokens(s) - else: - break + else: break current_chunk_sents = list(overlap_sents) current_chunk_sents.append(sent) @@ -226,12 +245,10 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not current_chunk_sents.append(sent) current_len += sent_len - # Rest if current_chunk_sents: c_txt = " ".join(current_chunk_sents) c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt _create_chunk(c_txt, c_win, sec_title, sec_path) - buf = [] for b in blocks: @@ -248,132 +265,137 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: """ - Strategie für strukturierte Daten (Profile, Werte). - Nutzt sliding_window, forciert aber Schnitte an Headings (via parse_blocks Vorarbeit). + Hybrid-Strategie v2.9 (Strict/Soft/Safety-Net). 
""" - return _strategy_sliding_window(blocks, config, note_id, doc_title, context_prefix=f"# {doc_title}") + strict = config.get("strict_heading_split", False) + target = config.get("target", 400) + max_tokens = config.get("max", 600) + split_level = config.get("split_level", 2) + + chunks = [] + current_buf = [] + current_tokens = 0 + + def _flush(sec_title, sec_path): + nonlocal current_buf, current_tokens + if not current_buf: return + txt = "\n\n".join(current_buf) + win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt + idx = len(chunks) + chunks.append(Chunk( + id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, + text=txt, window=win, token_count=estimate_tokens(txt), + section_title=sec_title, section_path=sec_path, + neighbors_prev=None, neighbors_next=None, + candidate_pool=[] + )) + current_buf = [] + current_tokens = 0 + + for b in blocks: + if b.kind == "heading": + # Hierarchie-Check: Split bei Überschriften oberhalb des Split-Levels + if b.level < split_level: + _flush(b.section_title, b.section_path) + elif b.level == split_level: + if strict or current_tokens >= target: + _flush(b.section_title, b.section_path) + continue + + block_tokens = estimate_tokens(b.text) + if current_tokens + block_tokens > max_tokens and current_buf: + _flush(b.section_title, b.section_path) + + current_buf.append(b.text) + current_tokens += block_tokens + + if current_buf: + last = blocks[-1] if blocks else None + _flush(last.section_title if last else None, last.section_path if last else "/") + + return chunks # ========================================== -# 4. ROBUST EDGE PARSING & PROPAGATION (NEU) +# 4. ROBUST EDGE PARSING & PROPAGATION # ========================================== def _parse_edges_robust(text: str) -> Set[str]: """ - NEU: Findet Kanten im Text, auch wenn sie mehrzeilig oder 'kaputt' formatiert sind. 
- Erkennt: - > [!edge] type - > [[Link]] - Returns: Set von Strings "kind:target" + Findet Kanten im Text (Wikilinks, Inlines, Callouts). + Fix V3: Support für mehrzeilige Callouts. """ found_edges = set() - # A. Inline [[rel:type|target]] (Standard) + # A. Inline [[rel:type|target]] inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text) for kind, target in inlines: - k = kind.strip() + k = kind.strip().lower() t = target.strip() if k and t: found_edges.add(f"{k}:{t}") - # B. Multiline Callouts Parsing (Der Fix für dein Problem) + # B. Multiline Callouts Parsing (WP-15 Fix) lines = text.split('\n') current_edge_type = None - for line in lines: stripped = line.strip() - - # 1. Start Blockquote: > [!edge] type - # (Erlaubt optionalen Doppelpunkt) callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped) if callout_match: - current_edge_type = callout_match.group(1).strip() - - # Check: Sind Links noch in der GLEICHEN Zeile? + current_edge_type = callout_match.group(1).strip().lower() links = re.findall(r'\[\[([^\]]+)\]\]', stripped) for l in links: - if "rel:" not in l: - found_edges.add(f"{current_edge_type}:{l}") + if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") continue - # 2. Continuation Line: > [[Target]] - # Wenn wir noch im 'edge mode' sind und die Zeile ein Zitat ist if current_edge_type and stripped.startswith('>'): links = re.findall(r'\[\[([^\]]+)\]\]', stripped) for l in links: - if "rel:" not in l: - found_edges.add(f"{current_edge_type}:{l}") - - # 3. End of Blockquote (kein '>') -> Reset Type + if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") elif not stripped.startswith('>'): current_edge_type = None return found_edges -def _propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: +def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]: """ - NEU: Verteilt Kanten innerhalb einer Sektion. 
- Löst das Problem: Callout steht oben im Kapitel, gilt aber für alle Chunks darunter. + WP-15b: Implementiert Edge-Inheritance. + Kanten aus Überschriften werden an untergeordnete Chunks vererbt. """ - # Step 1: Sammeln pro Sektion - section_map = {} # path -> set(kind:target) + section_inheritance: Dict[str, Set[str]] = {} + # 1. Sammeln aus den Heading-Blöcken + for b in blocks: + if b.kind == "heading": + edges = _parse_edges_robust(b.text) + if edges: + if b.section_path not in section_inheritance: + section_inheritance[b.section_path] = set() + section_inheritance[b.section_path].update(edges) + + # 2. Injektion in den Candidate-Pool for ch in chunks: - # Root-Level "/" ignorieren wir meist, da zu global - if not ch.section_path or ch.section_path == "/": continue - - edges = _parse_edges_robust(ch.text) - if edges: - if ch.section_path not in section_map: - section_map[ch.section_path] = set() - section_map[ch.section_path].update(edges) - - # Step 2: Injizieren (Broadcasting) - for ch in chunks: - if ch.section_path in section_map: - edges_to_add = section_map[ch.section_path] - if not edges_to_add: continue - - injections = [] - for e_str in edges_to_add: - kind, target = e_str.split(':', 1) - # Check: Kante schon im Text? - token = f"[[rel:{kind}|{target}]]" - if token not in ch.text: - injections.append(token) - - if injections: - # Wir schreiben die Kanten "hart" in den Text. - # Damit findet sie derive_edges.py später garantiert. - block = "\n\n\n" + " ".join(injections) - ch.text += block - # Auch ins Window schreiben für Embedding-Kontext - ch.window += block + inherited = section_inheritance.get(ch.section_path, set()) + for e_str in inherited: + kind, target = e_str.split(':', 1) + ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"}) return chunks # ========================================== -# 5. ORCHESTRATION (ASYNC) +# 5. 
ORCHESTRATION (WP-15b) # ========================================== async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: """ - Hauptfunktion. Verbindet Parsing, Splitting und Edge-Allocation. + Hauptfunktion zur Chunk-Generierung. + Baut den Candidate-Pool für die semantische Validierung auf. """ - # 1. Config laden (WP-15 Kompatibilität) if config is None: config = get_chunk_config(note_type) fm, body_text = extract_frontmatter_from_text(md_text) - note_status = fm.get("status", "").lower() - primary_strategy = config.get("strategy", "sliding_window") - enable_smart_edges = config.get("enable_smart_edge_allocation", False) - # Drafts skippen LLM um Kosten/Zeit zu sparen - if enable_smart_edges and note_status in ["draft", "initial_gen"]: - logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.") - enable_smart_edges = False - - # 2. Parsing & Splitting + # 1. Parsing & Splitting blocks, doc_title = parse_blocks(md_text) if primary_strategy == "by_heading": @@ -381,94 +403,45 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op else: chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title) - if not chunks: - return [] + if not chunks: return [] - # 3. NEU: Propagation VOR Smart Edge Allocation - # Das repariert die fehlenden Kanten aus deinen Callouts. - chunks = _propagate_section_edges(chunks) + # 2. WP-15b: Candidate Pool Vorbereitung + + # A. Edge Inheritance (Sektions-Propagation) + chunks = _propagate_section_edges(chunks, blocks) + + # B. Explicit Edges (Direkt im Chunk-Text enthalten) + for ch in chunks: + explicit = _parse_edges_robust(ch.text) + for e_str in explicit: + kind, target = e_str.split(':', 1) + ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"}) - # 4. 
Smart Edges (LLM) - if enable_smart_edges: - chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type) + # C. Global "Unassigned Pool" Detection (Safety Net) + # Sucht nach einer Sektion "Unzugeordnete Kanten" im Body + unassigned_pool = set() + pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) + if pool_match: + unassigned_pool = _parse_edges_robust(pool_match.group(1)) + for ch in chunks: + for e_str in unassigned_pool: + kind, target = e_str.split(':', 1) + ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"}) - # 5. Linking + # D. De-Duplikation des Pools + for ch in chunks: + seen = set() + unique_pool = [] + for cand in ch.candidate_pool: + key = (cand["kind"], cand["to"]) + if key not in seen: + seen.add(key) + unique_pool.append(cand) + ch.candidate_pool = unique_pool + + # 3. Nachbarschafts-Verkettung (Struktur-Kanten) for i, ch in enumerate(chunks): ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None - return chunks - -def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]: - """ - Hilfsfunktion: Sammelt ALLE Kanten für den LLM-Kandidaten-Pool. - """ - # A. Via derive_edges (Standard) - dummy_chunk = { - "chunk_id": f"{note_id}#full", - "text": md_text, - "content": md_text, - "window": md_text, - "type": note_type - } - # Signatur-Anpassung beachten (WP-15 Fix) - raw_edges = build_edges_for_note( - note_id, - [dummy_chunk], - note_level_references=None, - include_note_scope_refs=False - ) - all_candidates = set() - for e in raw_edges: - kind = e.get("kind") - target = e.get("target_id") - if target and kind not in ["belongs_to", "next", "prev", "backlink"]: - all_candidates.add(f"{kind}:{target}") - - # B. 
Via Robust Parser (NEU) - fängt die multiline Callouts - robust_edges = _parse_edges_robust(md_text) - all_candidates.update(robust_edges) - - return list(all_candidates) - -async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]: - """ - Der LLM-Schritt (WP-15). Filtert irrelevante Kanten. - """ - analyzer = get_semantic_analyzer() - candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type) - - if not candidate_list: - return chunks - - tasks = [] - for chunk in chunks: - tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type)) - - results_per_chunk = await asyncio.gather(*tasks) - - assigned_edges_global = set() - - for i, confirmed_edges in enumerate(results_per_chunk): - chunk = chunks[i] - chunk.suggested_edges = confirmed_edges - assigned_edges_global.update(confirmed_edges) - - if confirmed_edges: - # Wir schreiben auch Smart Edges hart in den Text - injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e]) - chunk.text += injection_str - chunk.window += injection_str - - # Fallback für Kanten, die das LLM nirgendwo zugeordnet hat - # (Damit nichts verloren geht -> Safety Fallback) - unassigned = set(candidate_list) - assigned_edges_global - if unassigned: - fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e]) - for chunk in chunks: - chunk.text += fallback_str - chunk.window += fallback_str - if chunk.suggested_edges is None: chunk.suggested_edges = [] - chunk.suggested_edges.extend(list(unassigned)) - return chunks \ No newline at end of file diff --git a/app/core/derive_edges.py b/app/core/derive_edges.py index 96e0ad0..31204c9 100644 --- a/app/core/derive_edges.py +++ b/app/core/derive_edges.py @@ -1,17 +1,20 @@ """ FILE: app/core/derive_edges.py DESCRIPTION: Extrahiert Graph-Kanten aus Text. 
Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts. -VERSION: 2.0.0 + WP-15b: Integration des Candidate-Pools und Provenance-Priorisierung. + Sichert die Graph-Integrität durch confidence-basiertes De-Duplicating. +VERSION: 2.1.0 STATUS: Active -DEPENDENCIES: re, os, yaml, typing +DEPENDENCIES: re, os, yaml, typing, hashlib EXTERNAL_CONFIG: config/types.yaml -LAST_ANALYSIS: 2025-12-15 +LAST_ANALYSIS: 2025-12-26 """ from __future__ import annotations import os import re +import hashlib from typing import Iterable, List, Optional, Tuple, Set, Dict try: @@ -20,17 +23,18 @@ except Exception: # pragma: no cover yaml = None # --------------------------------------------------------------------------- # -# Utilities +# 1. Utilities & ID Generation # --------------------------------------------------------------------------- # def _get(d: dict, *keys, default=None): + """Sicherer Zugriff auf verschachtelte Dictionary-Keys.""" for k in keys: if isinstance(d, dict) and k in d and d[k] is not None: return d[k] return default def _chunk_text_for_refs(chunk: dict) -> str: - # bevorzugt 'window' → dann 'text' → 'content' → 'raw' + """Extrahiert den relevanten Text für die Referenzsuche (bevorzugt Window).""" return ( _get(chunk, "window") or _get(chunk, "text") @@ -40,6 +44,7 @@ def _chunk_text_for_refs(chunk: dict) -> str: ) def _dedupe_seq(seq: Iterable[str]) -> List[str]: + """Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge.""" seen: Set[str] = set() out: List[str] = [] for s in seq: @@ -49,9 +54,10 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]: return out def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: + """Konstruiert ein valides Kanten-Payload-Objekt für Qdrant.""" pl = { "kind": kind, - "relation": kind, # Alias (v2) + "relation": kind, # Alias für Abwärtskompatibilität (v2) "scope": scope, # "chunk" | "note" "source_id": source_id, 
"target_id": target_id, @@ -62,25 +68,38 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e return pl def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: + """Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.""" base = f"{kind}:{s}->{t}#{scope}" if rule_id: base += f"|{rule_id}" try: - import hashlib return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() except Exception: # pragma: no cover return base # --------------------------------------------------------------------------- # -# Typen-Registry (types.yaml) +# 2. Konfiguration & Provenance-Skala # --------------------------------------------------------------------------- # +# WP-15b: Prioritäten-Ranking für die De-Duplizierung +PROVENANCE_PRIORITY = { + "explicit:wikilink": 1.00, + "inline:rel": 0.95, + "callout:edge": 0.90, + "semantic_ai": 0.90, # Validierte KI-Kanten + "structure:belongs_to": 1.00, + "structure:order": 0.95, # next/prev + "explicit:note_scope": 1.00, + "derived:backlink": 0.90, + "edge_defaults": 0.70 # Heuristik (types.yaml) +} + def _env(n: str, default: Optional[str] = None) -> str: v = os.getenv(n) return v if v is not None else (default or "") def _load_types_registry() -> dict: - """Lädt die YAML-Registry aus MINDNET_TYPES_FILE oder ./config/types.yaml""" + """Lädt die YAML-Registry zur Ermittlung von Standard-Kanten.""" p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") if not os.path.isfile(p) or yaml is None: return {} @@ -97,13 +116,7 @@ def _get_types_map(reg: dict) -> dict: return reg if isinstance(reg, dict) else {} def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]: - """ - Liefert die edge_defaults-Liste für den gegebenen Notiztyp. 
- Fallback-Reihenfolge: - 1) reg['types'][note_type]['edge_defaults'] - 2) reg['defaults']['edge_defaults'] (oder 'default'/'global') - 3) [] - """ + """Liefert die edge_defaults-Liste für den gegebenen Notiztyp.""" types_map = _get_types_map(reg) if note_type and isinstance(types_map, dict): t = types_map.get(note_type) @@ -116,29 +129,19 @@ def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]: return [] # --------------------------------------------------------------------------- # -# Parser für Links / Relationen +# 3. Parser für Links / Relationen (Core Logik v2.0.0) # --------------------------------------------------------------------------- # # Normale Wikilinks (Fallback) _WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") -# Getypte Inline-Relationen: -# [[rel:KIND | Target]] -# [[rel:KIND Target]] +# Getypte Inline-Relationen _REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) _REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) -# rel: KIND [[Target]] (reines Textmuster) _REL_TEXT = re.compile(r"rel\s*:\s*(?P[a-z_]+)\s*\[\[\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: - """ - Gibt Liste (kind, target) zurück und den Text mit entfernten getypten Relation-Links, - damit die generische Wikilink-Erkennung sie nicht doppelt zählt. 
- Unterstützt drei Varianten: - - [[rel:KIND | Target]] - - [[rel:KIND Target]] - - rel: KIND [[Target]] - """ + """Extrahiert [[rel:KIND|Target]] und entfernt sie zur Vermeidung von Dubletten.""" pairs: List[Tuple[str,str]] = [] def _collect(m): k = (m.group("kind") or "").strip().lower() @@ -152,17 +155,13 @@ def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: text = _REL_TEXT.sub(_collect, text) return pairs, text -# Obsidian Callout Parser +# Obsidian Callout Parser für mehrzeilige Blöcke _CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) _REL_LINE = re.compile(r"^(?P[a-z_]+)\s*:\s*(?P.+?)\s*$", re.IGNORECASE) _WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: - """ - Findet [!edge]-Callouts und extrahiert (kind, target). Entfernt den gesamten - Callout-Block aus dem Text (damit Wikilinks daraus nicht zusätzlich als - "references" gezählt werden). - """ + """Verarbeitet [!edge]-Callouts und entfernt diese aus dem Textfluss.""" if not text: return [], text @@ -205,21 +204,20 @@ def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: t = raw.strip() if t: out_pairs.append((kind, t)) - - # Callout wird NICHT in keep_lines übernommen continue remainder = "\n".join(keep_lines) return out_pairs, remainder def _extract_wikilinks(text: str) -> List[str]: + """Extrahiert Standard-Wikilinks aus dem verbleibenden Text.""" ids: List[str] = [] for m in _WIKILINK_RE.finditer(text or ""): ids.append(m.group(1).strip()) return ids # --------------------------------------------------------------------------- # -# Hauptfunktion +# 4. Hauptfunktion (build_edges_for_note) # --------------------------------------------------------------------------- # def build_edges_for_note( @@ -229,24 +227,13 @@ def build_edges_for_note( include_note_scope_refs: bool = False, ) -> List[dict]: """ - Erzeugt Kanten für eine Note. 
- - - belongs_to: für jeden Chunk (chunk -> note) - - next / prev: zwischen aufeinanderfolgenden Chunks - - references: pro Chunk aus window/text (via Wikilinks) - - typed inline relations: [[rel:KIND | Target]] / [[rel:KIND Target]] / rel: KIND [[Target]] - - Obsidian Callouts: > [!edge] KIND: [[Target]] [[Target2]] - - optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references - - typenbasierte Default-Kanten (edge_defaults) je gefundener Referenz + Erzeugt und aggregiert alle Kanten für eine Note inklusive WP-15b Candidate-Processing. + Setzt Provenance-Ranking zur Graph-Stabilisierung ein. """ edges: List[dict] = [] + note_type = _get(chunks[0], "type") if chunks else "concept" - # Note-Typ (aus erstem Chunk erwartet) - note_type = None - if chunks: - note_type = _get(chunks[0], "type") - - # 1) belongs_to + # 1) Struktur-Kanten: belongs_to (Chunk -> Note) for ch in chunks: cid = _get(ch, "chunk_id", "id") if not cid: @@ -254,12 +241,12 @@ def build_edges_for_note( edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, { "chunk_id": cid, "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"), - "provenance": "rule", + "provenance": "structure", "rule_id": "structure:belongs_to", - "confidence": 1.0, + "confidence": PROVENANCE_PRIORITY["structure:belongs_to"], })) - # 2) next / prev + # 2) Struktur-Kanten: next / prev (Sequenz) for i in range(len(chunks) - 1): a, b = chunks[i], chunks[i + 1] a_id = _get(a, "chunk_id", "id") @@ -269,19 +256,19 @@ def build_edges_for_note( edges.append(_edge("next", "chunk", a_id, b_id, note_id, { "chunk_id": a_id, "edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"), - "provenance": "rule", + "provenance": "structure", "rule_id": "structure:order", - "confidence": 0.95, + "confidence": PROVENANCE_PRIORITY["structure:order"], })) edges.append(_edge("prev", "chunk", b_id, a_id, note_id, { "chunk_id": b_id, "edge_id": 
_mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"), - "provenance": "rule", + "provenance": "structure", "rule_id": "structure:order", - "confidence": 0.95, + "confidence": PROVENANCE_PRIORITY["structure:order"], })) - # 3) references + typed inline + callouts + defaults (chunk-scope) + # 3) Inhaltliche Kanten (Refs, Inlines, Callouts, Candidates) reg = _load_types_registry() defaults = _edge_defaults_for(note_type, reg) refs_all: List[str] = [] @@ -292,51 +279,49 @@ def build_edges_for_note( continue raw = _chunk_text_for_refs(ch) - # 3a) typed inline relations + # 3a) Typed Inline Relations typed, remainder = _extract_typed_relations(raw) for kind, target in typed: - kind = kind.strip().lower() - if not kind or not target: - continue - edges.append(_edge(kind, "chunk", cid, target, note_id, { + k = kind.strip().lower() + if not k or not target: continue + edges.append(_edge(k, "chunk", cid, target, note_id, { "chunk_id": cid, - "edge_id": _mk_edge_id(kind, cid, target, "chunk", "inline:rel"), + "edge_id": _mk_edge_id(k, cid, target, "chunk", "inline:rel"), "provenance": "explicit", "rule_id": "inline:rel", - "confidence": 0.95, + "confidence": PROVENANCE_PRIORITY["inline:rel"], })) - if kind in {"related_to", "similar_to"}: - edges.append(_edge(kind, "chunk", target, cid, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(kind, target, cid, "chunk", "inline:rel"), - "provenance": "explicit", - "rule_id": "inline:rel", - "confidence": 0.95, - })) - # 3b) callouts + # 3b) WP-15b Candidate Pool Integration (KI-validierte Kanten) + # Verarbeitet Kanten, die bereits in der Ingestion semantisch geprüft wurden. 
+ pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] + for cand in pool: + target = cand.get("to") + kind = cand.get("kind", "related_to") + prov = cand.get("provenance", "semantic_ai") + if not target: continue + edges.append(_edge(kind, "chunk", cid, target, note_id, { + "chunk_id": cid, + "edge_id": _mk_edge_id(kind, cid, target, "chunk", f"candidate:{prov}"), + "provenance": prov, + "rule_id": f"candidate:{prov}", + "confidence": PROVENANCE_PRIORITY.get(prov, 0.90), + })) + + # 3c) Obsidian Callouts call_pairs, remainder2 = _extract_callout_relations(remainder) for kind, target in call_pairs: k = (kind or "").strip().lower() - if not k or not target: - continue + if not k or not target: continue edges.append(_edge(k, "chunk", cid, target, note_id, { "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"), "provenance": "explicit", "rule_id": "callout:edge", - "confidence": 0.95, + "confidence": PROVENANCE_PRIORITY["callout:edge"], })) - if k in {"related_to", "similar_to"}: - edges.append(_edge(k, "chunk", target, cid, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(k, target, cid, "chunk", "callout:edge"), - "provenance": "explicit", - "rule_id": "callout:edge", - "confidence": 0.95, - })) - # 3c) generische Wikilinks → references (+ defaults je Ref) + # 3d) Standard-Wikilinks -> references (+ defaults) refs = _extract_wikilinks(remainder2) for r in refs: edges.append(_edge("references", "chunk", cid, r, note_id, { @@ -345,76 +330,65 @@ def build_edges_for_note( "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), "provenance": "explicit", "rule_id": "explicit:wikilink", - "confidence": 1.0, + "confidence": PROVENANCE_PRIORITY["explicit:wikilink"], })) + # Regelbasierte Kanten aus types.yaml anhängen for rel in defaults: - if rel == "references": - continue + if rel == "references": continue edges.append(_edge(rel, "chunk", cid, r, note_id, { "chunk_id": cid, "edge_id": _mk_edge_id(rel, 
cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"), "provenance": "rule", "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": 0.7, + "confidence": PROVENANCE_PRIORITY["edge_defaults"], })) - if rel in {"related_to", "similar_to"}: - edges.append(_edge(rel, "chunk", r, cid, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(rel, r, cid, "chunk", f"edge_defaults:{note_type}:{rel}"), - "provenance": "rule", - "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": 0.7, - })) refs_all.extend(refs) - # 4) optional note-scope refs/backlinks (+ defaults) + # 4) Optionale Note-Scope Referenzen & Backlinks if include_note_scope_refs: refs_note = list(refs_all or []) if note_level_references: refs_note.extend([r for r in note_level_references if isinstance(r, str) and r]) refs_note = _dedupe_seq(refs_note) + for r in refs_note: edges.append(_edge("references", "note", note_id, r, note_id, { "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"), "provenance": "explicit", "rule_id": "explicit:note_scope", - "confidence": 1.0, + "confidence": PROVENANCE_PRIORITY["explicit:note_scope"], })) + # Backlink-Erzeugung zur Graphen-Stärkung edges.append(_edge("backlink", "note", r, note_id, note_id, { "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"), "provenance": "rule", "rule_id": "derived:backlink", - "confidence": 0.9, + "confidence": PROVENANCE_PRIORITY["derived:backlink"], })) for rel in defaults: - if rel == "references": - continue + if rel == "references": continue edges.append(_edge(rel, "note", note_id, r, note_id, { "edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"), "provenance": "rule", "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": 0.7, + "confidence": PROVENANCE_PRIORITY["edge_defaults"], })) - if rel in {"related_to", "similar_to"}: - edges.append(_edge(rel, "note", r, note_id, note_id, { - "edge_id": _mk_edge_id(rel, r, note_id, "note", 
f"edge_defaults:{note_type}:{rel}"), - "provenance": "rule", - "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": 0.7, - })) - # 5) De-Dupe (source_id, target_id, relation, rule_id) - seen: Set[Tuple[str,str,str,str]] = set() - out: List[dict] = [] + # 5) WP-15b: Confidence-basierte De-Duplizierung + # Wenn dieselbe Relation mehrfach existiert, gewinnt die mit der höchsten Confidence. + unique_map: Dict[Tuple[str, str, str], dict] = {} + for e in edges: - s = str(e.get("source_id") or "") - t = str(e.get("target_id") or "") + s, t = str(e.get("source_id")), str(e.get("target_id")) rel = str(e.get("relation") or e.get("kind") or "edge") - rule = str(e.get("rule_id") or "") - key = (s, t, rel, rule) - if key in seen: - continue - seen.add(key) - out.append(e) - return out + key = (s, t, rel) + + if key not in unique_map: + unique_map[key] = e + else: + # Vergleich der Vertrauenswürdigkeit (Provenance Ranking) + if e.get("confidence", 0) > unique_map[key].get("confidence", 0): + unique_map[key] = e + + return list(unique_map.values()) \ No newline at end of file diff --git a/app/core/ingestion.py b/app/core/ingestion.py index fa71d1f..ce35daf 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -3,12 +3,12 @@ FILE: app/core/ingestion.py DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen. WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free). WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash. -FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen - JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust - bei umfangreichen Protokollen zu verhindern. -VERSION: 2.11.14 + WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation. + FIX: Beibehaltung der Deep Fallback Logic (v2.11.14) zur JSON-Recovery. 
+VERSION: 2.12.0 STATUS: Active -DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry +DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, + app.services.llm_service, app.services.edge_registry """ import os import json @@ -21,9 +21,11 @@ from typing import Dict, List, Optional, Tuple, Any # Core Module Imports from app.core.parser import ( read_markdown, + pre_scan_markdown, normalize_frontmatter, validate_required_frontmatter, extract_edges_with_context, + NoteContext ) from app.core.note_payload import make_note_payload from app.core.chunker import assemble_chunks, get_chunk_config @@ -49,7 +51,7 @@ from app.services.llm_service import LLMService logger = logging.getLogger(__name__) -# --- Global Helpers --- +# --- Global Helpers (Full Compatibility v2.11.14) --- def extract_json_from_response(text: str) -> Any: """ Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama). @@ -115,6 +117,7 @@ class IngestionService: self.llm = LLMService() self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE + self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache try: ensure_collections(self.client, self.prefix, self.dim) @@ -122,6 +125,54 @@ class IngestionService: except Exception as e: logger.warning(f"DB init warning: {e}") + async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: + """ + WP-15b: Implementiert den Two-Pass Ingestion Workflow. + Pass 1: Pre-Scan baut Kontext-Cache auf. + Pass 2: Processing führt semantische Validierung durch. 
+ """ + logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Batch Cache...") + for path in file_paths: + ctx = pre_scan_markdown(path) + if ctx: + self.batch_cache[ctx.note_id] = ctx + + logger.info(f"🚀 [Pass 2] Processing {len(file_paths)} files...") + results = [] + for path in file_paths: + res = await self.process_file(path, vault_root, apply=True) + results.append(res) + return results + + async def _validate_candidate(self, chunk_text: str, edge: Dict) -> bool: + """ + WP-15b: Validiert einen Kanten-Kandidaten semantisch gegen das Ziel. + Nutzt den Cache aus Pass 1, um dem LLM Kontext der Ziel-Note zu geben. + """ + target_id = edge.get("to") + target_ctx = self.batch_cache.get(target_id) + + # Falls Zielnotiz nicht im aktuellen Batch ist: 'explicit' durchlassen (Hard-Link Integrity) + if not target_ctx: + return True + + provider = self.settings.MINDNET_LLM_PROVIDER + template = self.llm.get_prompt("edge_validation", provider) + + try: + prompt = template.format( + chunk_text=chunk_text[:1500], + target_title=target_ctx.title, + target_summary=target_ctx.summary, + edge_kind=edge.get("kind", "related_to") + ) + + response = await self.llm.generate_raw_response(prompt, priority="background") + return "YES" in response.upper() + except Exception as e: + logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}") + return True # Fallback: Im Zweifel Link behalten + def _resolve_note_type(self, requested: Optional[str]) -> str: """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" types = self.registry.get("types", {}) @@ -138,109 +189,12 @@ class IngestionService: return cfg return get_chunk_config(note_type) - async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]: - """ - KI-Extraktion mit Deep-Fallback Logik. - Erzwingt den lokalen Ollama-Sprung, wenn die Cloud-Antwort keine verwertbaren - Kanten liefert (häufig bei Policy Violations auf OpenRouter). 
- """ - provider = self.settings.MINDNET_LLM_PROVIDER - model = self.settings.OPENROUTER_MODEL if provider == "openrouter" else self.settings.GEMINI_MODEL - - logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}") - - edge_registry.ensure_latest() - valid_types_str = ", ".join(sorted(list(edge_registry.valid_types))) - - template = self.llm.get_prompt("edge_extraction", provider) - - try: - try: - # Wir begrenzen den Kontext auf 6000 Zeichen (ca. 1500 Token) - prompt = template.format( - text=text[:6000], - note_id=note_id, - valid_types=valid_types_str - ) - except KeyError as ke: - logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt).") - return [] - - # 1. Versuch: Anfrage an den primären Cloud-Provider - response_json = await self.llm.generate_raw_response( - prompt=prompt, priority="background", force_json=True, - provider=provider, model_override=model - ) - - # Initiales Parsing - raw_data = extract_json_from_response(response_json) - - # 2. Dictionary Recovery (Versuche Liste aus Dict zu extrahieren) - candidates = [] - if isinstance(raw_data, list): - candidates = raw_data - elif isinstance(raw_data, dict): - logger.info(f"ℹ️ [Ingestion] LLM returned dict, checking for embedded lists in {note_id}") - for k in ["edges", "links", "results", "kanten", "matches", "edge_list"]: - if k in raw_data and isinstance(raw_data[k], list): - candidates = raw_data[k] - break - # Wenn immer noch keine Liste gefunden, versuche Key-Value Paare (Dict Recovery) - if not candidates: - for k, v in raw_data.items(): - if isinstance(v, str): candidates.append(f"{k}:{v}") - elif isinstance(v, list): [candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)] - - # 3. DEEP FALLBACK: Wenn nach allen Recovery-Versuchen die Liste leer ist UND wir in der Cloud waren - # Triggert den Fallback bei "Data Policy Violations" (leere oder Fehler-JSONs). 
- if not candidates and provider != "ollama" and self.settings.LLM_FALLBACK_ENABLED: - logger.warning( - f"🛑 [Ingestion] Cloud-Antwort für {note_id} lieferte keine verwertbaren Kanten. " - f"Mögliche Policy Violation oder Refusal. Erzwinge LOKALEN FALLBACK via Ollama..." - ) - response_json_local = await self.llm.generate_raw_response( - prompt=prompt, priority="background", force_json=True, provider="ollama" - ) - raw_data_local = extract_json_from_response(response_json_local) - - # Wiederhole Recovery für lokale Antwort - if isinstance(raw_data_local, list): - candidates = raw_data_local - elif isinstance(raw_data_local, dict): - for k in ["edges", "links", "results"]: - if k in raw_data_local and isinstance(raw_data_local[k], list): - candidates = raw_data_local[k]; break - - if not candidates: - logger.warning(f"⚠️ [Ingestion] Auch nach Fallback keine extrahierbaren Kanten für {note_id}") - return [] - - processed = [] - for item in candidates: - if isinstance(item, dict) and "to" in item: - item["provenance"] = "semantic_ai" - item["line"] = f"ai-{provider}" - processed.append(item) - elif isinstance(item, str) and ":" in item: - parts = item.split(":", 1) - processed.append({ - "to": parts[1].strip(), - "kind": parts[0].strip(), - "provenance": "semantic_ai", - "line": f"ai-{provider}" - }) - return processed - - except Exception as e: - logger.warning(f"⚠️ [Ingestion] Smart Edge Allocation failed for {note_id}: {e}") - return [] - async def process_file( self, file_path: str, vault_root: str, force_replace: bool = False, apply: bool = False, purge_before: bool = False, note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical" ) -> Dict[str, Any]: - """Transformiert eine Markdown-Datei in den Graphen (Notes, Chunks, Edges).""" + """Transformiert eine Markdown-Datei in den Graphen.""" result = {"path": file_path, "status": "skipped", "changed": False, "error": None} # 1. 
Parse & Lifecycle Gate @@ -252,12 +206,12 @@ class IngestionService: except Exception as e: return {**result, "error": f"Validation failed: {str(e)}"} - # WP-22: Filter für Systemdateien und Entwürfe + # Lifecycle Filter (WP-22) status = fm.get("status", "draft").lower().strip() if status in ["system", "template", "archive", "hidden"]: return {**result, "status": "skipped", "reason": f"lifecycle_{status}"} - # 2. Config Resolution & Payload Construction + # 2. Config Resolution & Payload note_type = self._resolve_note_type(fm.get("type")) fm["type"] = note_type @@ -267,15 +221,13 @@ class IngestionService: except Exception as e: return {**result, "error": f"Payload failed: {str(e)}"} - # 3. Change Detection (Strikte DoD Umsetzung) + # 3. Change Detection (v2.11.14 Logic) old_payload = None if force_replace else self._fetch_note_payload(note_id) check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" old_hash = (old_payload or {}).get("hashes", {}).get(check_key) new_hash = note_pl.get("hashes", {}).get(check_key) - # Prüfung auf fehlende Artefakte in Qdrant chunks_missing, edges_missing = self._artifacts_missing(note_id) - should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing if not should_write: @@ -284,40 +236,42 @@ class IngestionService: if not apply: return {**result, "status": "dry-run", "changed": True, "note_id": note_id} - # 4. Processing (Chunking, Embedding, AI Edges) + # 4. 
Processing (Chunking, Embedding, Validated Edges) try: body_text = getattr(parsed, "body", "") or "" edge_registry.ensure_latest() - # Profil-gesteuertes Chunking + # Chunker Resolution profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard" chunk_cfg = self._get_chunk_config_by_profile(profile, note_type) chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) - # Vektorisierung + # Embeddings vecs = [] if chunk_pls: texts = [c.get("window") or c.get("text") or "" for c in chunk_pls] vecs = await self.embedder.embed_documents(texts) - # Kanten-Extraktion + # Kanten-Extraktion & WP-15b Validierung edges = [] context = {"file": file_path, "note_id": note_id} - # A. Explizite Kanten (User / Wikilinks) - for e in extract_edges_with_context(parsed): - e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")}) - edges.append(e) + # A. Explizite Kandidaten (Wikilinks) + raw_candidates = extract_edges_with_context(parsed) + for cand in raw_candidates: + # Semantische Prüfung gegen Pass 1 Cache + if await self._validate_candidate(body_text, cand): + cand["kind"] = edge_registry.resolve( + edge_type=cand["kind"], + provenance="explicit", + context={**context, "line": cand.get("line")} + ) + edges.append(cand) + else: + logger.info(f"🚫 WP-15b: Candidate rejected: {cand['kind']} -> {cand['to']}") - # B. KI Kanten (Turbo Mode mit v2.11.14 Fallback) - ai_edges = await self._perform_smart_edge_allocation(body_text, note_id) - for e in ai_edges: - valid_kind = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")}) - e["kind"] = valid_kind - edges.append(e) - - # C. System Kanten (Struktur) + # B. 
System Kanten (Struktur) try: sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs) except: diff --git a/app/core/parser.py b/app/core/parser.py index b47aeb7..7d183c0 100644 --- a/app/core/parser.py +++ b/app/core/parser.py @@ -2,10 +2,11 @@ FILE: app/core/parser.py DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body. WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry. -VERSION: 1.8.0 + WP-15b: Implementierung NoteContext und pre_scan_markdown für Pass 1 Ingestion. +VERSION: 1.9.0 STATUS: Active DEPENDENCIES: yaml, re, dataclasses, json, io, os -LAST_ANALYSIS: 2025-12-23 +LAST_ANALYSIS: 2025-12-26 """ from __future__ import annotations @@ -32,6 +33,15 @@ class ParsedNote: body: str path: str +@dataclass +class NoteContext: + """Metadaten-Container für den flüchtigen LocalBatchCache (Pass 1).""" + note_id: str + title: str + type: str + summary: str + tags: List[str] + # --------------------------------------------------------------------- # Frontmatter-Erkennung @@ -152,6 +162,32 @@ def read_markdown(path: str) -> Optional[ParsedNote]: return ParsedNote(frontmatter=fm or {}, body=body or "", path=path) +def pre_scan_markdown(path: str) -> Optional[NoteContext]: + """ + WP-15b: Schneller Scan für den LocalBatchCache (Pass 1). + Extrahiert nur Identität und Kurz-Kontext zur semantischen Validierung. + """ + parsed = read_markdown(path) + if not parsed: + return None + + fm = parsed.frontmatter + # ID-Findung: Frontmatter ID oder Dateiname als Fallback + note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0]) + + # Erstelle Kurz-Zusammenfassung (erste 500 Zeichen des Body, bereinigt) + clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip() + summary = clean_body[:500] + "..." 
if len(clean_body) > 500 else clean_body + + return NoteContext( + note_id=note_id, + title=str(fm.get("title", note_id)), + type=str(fm.get("type", "concept")), + summary=summary, + tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else [] + ) + + def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None: """ diff --git a/app/services/edge_registry.py b/app/services/edge_registry.py index 95be97b..0763370 100644 --- a/app/services/edge_registry.py +++ b/app/services/edge_registry.py @@ -1,11 +1,14 @@ """ FILE: app/services/edge_registry.py DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload. + WP-15b: Erweiterte Provenance-Prüfung für die Candidate-Validation. + Sichert die Graph-Integrität durch strikte Trennung von System- und Inhaltskanten. WP-22: Fix für absolute Pfade außerhalb des Vaults (Prod-Dictionary). WP-20: Synchronisation mit zentralen Settings (v0.6.2). -VERSION: 0.7.5 +VERSION: 0.8.0 STATUS: Active DEPENDENCIES: re, os, json, logging, time, app.config +LAST_ANALYSIS: 2025-12-26 """ import re import os @@ -19,7 +22,12 @@ from app.config import get_settings logger = logging.getLogger(__name__) class EdgeRegistry: + """ + Zentraler Verwalter für das Kanten-Vokabular. + Implementiert das Singleton-Pattern für konsistente Validierung über alle Services. + """ _instance = None + # System-Kanten, die nicht durch User oder KI gesetzt werden dürfen FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"} def __new__(cls, *args, **kwargs): @@ -51,7 +59,7 @@ class EdgeRegistry: def ensure_latest(self): """ Prüft den Zeitstempel der Vokabular-Datei und lädt bei Bedarf neu. - Verhindert den AttributeError in der Ingestion-Pipeline. + Verhindert Inkonsistenzen bei Laufzeit-Updates des Dictionaries. """ if not os.path.exists(self.full_vocab_path): logger.error(f"!!! 
[EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!") @@ -66,7 +74,10 @@ class EdgeRegistry: logger.error(f"!!! [EDGE-REGISTRY] Error checking file time: {e}") def _load_vocabulary(self): - """Parst das Markdown-Wörterbuch und baut die Canonical-Map auf.""" + """ + Parst das Markdown-Wörterbuch und baut die Canonical-Map auf. + Erkennt Tabellen-Strukturen und extrahiert fettgedruckte System-Typen. + """ self.canonical_map.clear() self.valid_types.clear() @@ -101,8 +112,8 @@ class EdgeRegistry: def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str: """ - Validiert einen Kanten-Typ gegen das Vokabular. - Loggt unbekannte Typen für die spätere manuelle Pflege. + WP-15b: Validiert einen Kanten-Typ gegen das Vokabular und prüft Berechtigungen. + Sichert, dass nur strukturelle Prozesse System-Kanten setzen dürfen. """ self.ensure_latest() if not edge_type: @@ -112,20 +123,23 @@ class EdgeRegistry: clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_") ctx = context or {} - # System-Kanten dürfen nicht manuell vergeben werden - if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES: - self._log_issue(clean_type, "forbidden_system_usage", ctx) + # WP-15b: System-Kanten dürfen weder manuell noch durch KI/Vererbung gesetzt werden. + # Nur Provenienz 'structure' (interne Prozesse) ist autorisiert. + # Wir blockieren hier alle Provenienzen außer 'structure'. 
+ restricted_provenance = ["explicit", "semantic_ai", "inherited", "global_pool", "rule"] + if provenance in restricted_provenance and clean_type in self.FORBIDDEN_SYSTEM_EDGES: + self._log_issue(clean_type, f"forbidden_usage_by_{provenance}", ctx) return "related_to" - # System-Kanten sind nur bei struktureller Provenienz erlaubt + # System-Kanten sind NUR bei struktureller Provenienz erlaubt if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES: return clean_type - # Mapping auf kanonischen Namen + # Mapping auf kanonischen Namen (Alias-Auflösung) if clean_type in self.canonical_map: return self.canonical_map[clean_type] - # Fallback und Logging + # Fallback und Logging unbekannter Typen für Admin-Review self._log_issue(clean_type, "unknown_type", ctx) return clean_type @@ -139,12 +153,13 @@ class EdgeRegistry: "error": error_kind, "file": ctx.get("file", "unknown"), "line": ctx.get("line", "unknown"), - "note_id": ctx.get("note_id", "unknown") + "note_id": ctx.get("note_id", "unknown"), + "provenance": ctx.get("provenance", "unknown") } with open(self.unknown_log_path, "a", encoding="utf-8") as f: f.write(json.dumps(entry) + "\n") except Exception: pass -# Singleton Export +# Singleton Export für systemweiten Zugriff registry = EdgeRegistry() \ No newline at end of file diff --git a/config/prompts.yaml b/config/prompts.yaml index 13b800d..f554155 100644 --- a/config/prompts.yaml +++ b/config/prompts.yaml @@ -1,6 +1,7 @@ -# config/prompts.yaml — Final V2.5.5 (OpenRouter Hardening) +# config/prompts.yaml — Final V2.6.0 (WP-15b Candidate-Validation) # WP-20: Optimierte Cloud-Templates zur Unterdrückung von Modell-Geschwätz. # FIX: Explizite Verbote für Einleitungstexte zur Vermeidung von JSON-Parsing-Fehlern. +# WP-15b: Integration der binären edge_validation für den Two-Pass Workflow. # OLLAMA: UNVERÄNDERT laut Benutzeranweisung. system_prompt: | @@ -215,7 +216,7 @@ edge_extraction: 4. 
Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten. BEISPIEL: - [[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]] + [[ {{"to": "Ziel-Konzept", \"kind\": \"beziehungs_typ\"}} ]] TEXT: """ @@ -227,13 +228,46 @@ edge_extraction: Analysiere '{note_id}'. Extrahiere semantische Beziehungen. ERLAUBTE TYPEN: {valid_types} TEXT: {text} - OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: []. + OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to\":\"Ziel\",\"kind\":\"typ\"}}]]. Kein Text davor/danach. Wenn nichts: []. openrouter: | TASK: Extrahiere semantische Relationen für '{note_id}'. ERLAUBTE TYPEN: {valid_types} TEXT: {text} ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten. - FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]] + FORMAT: [[{{"to\":\"Ziel-Begriff\",\"kind\":\"typ\"}}]] STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen. Wenn keine Relationen existieren, antworte NUR mit: [] - OUTPUT: \ No newline at end of file + OUTPUT: + +# --------------------------------------------------------- +# 8. WP-15b: EDGE VALIDATION (Intent: VALIDATE) +# --------------------------------------------------------- +edge_validation: + gemini: | + Bewerte die semantische Validität dieser Verbindung im Wissensgraph. + + KONTEXT DER QUELLE (Chunk): + "{chunk_text}" + + ZIEL-NOTIZ: "{target_title}" + ZIEL-BESCHREIBUNG (Zusammenfassung): + "{target_summary}" + + GEPLANTE RELATION: "{edge_kind}" + + FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel? + REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk. + openrouter: | + Verify semantic relation for graph construction. + Source Context: {chunk_text} + Target Note: {target_title} + Target Summary: {target_summary} + Proposed Relation: {edge_kind} + Instruction: Does the source context support this relation to the target? + Result: Respond ONLY with 'YES' or 'NO'. 
+ ollama: | + Bewerte die semantische Korrektheit dieser Verbindung. + QUELLE: {chunk_text} + ZIEL: {target_title} ({target_summary}) + BEZIEHUNG: {edge_kind} + Ist diese Verbindung valide? Antworte NUR mit YES oder NO. \ No newline at end of file From c676c8263f3f172da14fd930580b5f01c04527f1 Mon Sep 17 00:00:00 2001 From: Lars Date: Fri, 26 Dec 2025 22:07:25 +0100 Subject: [PATCH 02/23] =?UTF-8?q?Import=20Script=20und=20Logging=20f=C3=BC?= =?UTF-8?q?r=20WP15b?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/ingestion.py | 91 ++++++++++++++++++++++---------------- scripts/import_markdown.py | 42 ++++++++++++------ 2 files changed, 82 insertions(+), 51 deletions(-) diff --git a/app/core/ingestion.py b/app/core/ingestion.py index ce35daf..b433fc4 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -4,8 +4,10 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen. WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free). WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash. WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation. - FIX: Beibehaltung der Deep Fallback Logic (v2.11.14) zur JSON-Recovery. -VERSION: 2.12.0 +FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen + JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust + bei umfangreichen Protokollen zu verhindern. +VERSION: 2.12.1 STATUS: Active DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry @@ -128,16 +130,16 @@ class IngestionService: async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: """ WP-15b: Implementiert den Two-Pass Ingestion Workflow. - Pass 1: Pre-Scan baut Kontext-Cache auf. - Pass 2: Processing führt semantische Validierung durch. + Pass 1: Pre-Scan baut flüchtigen Kontext-Cache auf. 
+ Pass 2: Processing führt die eigentliche semantische Validierung durch. """ - logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Batch Cache...") + logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") for path in file_paths: ctx = pre_scan_markdown(path) if ctx: self.batch_cache[ctx.note_id] = ctx - logger.info(f"🚀 [Pass 2] Processing {len(file_paths)} files...") + logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...") results = [] for path in file_paths: res = await self.process_file(path, vault_root, apply=True) @@ -152,14 +154,17 @@ class IngestionService: target_id = edge.get("to") target_ctx = self.batch_cache.get(target_id) - # Falls Zielnotiz nicht im aktuellen Batch ist: 'explicit' durchlassen (Hard-Link Integrity) + # Sicherheits-Fallback: Wenn Zielnotiz nicht im aktuellen Batch ist, + # lassen wir die Kante als 'explicit' durch (Hard-Link Integrity). if not target_ctx: + logger.info(f"ℹ️ [VALIDATION SKIP] No cache context for '{target_id}' - allowing link.") return True provider = self.settings.MINDNET_LLM_PROVIDER template = self.llm.get_prompt("edge_validation", provider) try: + logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...") prompt = template.format( chunk_text=chunk_text[:1500], target_title=target_ctx.title, @@ -168,7 +173,14 @@ class IngestionService: ) response = await self.llm.generate_raw_response(prompt, priority="background") - return "YES" in response.upper() + is_valid = "YES" in response.upper() + + if is_valid: + logger.info(f"✅ [VALIDATED] Relation '{edge.get('kind')}' to '{target_id}' confirmed.") + else: + logger.info(f"🚫 [REJECTED] WP-15b Candidate: '{edge.get('kind')}' -> '{target_id}' not relevant.") + + return is_valid except Exception as e: logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}") return True # Fallback: Im Zweifel Link behalten @@ -244,44 +256,49 @@ class IngestionService: # Chunker Resolution profile 
= fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard" chunk_cfg = self._get_chunk_config_by_profile(profile, note_type) + enable_smart_edges = chunk_cfg.get("enable_smart_edge_allocation", False) + + # WP-15b: Chunker bereitet nun den Candidate-Pool vor. chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg) + + # WP-15b: Validierung der Kandidaten aus dem Global Pool. + for ch_obj in chunks: + filtered_pool = [] + for cand in getattr(ch_obj, "candidate_pool", []): + # Nur 'global_pool' (Unzugeordnete Kanten) erfordern LLM-Validierung. + # Sektions-Kanten ('inherited') werden direkt akzeptiert. + if cand.get("provenance") == "global_pool" and enable_smart_edges: + if await self._validate_candidate(ch_obj.text, cand): + filtered_pool.append(cand) + else: + filtered_pool.append(cand) + ch_obj.candidate_pool = filtered_pool + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) - # Embeddings + # Embeddings generieren vecs = [] if chunk_pls: texts = [c.get("window") or c.get("text") or "" for c in chunk_pls] vecs = await self.embedder.embed_documents(texts) - # Kanten-Extraktion & WP-15b Validierung - edges = [] - context = {"file": file_path, "note_id": note_id} - - # A. Explizite Kandidaten (Wikilinks) - raw_candidates = extract_edges_with_context(parsed) - for cand in raw_candidates: - # Semantische Prüfung gegen Pass 1 Cache - if await self._validate_candidate(body_text, cand): - cand["kind"] = edge_registry.resolve( - edge_type=cand["kind"], - provenance="explicit", - context={**context, "line": cand.get("line")} - ) - edges.append(cand) - else: - logger.info(f"🚫 WP-15b: Candidate rejected: {cand['kind']} -> {cand['to']}") - - # B. 
System Kanten (Struktur) - try: - sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs) - except: - sys_edges = build_edges_for_note(note_id, chunk_pls) + # Kanten finalisieren via derive_edges Aggregator (WP-15b kompatibel) + # Nutzt das Provenance-Ranking (v2.1.0). + edges = build_edges_for_note( + note_id, + chunk_pls, + note_level_references=note_pl.get("references", []), + include_note_scope_refs=note_scope_refs + ) - for e in sys_edges: - valid_kind = edge_registry.resolve(edge_type=e.get("kind", "belongs_to"), provenance="structure", context={**context, "line": "system"}) - if valid_kind: - e["kind"] = valid_kind - edges.append(e) + # Alias-Auflösung & Registry Enforcement + context = {"file": file_path, "note_id": note_id} + for e in edges: + e["kind"] = edge_registry.resolve( + edge_type=e.get("kind", "related_to"), + provenance=e.get("provenance", "explicit"), + context={**context, "line": e.get("line", "system")} + ) except Exception as e: logger.error(f"Processing failed for {file_path}: {e}", exc_info=True) diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index d5ce195..917b46a 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -2,7 +2,9 @@ """ scripts/import_markdown.py CLI-Tool zum Importieren von Markdown-Dateien in Qdrant. -Updated for Mindnet v2.3.6 (Async Ingestion Support). +WP-15b: Implementiert den Two-Pass Workflow (Pre-Scan + Processing). +Sorgt dafür, dass der LocalBatchCache vor der Verarbeitung gefüllt wird. 
+VERSION: 2.4.0 """ import asyncio import os @@ -11,21 +13,16 @@ import logging from pathlib import Path from dotenv import load_dotenv -import logging # Setzt das Level global auf INFO, damit Sie den Fortschritt sehen logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') -# Wenn Sie TIEFE Einblicke wollen, setzen Sie den SemanticAnalyzer spezifisch auf DEBUG: -logging.getLogger("app.services.semantic_analyzer").setLevel(logging.DEBUG) - # Importiere den neuen Async Service -# Stellen wir sicher, dass der Pfad stimmt (Pythonpath) import sys sys.path.append(os.getcwd()) from app.core.ingestion import IngestionService +from app.core.parser import pre_scan_markdown -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") logger = logging.getLogger("importer") async def main_async(args): @@ -34,7 +31,7 @@ async def main_async(args): logger.error(f"Vault path does not exist: {vault_path}") return - # Service initialisieren (startet Async Clients) + # 1. Service initialisieren logger.info(f"Initializing IngestionService (Prefix: {args.prefix})") service = IngestionService(collection_prefix=args.prefix) @@ -46,14 +43,31 @@ async def main_async(args): logger.info(f"Found {len(files)} markdown files.") - stats = {"processed": 0, "skipped": 0, "errors": 0} + # ========================================================================= + # PASS 1: Global Pre-Scan (WP-15b) + # Füllt den LocalBatchCache für die semantische Kanten-Validierung. 
+ # ========================================================================= + logger.info(f"🔍 [Pass 1] Pre-scanning {len(files)} files for global context cache...") + for f_path in files: + try: + ctx = pre_scan_markdown(str(f_path)) + if ctx: + service.batch_cache[ctx.note_id] = ctx + except Exception as e: + logger.warning(f"⚠️ Could not pre-scan {f_path}: {e}") - # Wir nutzen eine Semaphore, um nicht zu viele Files gleichzeitig zu öffnen/embedden - sem = asyncio.Semaphore(5) # Max 5 concurrent files to avoid OOM or Rate Limit + logger.info(f"✅ Cache populated with {len(service.batch_cache)} note contexts.") + + # ========================================================================= + # PASS 2: Processing (Batch-Verarbeitung) + # ========================================================================= + stats = {"processed": 0, "skipped": 0, "errors": 0} + sem = asyncio.Semaphore(5) # Max 5 parallele Dateien für Stabilität async def process_with_limit(f_path): async with sem: try: + # Nutzt den nun gefüllten Batch-Cache für die Validierung res = await service.process_file( file_path=str(f_path), vault_root=str(vault_path), @@ -65,8 +79,8 @@ async def main_async(args): except Exception as e: return {"status": "error", "error": str(e), "path": str(f_path)} - # Batch Processing - # Wir verarbeiten in Chunks, um den Progress zu sehen + logger.info(f"🚀 [Pass 2] Starting semantic processing in batches...") + batch_size = 20 for i in range(0, len(files), batch_size): batch = files[i:i+batch_size] @@ -92,7 +106,7 @@ def main(): load_dotenv() default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet") - parser = argparse.ArgumentParser(description="Import Vault to Qdrant (Async)") + parser = argparse.ArgumentParser(description="Import Vault to Qdrant (Two-Pass Ingestion)") parser.add_argument("--vault", default="./vault", help="Path to vault root") parser.add_argument("--prefix", default=default_prefix, help="Collection prefix") parser.add_argument("--force", 
action="store_true", help="Force re-index all files") From 82c775226679ce73b3441329b80f141f228f1c21 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 06:31:57 +0100 Subject: [PATCH 03/23] =?UTF-8?q?richtige=20Filename=20f=C3=BCr=20den=20po?= =?UTF-8?q?ol=20Lookup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/ingestion.py | 42 ++++++++++++++++++++++++-------------- scripts/import_markdown.py | 36 +++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/app/core/ingestion.py b/app/core/ingestion.py index b433fc4..a5a80d8 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -4,10 +4,10 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen. WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free). WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash. WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation. -FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen - JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust - bei umfangreichen Protokollen zu verhindern. -VERSION: 2.12.1 + Sichert, dass explizite Kanten direkt übernommen und nur Pool-Kanten validiert werden. +FIX: Deep Fallback Logic (v2.11.14) für JSON-Recovery. + Robust Lookup Fix: Adressiert Notizen im Cache via ID, Titel und Dateiname. 
+VERSION: 2.12.2 STATUS: Active DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry @@ -137,7 +137,12 @@ class IngestionService: for path in file_paths: ctx = pre_scan_markdown(path) if ctx: + # Mehrfache Indizierung für robusten Look-up (WP-15b Fix) self.batch_cache[ctx.note_id] = ctx + self.batch_cache[ctx.title] = ctx + # Dateiname ohne Endung als dritter Schlüssel + fname = os.path.splitext(os.path.basename(path))[0] + self.batch_cache[fname] = ctx logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...") results = [] @@ -154,10 +159,15 @@ class IngestionService: target_id = edge.get("to") target_ctx = self.batch_cache.get(target_id) + # Fallback Look-up für Links mit Ankern (Anchor entfernen) + if not target_ctx and "#" in target_id: + base_id = target_id.split("#")[0] + target_ctx = self.batch_cache.get(base_id) + # Sicherheits-Fallback: Wenn Zielnotiz nicht im aktuellen Batch ist, # lassen wir die Kante als 'explicit' durch (Hard-Link Integrity). 
if not target_ctx: - logger.info(f"ℹ️ [VALIDATION SKIP] No cache context for '{target_id}' - allowing link.") + logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.") return True provider = self.settings.MINDNET_LLM_PROVIDER @@ -176,9 +186,9 @@ class IngestionService: is_valid = "YES" in response.upper() if is_valid: - logger.info(f"✅ [VALIDATED] Relation '{edge.get('kind')}' to '{target_id}' confirmed.") + logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.") else: - logger.info(f"🚫 [REJECTED] WP-15b Candidate: '{edge.get('kind')}' -> '{target_id}' not relevant.") + logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.") return is_valid except Exception as e: @@ -258,15 +268,15 @@ class IngestionService: chunk_cfg = self._get_chunk_config_by_profile(profile, note_type) enable_smart_edges = chunk_cfg.get("enable_smart_edge_allocation", False) - # WP-15b: Chunker bereitet nun den Candidate-Pool vor. + # WP-15b: Chunker bereitet nun den Candidate-Pool vor (inkl. Inheritance). chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg) - # WP-15b: Validierung der Kandidaten aus dem Global Pool. + # WP-15b: Validierung NUR für Kandidaten aus dem global_pool (Unzugeordnete Kanten) for ch_obj in chunks: filtered_pool = [] for cand in getattr(ch_obj, "candidate_pool", []): - # Nur 'global_pool' (Unzugeordnete Kanten) erfordern LLM-Validierung. - # Sektions-Kanten ('inherited') werden direkt akzeptiert. + # Nur 'global_pool' erfordert LLM-Validierung. + # 'explicit' und 'inherited' werden direkt akzeptiert. 
if cand.get("provenance") == "global_pool" and enable_smart_edges: if await self._validate_candidate(ch_obj.text, cand): filtered_pool.append(cand) @@ -312,12 +322,14 @@ class IngestionService: upsert_batch(self.client, n_name, n_pts) if chunk_pls and vecs: - c_name, c_pts = points_for_chunks(self.prefix, chunk_pls, vecs) - upsert_batch(self.client, c_name, c_pts) + # v2.11.14 Points-Extraction Logic + c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1] + upsert_batch(self.client, f"{self.prefix}_chunks", c_pts) if edges: - e_name, e_pts = points_for_edges(self.prefix, edges) - upsert_batch(self.client, e_name, e_pts) + # v2.11.14 Points-Extraction Logic + e_pts = points_for_edges(self.prefix, edges)[1] + upsert_batch(self.client, f"{self.prefix}_edges", e_pts) return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)} except Exception as e: diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 917b46a..544ae40 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -3,8 +3,9 @@ scripts/import_markdown.py CLI-Tool zum Importieren von Markdown-Dateien in Qdrant. WP-15b: Implementiert den Two-Pass Workflow (Pre-Scan + Processing). -Sorgt dafür, dass der LocalBatchCache vor der Verarbeitung gefüllt wird. -VERSION: 2.4.0 +Sorgt dafür, dass der LocalBatchCache vor der Verarbeitung robust gefüllt wird. +Indiziert Notizen nach ID, Titel und Dateiname für maximale Link-Kompatibilität. 
+VERSION: 2.4.1 """ import asyncio import os @@ -13,10 +14,10 @@ import logging from pathlib import Path from dotenv import load_dotenv -# Setzt das Level global auf INFO, damit Sie den Fortschritt sehen +# Setzt das Level global auf INFO, damit der Fortschritt im Log sichtbar ist logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') -# Importiere den neuen Async Service +# Importiere den neuen Async Service und stelle Python-Pfad sicher import sys sys.path.append(os.getcwd()) @@ -44,30 +45,41 @@ async def main_async(args): logger.info(f"Found {len(files)} markdown files.") # ========================================================================= - # PASS 1: Global Pre-Scan (WP-15b) + # PASS 1: Global Pre-Scan (WP-15b Harvester) # Füllt den LocalBatchCache für die semantische Kanten-Validierung. + # Nutzt ID, Titel und Filename für robusten Look-up. # ========================================================================= logger.info(f"🔍 [Pass 1] Pre-scanning {len(files)} files for global context cache...") for f_path in files: try: ctx = pre_scan_markdown(str(f_path)) if ctx: + # 1. Look-up via Note ID (UUID oder Frontmatter ID) service.batch_cache[ctx.note_id] = ctx + + # 2. Look-up via Titel (Wichtig für Wikilinks [[Titel]]) + service.batch_cache[ctx.title] = ctx + + # 3. 
Look-up via Dateiname (Wichtig für Wikilinks [[Filename]]) + fname = os.path.splitext(f_path.name)[0] + service.batch_cache[fname] = ctx + except Exception as e: - logger.warning(f"⚠️ Could not pre-scan {f_path}: {e}") + logger.warning(f"⚠️ Could not pre-scan {f_path.name}: {e}") - logger.info(f"✅ Cache populated with {len(service.batch_cache)} note contexts.") + logger.info(f"✅ Context Cache populated for {len(files)} notes.") # ========================================================================= - # PASS 2: Processing (Batch-Verarbeitung) + # PASS 2: Processing (Semantic Batch-Verarbeitung) + # Nutzt den gefüllten Cache zur binären Validierung semantischer Kanten. # ========================================================================= stats = {"processed": 0, "skipped": 0, "errors": 0} - sem = asyncio.Semaphore(5) # Max 5 parallele Dateien für Stabilität + sem = asyncio.Semaphore(5) # Max 5 parallele Dateien für Cloud-Stabilität async def process_with_limit(f_path): async with sem: try: - # Nutzt den nun gefüllten Batch-Cache für die Validierung + # Nutzt den nun gefüllten Batch-Cache in der process_file Logik res = await service.process_file( file_path=str(f_path), vault_root=str(vault_path), @@ -106,7 +118,7 @@ def main(): load_dotenv() default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet") - parser = argparse.ArgumentParser(description="Import Vault to Qdrant (Two-Pass Ingestion)") + parser = argparse.ArgumentParser(description="Two-Pass Markdown Ingestion for Mindnet") parser.add_argument("--vault", default="./vault", help="Path to vault root") parser.add_argument("--prefix", default=default_prefix, help="Collection prefix") parser.add_argument("--force", action="store_true", help="Force re-index all files") @@ -114,7 +126,7 @@ def main(): args = parser.parse_args() - # Starte den Async Loop + # Starte den asynchronen Haupt-Loop asyncio.run(main_async(args)) if __name__ == "__main__": From cf302e8334b42e1a05a0be0ffb2f10f073930543 Mon Sep 17 
00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 09:52:17 +0100 Subject: [PATCH 04/23] Import und ingestion auf den neuen Prozess umgestellt --- docs/06_Roadmap/06_active_roadmap.md | 54 +++++++++++++++++++------- docs/06_Roadmap/06_handover_prompts.md | 43 +++++++++++++++++++- 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/docs/06_Roadmap/06_active_roadmap.md b/docs/06_Roadmap/06_active_roadmap.md index 755f66e..59df0a0 100644 --- a/docs/06_Roadmap/06_active_roadmap.md +++ b/docs/06_Roadmap/06_active_roadmap.md @@ -185,44 +185,42 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h 2. **Single Source of Truth (SSOT):** Die Registry nutzt `01_edge_vocabulary.md` als führende Konfiguration. 3. **Self-Learning Loop:** Protokollierung unbekannter Kanten in `unknown_edges.jsonl`. -## 23: Agentic Multi-Stream Reasoning (Mindnet 2025) +### WP-23: Agentic Multi-Stream Reasoning (Mindnet 2025) -### 1. Zielsetzung & Problemstellung +#### 1. Zielsetzung & Problemstellung Das bisherige System basiert auf einem globalen Scoring-Modell, bei dem Notizen unterschiedlicher Typen (z. B. `insight` vs. `belief`) in einem einzigen Retrieval-Topf konkurrieren. Dies führt dazu, dass leiser gewichtete, aber fundamentale Identitätsmerkmale oft durch hochgewichtete aktuelle Erkenntnisse verdrängt werden. Ziel dieses Pakets ist die Einführung einer parallelen **Stream-Architektur**, um die Vielschichtigkeit menschlicher Entscheidungsprozesse (Werte + Erfahrung + Absicht) im LLM-Kontext zu garantieren. ---- - -### 2. Funktionsbeschreibung: Die Streams +#### 2. Funktionsbeschreibung: Die Streams Die Daten aus der `types.yaml` werden in drei logische Verarbeitungseinheiten unterteilt: -#### A. Identity Stream (Die Wahrheitsebene) +##### A. Identity Stream (Die Wahrheitsebene) * **Inhalt:** `value`, `belief`, `trait`, `principle`, `need`, `boundary`, `bias`. 
* **Zweck:** Definition des moralischen Kompasses, der psychologischen Grundbedürfnisse und kognitiven Muster. * **Wirkung:** Liefert das "Warum" hinter jeder Handlung. -#### B. History Stream (Die Evidenzebene) +##### B. History Stream (Die Evidenzebene) * **Inhalt:** `experience`, `event`, `source`, `journal`, `person`. * **Zweck:** Bereitstellung empirischer Belege aus der Vergangenheit und sozialer Kontexte. * **Wirkung:** Verankert die Antwort in real erlebten Mustern und Fakten. -#### C. Action Stream (Die Dynamikebene) +##### C. Action Stream (Die Dynamikebene) * **Inhalt:** `project`, `decision`, `goal`, `task`, `risk`, `motivation`, `habit`, `state`. * **Zweck:** Analyse der aktuellen Richtung, geplanter Vorhaben und des gegenwärtigen Zustands. * **Wirkung:** Liefert den Kontext für die Umsetzung und zukünftige Ziele. -### 3. Technische Wirkungsweise (Solution Sketch) +#### 3. Technische Wirkungsweise (Solution Sketch) -#### Schritt 1: Query-Decomposition +##### Schritt 1: Query-Decomposition Ein initialer Klassifizierungs-Agent analysiert die Nutzeranfrage und bestimmt, welcher Stream primär angesprochen werden muss (z. B. "Wie soll ich mich entscheiden?" boostet den Identity Stream). -#### Schritt 2: Parallel Stream Retrieval +##### Schritt 2: Parallel Stream Retrieval Anstelle einer Suche werden drei unabhängige Vektor-Suchen mit Typ-Filtern durchgeführt: * **Search_A (Identity):** Top-5 Ergebnisse aus Identitäts-Notizen. * **Search_B (History):** Top-5 Ergebnisse aus biografischen/externen Notizen. * **Search_C (Action):** Top-5 Ergebnisse aus operativen/strategischen Notizen. -#### Schritt 3: Agentic Synthesis (The Reasoning) +##### Schritt 3: Agentic Synthesis (The Reasoning) Ein Synthese-Agent (LLM) erhält die aggregierten Ergebnisse in getrennten Sektionen. Die Anweisung lautet: 1. **Prüfung:** Steht das aktuelle Vorhaben (Action) im Einklang mit den Werten (Identity)? 2. 
**Abgleich:** Welche vergangenen Erfahrungen (History) stützen oder widersprechen diesem Weg? @@ -230,12 +228,39 @@ Ein Synthese-Agent (LLM) erhält die aggregierten Ergebnisse in getrennten Sekti -### 4. Erwartete Ergebnisse +#### 4. Erwartete Ergebnisse * **Höhere Resonanz:** Antworten wirken authentischer, da sie explizit auf das Wertesystem des Nutzers Bezug nehmen. * **Widerspruchs-Erkennung:** Das System kann den Nutzer aktiv warnen, wenn ein Projekt gegen seine `principles` oder `needs` verstößt. * **Robustes Retrieval:** Wichtige Identitäts-Informationen gehen nicht mehr im "Rauschen" von hunderten Journal-Einträgen verloren. --- +### WP-24 – Proactive Discovery & Agentic Knowledge Mining +**Status:** 🚀 In Planung (Nächster Architektur-Sprung) +**Ziel:** Transformation von Mindnet von einem reaktiven Archiv zu einem aktiven Denkpartner. Das System soll aktiv Wissenslücken schließen und verborgene Querverbindungen in großen Vaults sowie in Chat-Dialogen aufspüren. + +**Herausforderung:** +1. **Silo-Effekt:** Bei wachsenden Vaults vergisst der Nutzer existierende Notizen und erstellt redundante Inhalte ohne Verknüpfung. +2. **Insight-Verlust:** Im Chat entstehen wertvolle Synthesen, die momentan im flüchtigen Chat-Log vergraben bleiben. + +**Lösungsskizze & Strategie:** + +#### A. Proactive Discovery (Vault-Scanning) +Das System nutzt die existierende `candidate_pool` Logik aus WP-15b, befüllt diese jedoch automatisiert: +* **Vector Similarity Search**: Beim Import einer Note (oder als periodischer Hintergrundprozess) sucht der neue `RecommenderService` in Qdrant nach den Top-X semantisch ähnlichsten Chunks im gesamten Vault. +* **Auto-Injection**: Diese Funde werden automatisch als `related_to` Kandidaten in den `candidate_pool` der neuen Note injiziert. +* **WP-15b Filter**: Das LLM validiert diese Vorschläge im zweiten Pass der Ingestion gegen den Kontext. Nur was semantisch wirklich passt, wird als Kante im Graphen persistiert. + +#### B. 
Agentic Knowledge Mining (Chat-to-Vault) +Integration von Informationen aus dem Dialog direkt in den Graphen: +* **Intent Detection**: Das Chat-Backend erkennt „notierwürdige“ Informationen (z.B. neue Prinzipien, Strategie-Entwürfe oder Werte-Anpassungen). +* **Auto-Drafting**: Das LLM nutzt das `interview_template`, um aus dem Chat-Fragment eine valide Markdown-Datei mit Frontmatter (Status: `draft`) zu generieren. +* **Real-Time Linking**: Die neue Datei wird sofort dem „Discovery-Lauf“ (Teil A) unterzogen, um sie mit dem bestehenden Wissensschatz zu vernetzen. +* **User Review**: Die generierte Notiz erscheint im `00_Inbox` Ordner. Der Nutzer muss lediglich den Status auf `stable` setzen, um die Entdeckungen final zu integrieren. + +**Erwartete Ergebnisse:** +* Eliminierung von Wissens-Silos durch automatische Vernetzung. +* Nahtloser Übergang von der Exploration (Chat) zur Konsolidierung (Vault). +* Vermeidung von Dubletten durch Ähnlichkeits-Warnungen beim Import. ## 4. Abhängigkeiten & Release-Plan ```mermaid @@ -244,6 +269,8 @@ graph TD WP19a --> WP17(Memory) WP15(Smart Edges) --> WP16(Auto-Discovery) WP15 --> WP14(Refactoring) + WP15(Smart Edges) --> WP15b(Candidate Validation) + WP15b --> WP24(Proactive Discovery) WP03(Import) --> WP18(Health Check) WP03 --> WP13(MCP) WP04 --> WP13(MCP) @@ -253,4 +280,5 @@ graph TD WP22 --> WP14 WP15(Smart Edges) --> WP21 WP20(Cloud Hybrid) --> WP15b + WP24 --> WP23(Multi-Stream Reasoning) ``` \ No newline at end of file diff --git a/docs/06_Roadmap/06_handover_prompts.md b/docs/06_Roadmap/06_handover_prompts.md index 3aab30f..9e7edef 100644 --- a/docs/06_Roadmap/06_handover_prompts.md +++ b/docs/06_Roadmap/06_handover_prompts.md @@ -315,4 +315,45 @@ Die Gewichtung findet **Pre-Retrieval** (im Scoring-Algorithmus) statt, **nicht* 2. Zeige die Integration in `ingestion.py` (Status-Filter & Edge-Validierung). 3. Zeige die Erweiterung in `scoring.py` (Status-Gewicht & Dynamic Edge Boosting). 
-Bitte bestätige die Übernahme dieses Architektur-Pakets. \ No newline at end of file +Bitte bestätige die Übernahme dieses Architektur-Pakets. + +--- + +# Übergabe Arbeitspaket: WP-24 – Proactive Discovery & Agentic Knowledge Mining + +## 1. Projekt-Kontext +Wir arbeiten an **Mindnet**, einem System für einen "digitalen Zwilling". Das System nutzt einen Wissensgraph (Qdrant), asynchrone Ingestion und eine hybride LLM-Infrastruktur (Cloud/Lokal). + +## 2. Status Quo (Abgeschlossen: WP-15b) +Das Arbeitspaket **WP-15b (Candidate-Based Validation)** wurde gerade erfolgreich implementiert. +* **Two-Pass Workflow:** In Pass 1 wird ein globaler `LocalBatchCache` aufgebaut (ID, Titel, Dateiname). In Pass 2 findet eine semantische binäre Validierung (YES/NO) statt. +* **Edge Inheritance:** Kanten werden aus Sektionen und Frontmatter an Chunks vererbt. +* **Candidate Pool:** Nur Kanten in der Sektion `## Unzugeordnete Kanten` (Provenienz: `global_pool`) werden vom LLM geprüft. Explizite Kanten (`[!edge]` im Text) werden direkt übernommen. + +## 3. Auftrag: WP-24 – Proactive Discovery & Agentic Knowledge Mining +Das Ziel ist die Transformation von Mindnet zu einem aktiven Denkpartner. + +### Teil A: Proactive Discovery (Vault-Scanning) +* **Mechanismus:** Automatisches Befüllen des `candidate_pool` via Vektor-Ähnlichkeit. +* **Logik:** Beim Import einer Note sucht ein neuer Service in Qdrant nach den semantisch ähnlichsten Chunks im Vault und fügt diese als `related_to` Kandidaten hinzu. +* **Filter:** Die WP-15b Validierungs-Logik filtert diese Vorschläge anschließend. + +### Teil B: Agentic Knowledge Mining (Chat-to-Vault) +* **Mechanismus:** Extraktion notierwürdiger Informationen aus dem Chat. +* **Logik:** Erstellung von Markdown-Drafts im `00_Inbox` Ordner basierend auf dem Chat-Kontext unter Nutzung des `interview_template`. + +## 4. 
Erforderliche Code-Basis (Dateien) +Stelle sicher, dass dir folgende Dateien vorliegen, um die Logik zu verstehen und zu erweitern: + +1. **`app/core/ingestion.py` (v2.12.2):** Zentraler Two-Pass Workflow und Validierungsgate. +2. **`app/core/chunker.py` (v3.2.0):** Vorbereitung des Candidate-Pools und Vererbungslogik. +3. **`scripts/import_markdown.py` (v2.4.1):** Entry-Point und Pre-Scan Harvester für den Cache. +4. **`app/core/derive_edges.py` (v2.1.0):** Aggregator für Kanten mit Provenance-Priorisierung. +5. **`app/services/edge_registry.py` (v0.8.0):** Validierung gegen das Kanten-Vokabular. +6. **`config/prompts.yaml` (v2.6.0):** Enthält die `edge_validation` und `interview_template` Prompts. +7. **`06_active_roadmap.md` (v2.9.0):** Enthält die detaillierte Planung für WP-24. + +## 5. Nächste technische Schritte +1. Entwurf eines `RecommenderService` für die Vektor-Suche in Qdrant. +2. Integration des Services in die `ingestion.py` zur automatischen Befüllung des `candidate_pool`. +3. Erweiterung des Chat-Backends um die "Capture-to-Vault" Funktionalität. 
\ No newline at end of file From 94e5ebf5770d2bbb10b5dbd4eb98791c3e65c06b Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 10:25:35 +0100 Subject: [PATCH 05/23] WP13b Refactoring ingestion und Chunker --- app/core/chunker.py | 429 ++------------------- app/core/chunking/__init__.py | 0 app/core/chunking/chunking_models.py | 31 ++ app/core/chunking/chunking_parser.py | 74 ++++ app/core/chunking/chunking_propagation.py | 25 ++ app/core/chunking/chunking_strategies.py | 74 ++++ app/core/chunking/chunking_utils.py | 55 +++ app/core/ingestion.py | 376 +----------------- app/core/ingestion/__init__.py | 0 app/core/ingestion/ingestion_db.py | 31 ++ app/core/ingestion/ingestion_processor.py | 152 ++++++++ app/core/ingestion/ingestion_utils.py | 69 ++++ app/core/ingestion/ingestion_validation.py | 53 +++ 13 files changed, 607 insertions(+), 762 deletions(-) create mode 100644 app/core/chunking/__init__.py create mode 100644 app/core/chunking/chunking_models.py create mode 100644 app/core/chunking/chunking_parser.py create mode 100644 app/core/chunking/chunking_propagation.py create mode 100644 app/core/chunking/chunking_strategies.py create mode 100644 app/core/chunking/chunking_utils.py create mode 100644 app/core/ingestion/__init__.py create mode 100644 app/core/ingestion/ingestion_db.py create mode 100644 app/core/ingestion/ingestion_processor.py create mode 100644 app/core/ingestion/ingestion_utils.py create mode 100644 app/core/ingestion/ingestion_validation.py diff --git a/app/core/chunker.py b/app/core/chunker.py index c77a43c..d8ea589 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -1,393 +1,36 @@ """ FILE: app/core/chunker.py -DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). - WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung. - Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung. - Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net). 
-VERSION: 3.2.0 +DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. + WP-14: Modularisierung abgeschlossen. + WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert. + Verwendet neue 'chunking_' Präfixe für Untermodule. +VERSION: 3.3.0 STATUS: Active -DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging """ - -from __future__ import annotations -from dataclasses import dataclass, field -from typing import List, Dict, Optional, Tuple, Any, Set +import asyncio import re -import math -import yaml -from pathlib import Path -import asyncio import logging +from typing import List, Dict, Optional -# Services -# In WP-15b wird die KI-Validierung in die ingestion.py verlagert. -# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen. +# Interne Package-Imports mit neuer Präfix-Konvention +from .chunking.chunking_models import Chunk, RawBlock +from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text +from .chunking.chunking_parser import parse_blocks, parse_edges_robust +from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading +from .chunking.chunking_propagation import propagate_section_edges + +logger = logging.getLogger(__name__) + +# Legacy Support für SemanticAnalyzer (Optional für andere Skripte) try: from app.services.semantic_analyzer import get_semantic_analyzer except ImportError: def get_semantic_analyzer(): return None -# Core Imports -try: - from app.core.derive_edges import build_edges_for_note -except ImportError: - # Fallback für Standalone-Betrieb oder Tests - def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return [] - -logger = logging.getLogger(__name__) - -# ========================================== -# 1. 
HELPER & CONFIG -# ========================================== - -BASE_DIR = Path(__file__).resolve().parent.parent.parent -CONFIG_PATH = BASE_DIR / "config" / "types.yaml" -# Fallback Default, falls types.yaml fehlt -DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)} -_CONFIG_CACHE = None - -def _load_yaml_config() -> Dict[str, Any]: - global _CONFIG_CACHE - if _CONFIG_CACHE is not None: return _CONFIG_CACHE - if not CONFIG_PATH.exists(): return {} - try: - with open(CONFIG_PATH, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) - _CONFIG_CACHE = data - return data - except Exception: return {} - -def get_chunk_config(note_type: str) -> Dict[str, Any]: - """ - Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml. - Sichert die Kompatibilität zu WP-15 Profilen. - """ - full_config = _load_yaml_config() - profiles = full_config.get("chunking_profiles", {}) - type_def = full_config.get("types", {}).get(note_type.lower(), {}) - - # Welches Profil nutzt dieser Typ? (z.B. 
'sliding_smart_edges') - profile_name = type_def.get("chunking_profile") - - if not profile_name: - profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard") - - config = profiles.get(profile_name, DEFAULT_PROFILE).copy() - - # Tupel-Konvertierung für Overlap (YAML liest oft Listen) - if "overlap" in config and isinstance(config["overlap"], list): - config["overlap"] = tuple(config["overlap"]) - - return config - -def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: - """Trennt YAML-Frontmatter vom eigentlichen Text.""" - fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL) - if not fm_match: return {}, md_text - try: - frontmatter = yaml.safe_load(fm_match.group(1)) - if not isinstance(frontmatter, dict): frontmatter = {} - except yaml.YAMLError: - frontmatter = {} - text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL) - return frontmatter, text_without_fm.strip() - -# ========================================== -# 2. 
DATA CLASSES & TEXT TOOLS -# ========================================== - -_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])') -_WS = re.compile(r'\s+') - -def estimate_tokens(text: str) -> int: - """Grobe Schätzung der Token-Anzahl (4 Zeichen pro Token).""" - return max(1, math.ceil(len(text.strip()) / 4)) - -def split_sentences(text: str) -> list[str]: - """Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion.""" - text = _WS.sub(' ', text.strip()) - if not text: return [] - parts = _SENT_SPLIT.split(text) - return [p.strip() for p in parts if p.strip()] - -@dataclass -class RawBlock: - kind: str - text: str - level: Optional[int] - section_path: str - section_title: Optional[str] - -@dataclass -class Chunk: - id: str - note_id: str - index: int - text: str - window: str - token_count: int - section_title: Optional[str] - section_path: str - neighbors_prev: Optional[str] - neighbors_next: Optional[str] - # WP-15b: Liste von Kandidaten für die semantische Validierung - candidate_pool: List[Dict[str, Any]] = field(default_factory=list) - suggested_edges: Optional[List[str]] = None - -# ========================================== -# 3. PARSING & STRATEGIES -# ========================================== - -def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: - """ - Zerlegt Text in logische Blöcke (Absätze, Header). - Wichtig für die Strategie 'by_heading' und die Edge-Inheritance. 
- """ - blocks = [] - h1_title = "Dokument" - section_path = "/" - current_h2 = None - - fm, text_without_fm = extract_frontmatter_from_text(md_text) - - h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) - if h1_match: - h1_title = h1_match.group(1).strip() - - lines = text_without_fm.split('\n') - buffer = [] - - for line in lines: - stripped = line.strip() - if stripped.startswith('# '): - continue - elif stripped.startswith('## '): - if buffer: - content = "\n".join(buffer).strip() - if content: - blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) - buffer = [] - current_h2 = stripped[3:].strip() - section_path = f"/{current_h2}" - blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2)) - elif not stripped: - if buffer: - content = "\n".join(buffer).strip() - if content: - blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) - buffer = [] - else: - buffer.append(line) - - if buffer: - content = "\n".join(buffer).strip() - if content: - blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) - - return blocks, h1_title - -def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]: - """ - Standard-Strategie aus WP-15. - Fasst Blöcke zusammen und schneidet bei 'target' Tokens. 
- """ - target = config.get("target", 400) - max_tokens = config.get("max", 600) - overlap_val = config.get("overlap", (50, 80)) - overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val - chunks = [] - buf = [] - - def _create_chunk(txt, win, sec, path): - idx = len(chunks) - chunks.append(Chunk( - id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, - text=txt, window=win, token_count=estimate_tokens(txt), - section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None, - candidate_pool=[] - )) - - def flush_buffer(): - nonlocal buf - if not buf: return - - text_body = "\n\n".join([b.text for b in buf]) - sec_title = buf[-1].section_title if buf else None - sec_path = buf[-1].section_path if buf else "/" - win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body - - if estimate_tokens(text_body) <= max_tokens: - _create_chunk(text_body, win_body, sec_title, sec_path) - else: - sentences = split_sentences(text_body) - current_chunk_sents = [] - current_len = 0 - - for sent in sentences: - sent_len = estimate_tokens(sent) - if current_len + sent_len > target and current_chunk_sents: - c_txt = " ".join(current_chunk_sents) - c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt - _create_chunk(c_txt, c_win, sec_title, sec_path) - - overlap_sents = [] - ov_len = 0 - for s in reversed(current_chunk_sents): - if ov_len + estimate_tokens(s) < overlap: - overlap_sents.insert(0, s) - ov_len += estimate_tokens(s) - else: break - - current_chunk_sents = list(overlap_sents) - current_chunk_sents.append(sent) - current_len = ov_len + sent_len - else: - current_chunk_sents.append(sent) - current_len += sent_len - - if current_chunk_sents: - c_txt = " ".join(current_chunk_sents) - c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt - _create_chunk(c_txt, c_win, sec_title, sec_path) - buf = [] - - for b in blocks: - if b.kind == "heading": continue - 
current_buf_text = "\n\n".join([x.text for x in buf]) - if estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target: - flush_buffer() - buf.append(b) - if estimate_tokens(b.text) >= target: - flush_buffer() - - flush_buffer() - return chunks - -def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: - """ - Hybrid-Strategie v2.9 (Strict/Soft/Safety-Net). - """ - strict = config.get("strict_heading_split", False) - target = config.get("target", 400) - max_tokens = config.get("max", 600) - split_level = config.get("split_level", 2) - - chunks = [] - current_buf = [] - current_tokens = 0 - - def _flush(sec_title, sec_path): - nonlocal current_buf, current_tokens - if not current_buf: return - txt = "\n\n".join(current_buf) - win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt - idx = len(chunks) - chunks.append(Chunk( - id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, - text=txt, window=win, token_count=estimate_tokens(txt), - section_title=sec_title, section_path=sec_path, - neighbors_prev=None, neighbors_next=None, - candidate_pool=[] - )) - current_buf = [] - current_tokens = 0 - - for b in blocks: - if b.kind == "heading": - # Hierarchie-Check: Split bei Überschriften oberhalb des Split-Levels - if b.level < split_level: - _flush(b.section_title, b.section_path) - elif b.level == split_level: - if strict or current_tokens >= target: - _flush(b.section_title, b.section_path) - continue - - block_tokens = estimate_tokens(b.text) - if current_tokens + block_tokens > max_tokens and current_buf: - _flush(b.section_title, b.section_path) - - current_buf.append(b.text) - current_tokens += block_tokens - - if current_buf: - last = blocks[-1] if blocks else None - _flush(last.section_title if last else None, last.section_path if last else "/") - - return chunks - -# ========================================== -# 4. 
ROBUST EDGE PARSING & PROPAGATION -# ========================================== - -def _parse_edges_robust(text: str) -> Set[str]: - """ - Findet Kanten im Text (Wikilinks, Inlines, Callouts). - Fix V3: Support für mehrzeilige Callouts. - """ - found_edges = set() - - # A. Inline [[rel:type|target]] - inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text) - for kind, target in inlines: - k = kind.strip().lower() - t = target.strip() - if k and t: found_edges.add(f"{k}:{t}") - - # B. Multiline Callouts Parsing (WP-15 Fix) - lines = text.split('\n') - current_edge_type = None - for line in lines: - stripped = line.strip() - callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped) - if callout_match: - current_edge_type = callout_match.group(1).strip().lower() - links = re.findall(r'\[\[([^\]]+)\]\]', stripped) - for l in links: - if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") - continue - - if current_edge_type and stripped.startswith('>'): - links = re.findall(r'\[\[([^\]]+)\]\]', stripped) - for l in links: - if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") - elif not stripped.startswith('>'): - current_edge_type = None - - return found_edges - -def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]: - """ - WP-15b: Implementiert Edge-Inheritance. - Kanten aus Überschriften werden an untergeordnete Chunks vererbt. - """ - section_inheritance: Dict[str, Set[str]] = {} - - # 1. Sammeln aus den Heading-Blöcken - for b in blocks: - if b.kind == "heading": - edges = _parse_edges_robust(b.text) - if edges: - if b.section_path not in section_inheritance: - section_inheritance[b.section_path] = set() - section_inheritance[b.section_path].update(edges) - - # 2. 
Injektion in den Candidate-Pool - for ch in chunks: - inherited = section_inheritance.get(ch.section_path, set()) - for e_str in inherited: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"}) - - return chunks - -# ========================================== -# 5. ORCHESTRATION (WP-15b) -# ========================================== - async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: """ - Hauptfunktion zur Chunk-Generierung. - Baut den Candidate-Pool für die semantische Validierung auf. + Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten. + Sichert die Kompatibilität zum bestehenden Ingestion-Prozess. """ if config is None: config = get_chunk_config(note_type) @@ -395,51 +38,47 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op fm, body_text = extract_frontmatter_from_text(md_text) primary_strategy = config.get("strategy", "sliding_window") - # 1. Parsing & Splitting + # 1. Parsing blocks, doc_title = parse_blocks(md_text) + # 2. Splitting via Thread-Offloading if primary_strategy == "by_heading": - chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title) + chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) else: - chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title) + chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) if not chunks: return [] - # 2. WP-15b: Candidate Pool Vorbereitung - + # 3. WP-15b: Candidate Pool Vorbereitung # A. Edge Inheritance (Sektions-Propagation) - chunks = _propagate_section_edges(chunks, blocks) + chunks = propagate_section_edges(chunks, blocks) - # B. Explicit Edges (Direkt im Chunk-Text enthalten) + # B. 
Explicit Edges (Direkt im Chunk-Text) for ch in chunks: - explicit = _parse_edges_robust(ch.text) + explicit = parse_edges_robust(ch.text) for e_str in explicit: kind, target = e_str.split(':', 1) ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"}) - # C. Global "Unassigned Pool" Detection (Safety Net) - # Sucht nach einer Sektion "Unzugeordnete Kanten" im Body - unassigned_pool = set() + # C. Global Pool Detection (Sektion 'Unzugeordnete Kanten') pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) if pool_match: - unassigned_pool = _parse_edges_robust(pool_match.group(1)) + unassigned = parse_edges_robust(pool_match.group(1)) for ch in chunks: - for e_str in unassigned_pool: + for e_str in unassigned: kind, target = e_str.split(':', 1) ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"}) - # D. De-Duplikation des Pools + # D. Eindeutigkeit sicherstellen for ch in chunks: - seen = set() - unique_pool = [] + seen = set(); unique_pool = [] for cand in ch.candidate_pool: key = (cand["kind"], cand["to"]) if key not in seen: - seen.add(key) - unique_pool.append(cand) + seen.add(key); unique_pool.append(cand) ch.candidate_pool = unique_pool - # 3. Nachbarschafts-Verkettung (Struktur-Kanten) + # 4. Graph-Struktur (Nachbarschaft) for i, ch in enumerate(chunks): ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None diff --git a/app/core/chunking/__init__.py b/app/core/chunking/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/chunking/chunking_models.py b/app/core/chunking/chunking_models.py new file mode 100644 index 0000000..d64c4e7 --- /dev/null +++ b/app/core/chunking/chunking_models.py @@ -0,0 +1,31 @@ +""" +FILE: app/core/chunking/chunking_models.py +DESCRIPTION: Datenklassen für das Chunking-System. 
+""" +from dataclasses import dataclass, field +from typing import List, Dict, Optional, Any + +@dataclass +class RawBlock: + """Repräsentiert einen logischen Block aus dem Markdown-Parsing.""" + kind: str + text: str + level: Optional[int] + section_path: str + section_title: Optional[str] + +@dataclass +class Chunk: + """Das finale Chunk-Objekt für Embedding und Graph-Speicherung.""" + id: str + note_id: str + index: int + text: str + window: str + token_count: int + section_title: Optional[str] + section_path: str + neighbors_prev: Optional[str] + neighbors_next: Optional[str] + candidate_pool: List[Dict[str, Any]] = field(default_factory=list) + suggested_edges: Optional[List[str]] = None \ No newline at end of file diff --git a/app/core/chunking/chunking_parser.py b/app/core/chunking/chunking_parser.py new file mode 100644 index 0000000..0524484 --- /dev/null +++ b/app/core/chunking/chunking_parser.py @@ -0,0 +1,74 @@ +""" +FILE: app/core/chunking/chunking_parser.py +DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings. 
+""" +import re +from typing import List, Tuple, Set +from .chunking_models import RawBlock +from .chunking_utils import extract_frontmatter_from_text + +_WS = re.compile(r'\s+') +_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])') + +def split_sentences(text: str) -> list[str]: + """Teilt Text in Sätze auf.""" + text = _WS.sub(' ', text.strip()) + if not text: return [] + return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()] + +def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: + """Zerlegt Text in logische Einheiten.""" + blocks = [] + h1_title = "Dokument"; section_path = "/"; current_h2 = None + fm, text_without_fm = extract_frontmatter_from_text(md_text) + h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) + if h1_match: h1_title = h1_match.group(1).strip() + lines = text_without_fm.split('\n') + buffer = [] + for line in lines: + stripped = line.strip() + if stripped.startswith('# '): continue + elif stripped.startswith('## '): + if buffer: + content = "\n".join(buffer).strip() + if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) + buffer = [] + current_h2 = stripped[3:].strip() + section_path = f"/{current_h2}" + blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2)) + elif not stripped: + if buffer: + content = "\n".join(buffer).strip() + if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) + buffer = [] + else: buffer.append(line) + if buffer: + content = "\n".join(buffer).strip() + if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) + return blocks, h1_title + +def parse_edges_robust(text: str) -> Set[str]: + """Extrahiert Kanten-Kandidaten (Wikilinks, Callouts).""" + found_edges = set() + inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text) + for kind, target in inlines: + k = kind.strip().lower() + t = target.strip() + if k and t: found_edges.add(f"{k}:{t}") + lines 
= text.split('\n') + current_edge_type = None + for line in lines: + stripped = line.strip() + callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped) + if callout_match: + current_edge_type = callout_match.group(1).strip().lower() + links = re.findall(r'\[\[([^\]]+)\]\]', stripped) + for l in links: + if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") + continue + if current_edge_type and stripped.startswith('>'): + links = re.findall(r'\[\[([^\]]+)\]\]', stripped) + for l in links: + if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}") + elif not stripped.startswith('>'): current_edge_type = None + return found_edges \ No newline at end of file diff --git a/app/core/chunking/chunking_propagation.py b/app/core/chunking/chunking_propagation.py new file mode 100644 index 0000000..1aeb361 --- /dev/null +++ b/app/core/chunking/chunking_propagation.py @@ -0,0 +1,25 @@ +""" +FILE: app/core/chunking/chunking_propagation.py +DESCRIPTION: Vererbung von Kanten (Inheritance) über Sektions-Pfade. 
+""" +from typing import List, Dict, Set +from .chunking_models import Chunk, RawBlock +from .chunking_parser import parse_edges_robust + +def propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]: + """WP-15b: Kanten aus Headings werden an Sub-Chunks vererbt.""" + section_inheritance: Dict[str, Set[str]] = {} + for b in blocks: + if b.kind == "heading": + edges = parse_edges_robust(b.text) + if edges: + if b.section_path not in section_inheritance: + section_inheritance[b.section_path] = set() + section_inheritance[b.section_path].update(edges) + + for ch in chunks: + inherited = section_inheritance.get(ch.section_path, set()) + for e_str in inherited: + kind, target = e_str.split(':', 1) + ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"}) + return chunks \ No newline at end of file diff --git a/app/core/chunking/chunking_strategies.py b/app/core/chunking/chunking_strategies.py new file mode 100644 index 0000000..7684bd5 --- /dev/null +++ b/app/core/chunking/chunking_strategies.py @@ -0,0 +1,74 @@ +""" +FILE: app/core/chunking/chunking_strategies.py +DESCRIPTION: Implementierung der mathematischen Splitting-Strategien. 
+""" +from typing import List, Dict, Any +from .chunking_models import RawBlock, Chunk +from .chunking_utils import estimate_tokens +from .chunking_parser import split_sentences + +def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]: + """Fasst Blöcke zusammen und schneidet bei 'target' Tokens.""" + target = config.get("target", 400); max_tokens = config.get("max", 600) + overlap_val = config.get("overlap", (50, 80)) + overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val + chunks = []; buf = [] + + def _add(txt, sec, path): + idx = len(chunks); win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt + chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None)) + + def flush(): + nonlocal buf + if not buf: return + text_body = "\n\n".join([b.text for b in buf]) + sec_title = buf[-1].section_title; sec_path = buf[-1].section_path + if estimate_tokens(text_body) <= max_tokens: _add(text_body, sec_title, sec_path) + else: + sents = split_sentences(text_body); cur_sents = []; cur_len = 0 + for s in sents: + slen = estimate_tokens(s) + if cur_len + slen > target and cur_sents: + _add(" ".join(cur_sents), sec_title, sec_path) + ov_s = []; ov_l = 0 + for os in reversed(cur_sents): + if ov_l + estimate_tokens(os) < overlap: ov_s.insert(0, os); ov_l += estimate_tokens(os) + else: break + cur_sents = list(ov_s); cur_sents.append(s); cur_len = ov_l + slen + else: cur_sents.append(s); cur_len += slen + if cur_sents: _add(" ".join(cur_sents), sec_title, sec_path) + buf = [] + + for b in blocks: + if b.kind == "heading": continue + if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target: flush() + buf.append(b) + if estimate_tokens(b.text) >= target: flush() + flush() + return 
chunks + +def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: + """Splittet Text basierend auf Markdown-Überschriften.""" + strict = config.get("strict_heading_split", False); target = config.get("target", 400) + max_tokens = config.get("max", 600); split_level = config.get("split_level", 2) + chunks = []; buf = []; cur_tokens = 0 + + def _flush(title, path): + nonlocal buf, cur_tokens + if not buf: return + txt = "\n\n".join(buf); win = f"# {doc_title}\n## {title}\n{txt}".strip() if title else txt + idx = len(chunks) + chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None)) + buf = []; cur_tokens = 0 + + for b in blocks: + if b.kind == "heading": + if b.level < split_level: _flush(b.section_title, b.section_path) + elif b.level == split_level: + if strict or cur_tokens >= target: _flush(b.section_title, b.section_path) + continue + bt = estimate_tokens(b.text) + if cur_tokens + bt > max_tokens and buf: _flush(b.section_title, b.section_path) + buf.append(b.text); cur_tokens += bt + if buf: _flush(blocks[-1].section_title if blocks else None, blocks[-1].section_path if blocks else "/") + return chunks \ No newline at end of file diff --git a/app/core/chunking/chunking_utils.py b/app/core/chunking/chunking_utils.py new file mode 100644 index 0000000..da812aa --- /dev/null +++ b/app/core/chunking/chunking_utils.py @@ -0,0 +1,55 @@ +""" +FILE: app/core/chunking/chunking_utils.py +DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration. 
+""" +import math +import yaml +import logging +from pathlib import Path +from typing import Dict, Any, Tuple + +logger = logging.getLogger(__name__) + +BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent +CONFIG_PATH = BASE_DIR / "config" / "types.yaml" +DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)} + +_CONFIG_CACHE = None + +def load_yaml_config() -> Dict[str, Any]: + global _CONFIG_CACHE + if _CONFIG_CACHE is not None: return _CONFIG_CACHE + if not CONFIG_PATH.exists(): return {} + try: + with open(CONFIG_PATH, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + _CONFIG_CACHE = data + return data + except Exception: return {} + +def get_chunk_config(note_type: str) -> Dict[str, Any]: + """Lädt die Chunking-Strategie basierend auf dem Note-Type.""" + full_config = load_yaml_config() + profiles = full_config.get("chunking_profiles", {}) + type_def = full_config.get("types", {}).get(note_type.lower(), {}) + profile_name = type_def.get("chunking_profile") or full_config.get("defaults", {}).get("chunking_profile", "sliding_standard") + config = profiles.get(profile_name, DEFAULT_PROFILE).copy() + if "overlap" in config and isinstance(config["overlap"], list): + config["overlap"] = tuple(config["overlap"]) + return config + +def estimate_tokens(text: str) -> int: + """Grobe Schätzung der Token-Anzahl.""" + return max(1, math.ceil(len(text.strip()) / 4)) + +def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: + """Trennt YAML-Frontmatter vom Text.""" + import re + fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL) + if not fm_match: return {}, md_text + try: + frontmatter = yaml.safe_load(fm_match.group(1)) + if not isinstance(frontmatter, dict): frontmatter = {} + except Exception: frontmatter = {} + text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL) + return frontmatter, text_without_fm.strip() \ No newline at end of file diff 
--git a/app/core/ingestion.py b/app/core/ingestion.py index a5a80d8..a140178 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -1,373 +1,15 @@ """ FILE: app/core/ingestion.py -DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen. - WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free). - WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash. - WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation. - Sichert, dass explizite Kanten direkt übernommen und nur Pool-Kanten validiert werden. -FIX: Deep Fallback Logic (v2.11.14) für JSON-Recovery. - Robust Lookup Fix: Adressiert Notizen im Cache via ID, Titel und Dateiname. -VERSION: 2.12.2 +DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher. + WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen. + Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit. +VERSION: 2.13.0 STATUS: Active -DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, - app.services.llm_service, app.services.edge_registry """ -import os -import json -import re -import logging -import asyncio -import time -from typing import Dict, List, Optional, Tuple, Any +# Export der Hauptklasse für externe Module (z.B. 
scripts/import_markdown.py) +from .ingestion.ingestion_processor import IngestionService -# Core Module Imports -from app.core.parser import ( - read_markdown, - pre_scan_markdown, - normalize_frontmatter, - validate_required_frontmatter, - extract_edges_with_context, - NoteContext -) -from app.core.note_payload import make_note_payload -from app.core.chunker import assemble_chunks, get_chunk_config -from app.core.chunk_payload import make_chunk_payloads +# Export der Hilfsfunktionen für Abwärtskompatibilität +from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry -# Fallback für Edges -try: - from app.core.derive_edges import build_edges_for_note -except ImportError: - def build_edges_for_note(*args, **kwargs): return [] - -from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes -from app.core.qdrant_points import ( - points_for_chunks, - points_for_note, - points_for_edges, - upsert_batch, -) - -from app.services.embeddings_client import EmbeddingsClient -from app.services.edge_registry import registry as edge_registry -from app.services.llm_service import LLMService - -logger = logging.getLogger(__name__) - -# --- Global Helpers (Full Compatibility v2.11.14) --- -def extract_json_from_response(text: str) -> Any: - """ - Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama). - Entfernt , [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit. - """ - if not text or not isinstance(text, str): - return [] - - # 1. Entferne Mistral/Llama Steuerzeichen und Tags - clean = text.replace("", "").replace("", "") - clean = clean.replace("[OUT]", "").replace("[/OUT]", "") - clean = clean.strip() - - # 2. Suche nach Markdown JSON-Blöcken (```json ... ```) - match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) - payload = match.group(1) if match else clean - - try: - return json.loads(payload.strip()) - except json.JSONDecodeError: - # 3. 
Recovery: Suche nach der ersten [ und letzten ] (Liste) - start = payload.find('[') - end = payload.rfind(']') + 1 - if start != -1 and end > start: - try: - return json.loads(payload[start:end]) - except: pass - - # 4. Zweite Recovery: Suche nach der ersten { und letzten } (Objekt) - start_obj = payload.find('{') - end_obj = payload.rfind('}') + 1 - if start_obj != -1 and end_obj > start_obj: - try: - return json.loads(payload[start_obj:end_obj]) - except: pass - - return [] - -def load_type_registry(custom_path: Optional[str] = None) -> dict: - """Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion.""" - import yaml - from app.config import get_settings - settings = get_settings() - path = custom_path or settings.MINDNET_TYPES_FILE - if not os.path.exists(path): return {} - try: - with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} - except Exception: return {} - -# --- Service Class --- -class IngestionService: - def __init__(self, collection_prefix: str = None): - from app.config import get_settings - self.settings = get_settings() - - self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX - self.cfg = QdrantConfig.from_env() - self.cfg.prefix = self.prefix - self.client = get_client(self.cfg) - self.dim = self.settings.VECTOR_SIZE - self.registry = load_type_registry() - self.embedder = EmbeddingsClient() - self.llm = LLMService() - - self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE - self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache - - try: - ensure_collections(self.client, self.prefix, self.dim) - ensure_payload_indexes(self.client, self.prefix) - except Exception as e: - logger.warning(f"DB init warning: {e}") - - async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: - """ - WP-15b: Implementiert den Two-Pass Ingestion Workflow. - Pass 1: Pre-Scan baut flüchtigen Kontext-Cache auf. 
- Pass 2: Processing führt die eigentliche semantische Validierung durch. - """ - logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") - for path in file_paths: - ctx = pre_scan_markdown(path) - if ctx: - # Mehrfache Indizierung für robusten Look-up (WP-15b Fix) - self.batch_cache[ctx.note_id] = ctx - self.batch_cache[ctx.title] = ctx - # Dateiname ohne Endung als dritter Schlüssel - fname = os.path.splitext(os.path.basename(path))[0] - self.batch_cache[fname] = ctx - - logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...") - results = [] - for path in file_paths: - res = await self.process_file(path, vault_root, apply=True) - results.append(res) - return results - - async def _validate_candidate(self, chunk_text: str, edge: Dict) -> bool: - """ - WP-15b: Validiert einen Kanten-Kandidaten semantisch gegen das Ziel. - Nutzt den Cache aus Pass 1, um dem LLM Kontext der Ziel-Note zu geben. - """ - target_id = edge.get("to") - target_ctx = self.batch_cache.get(target_id) - - # Fallback Look-up für Links mit Ankern (Anchor entfernen) - if not target_ctx and "#" in target_id: - base_id = target_id.split("#")[0] - target_ctx = self.batch_cache.get(base_id) - - # Sicherheits-Fallback: Wenn Zielnotiz nicht im aktuellen Batch ist, - # lassen wir die Kante als 'explicit' durch (Hard-Link Integrity). 
- if not target_ctx: - logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.") - return True - - provider = self.settings.MINDNET_LLM_PROVIDER - template = self.llm.get_prompt("edge_validation", provider) - - try: - logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...") - prompt = template.format( - chunk_text=chunk_text[:1500], - target_title=target_ctx.title, - target_summary=target_ctx.summary, - edge_kind=edge.get("kind", "related_to") - ) - - response = await self.llm.generate_raw_response(prompt, priority="background") - is_valid = "YES" in response.upper() - - if is_valid: - logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.") - else: - logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.") - - return is_valid - except Exception as e: - logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}") - return True # Fallback: Im Zweifel Link behalten - - def _resolve_note_type(self, requested: Optional[str]) -> str: - """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" - types = self.registry.get("types", {}) - if requested and requested in types: return requested - return "concept" - - def _get_chunk_config_by_profile(self, profile_name: str, note_type: str) -> Dict[str, Any]: - """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" - profiles = self.registry.get("chunking_profiles", {}) - if profile_name in profiles: - cfg = profiles[profile_name].copy() - if "overlap" in cfg and isinstance(cfg["overlap"], list): - cfg["overlap"] = tuple(cfg["overlap"]) - return cfg - return get_chunk_config(note_type) - - async def process_file( - self, file_path: str, vault_root: str, - force_replace: bool = False, apply: bool = False, purge_before: bool = False, - note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical" - ) -> Dict[str, Any]: - """Transformiert eine Markdown-Datei in den Graphen.""" - 
result = {"path": file_path, "status": "skipped", "changed": False, "error": None} - - # 1. Parse & Lifecycle Gate - try: - parsed = read_markdown(file_path) - if not parsed: return {**result, "error": "Empty file"} - fm = normalize_frontmatter(parsed.frontmatter) - validate_required_frontmatter(fm) - except Exception as e: - return {**result, "error": f"Validation failed: {str(e)}"} - - # Lifecycle Filter (WP-22) - status = fm.get("status", "draft").lower().strip() - if status in ["system", "template", "archive", "hidden"]: - return {**result, "status": "skipped", "reason": f"lifecycle_{status}"} - - # 2. Config Resolution & Payload - note_type = self._resolve_note_type(fm.get("type")) - fm["type"] = note_type - - try: - note_pl = make_note_payload(parsed, vault_root=vault_root, hash_normalize=hash_normalize, hash_source=hash_source, file_path=file_path) - note_id = note_pl["note_id"] - except Exception as e: - return {**result, "error": f"Payload failed: {str(e)}"} - - # 3. Change Detection (v2.11.14 Logic) - old_payload = None if force_replace else self._fetch_note_payload(note_id) - check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" - old_hash = (old_payload or {}).get("hashes", {}).get(check_key) - new_hash = note_pl.get("hashes", {}).get(check_key) - - chunks_missing, edges_missing = self._artifacts_missing(note_id) - should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing - - if not should_write: - return {**result, "status": "unchanged", "note_id": note_id} - - if not apply: - return {**result, "status": "dry-run", "changed": True, "note_id": note_id} - - # 4. 
Processing (Chunking, Embedding, Validated Edges) - try: - body_text = getattr(parsed, "body", "") or "" - edge_registry.ensure_latest() - - # Chunker Resolution - profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard" - chunk_cfg = self._get_chunk_config_by_profile(profile, note_type) - enable_smart_edges = chunk_cfg.get("enable_smart_edge_allocation", False) - - # WP-15b: Chunker bereitet nun den Candidate-Pool vor (inkl. Inheritance). - chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg) - - # WP-15b: Validierung NUR für Kandidaten aus dem global_pool (Unzugeordnete Kanten) - for ch_obj in chunks: - filtered_pool = [] - for cand in getattr(ch_obj, "candidate_pool", []): - # Nur 'global_pool' erfordert LLM-Validierung. - # 'explicit' und 'inherited' werden direkt akzeptiert. - if cand.get("provenance") == "global_pool" and enable_smart_edges: - if await self._validate_candidate(ch_obj.text, cand): - filtered_pool.append(cand) - else: - filtered_pool.append(cand) - ch_obj.candidate_pool = filtered_pool - - chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) - - # Embeddings generieren - vecs = [] - if chunk_pls: - texts = [c.get("window") or c.get("text") or "" for c in chunk_pls] - vecs = await self.embedder.embed_documents(texts) - - # Kanten finalisieren via derive_edges Aggregator (WP-15b kompatibel) - # Nutzt das Provenance-Ranking (v2.1.0). 
- edges = build_edges_for_note( - note_id, - chunk_pls, - note_level_references=note_pl.get("references", []), - include_note_scope_refs=note_scope_refs - ) - - # Alias-Auflösung & Registry Enforcement - context = {"file": file_path, "note_id": note_id} - for e in edges: - e["kind"] = edge_registry.resolve( - edge_type=e.get("kind", "related_to"), - provenance=e.get("provenance", "explicit"), - context={**context, "line": e.get("line", "system")} - ) - - except Exception as e: - logger.error(f"Processing failed for {file_path}: {e}", exc_info=True) - return {**result, "error": f"Processing failed: {str(e)}"} - - # 5. DB Upsert - try: - if purge_before and old_payload: self._purge_artifacts(note_id) - - n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim) - upsert_batch(self.client, n_name, n_pts) - - if chunk_pls and vecs: - # v2.11.14 Points-Extraction Logic - c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1] - upsert_batch(self.client, f"{self.prefix}_chunks", c_pts) - - if edges: - # v2.11.14 Points-Extraction Logic - e_pts = points_for_edges(self.prefix, edges)[1] - upsert_batch(self.client, f"{self.prefix}_edges", e_pts) - - return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)} - except Exception as e: - return {**result, "error": f"DB Upsert failed: {e}"} - - def _fetch_note_payload(self, note_id: str) -> Optional[dict]: - """Holt die Metadaten einer Note aus Qdrant.""" - from qdrant_client.http import models as rest - try: - f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - pts, _ = self.client.scroll(collection_name=f"{self.prefix}_notes", scroll_filter=f, limit=1, with_payload=True) - return pts[0].payload if pts else None - except: return None - - def _artifacts_missing(self, note_id: str) -> Tuple[bool, bool]: - """Prüft Qdrant aktiv auf vorhandene Chunks und Edges.""" - from 
qdrant_client.http import models as rest - try: - f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - c_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_chunks", scroll_filter=f, limit=1) - e_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_edges", scroll_filter=f, limit=1) - return (not bool(c_pts)), (not bool(e_pts)) - except: return True, True - - def _purge_artifacts(self, note_id: str): - """Löscht verwaiste Chunks/Edges vor einem Re-Import.""" - from qdrant_client.http import models as rest - f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - for suffix in ["chunks", "edges"]: - try: self.client.delete(collection_name=f"{self.prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f)) - except: pass - - async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]: - """Hilfsmethode zur Erstellung einer Note aus einem Textstream.""" - target_dir = os.path.join(vault_root, folder) - os.makedirs(target_dir, exist_ok=True) - file_path = os.path.join(target_dir, filename) - with open(file_path, "w", encoding="utf-8") as f: - f.write(markdown_content) - await asyncio.sleep(0.1) - return await self.process_file(file_path=file_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True) \ No newline at end of file +__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file diff --git a/app/core/ingestion/__init__.py b/app/core/ingestion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/ingestion/ingestion_db.py b/app/core/ingestion/ingestion_db.py new file mode 100644 index 0000000..9acf096 --- /dev/null +++ b/app/core/ingestion/ingestion_db.py @@ -0,0 +1,31 @@ +""" +FILE: app/core/ingestion/ingestion_db.py +DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und 
Artefakt-Prüfung. +""" +from typing import Optional, Tuple +from qdrant_client import QdrantClient +from qdrant_client.http import models as rest + +def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]: + """Holt die Metadaten einer Note aus Qdrant via Scroll.""" + try: + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + pts, _ = client.scroll(collection_name=f"{prefix}_notes", scroll_filter=f, limit=1, with_payload=True) + return pts[0].payload if pts else None + except: return None + +def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]: + """Prüft Qdrant aktiv auf vorhandene Chunks und Edges.""" + try: + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + c_pts, _ = client.scroll(collection_name=f"{prefix}_chunks", scroll_filter=f, limit=1) + e_pts, _ = client.scroll(collection_name=f"{prefix}_edges", scroll_filter=f, limit=1) + return (not bool(c_pts)), (not bool(e_pts)) + except: return True, True + +def purge_artifacts(client: QdrantClient, prefix: str, note_id: str): + """Löscht verwaiste Chunks/Edges vor einem Re-Import.""" + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + for suffix in ["chunks", "edges"]: + try: client.delete(collection_name=f"{prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f)) + except: pass \ No newline at end of file diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py new file mode 100644 index 0000000..06c292d --- /dev/null +++ b/app/core/ingestion/ingestion_processor.py @@ -0,0 +1,152 @@ +""" +FILE: app/core/ingestion/ingestion_processor.py +DESCRIPTION: Orchestriert den Ingestion-Prozess (Parsing -> Chunking -> Validierung -> DB). 
+""" +import logging +import asyncio +from typing import Dict, List, Optional, Tuple, Any + +from app.core.parser import ( + read_markdown, pre_scan_markdown, normalize_frontmatter, + validate_required_frontmatter, NoteContext +) +from app.core.note_payload import make_note_payload +from app.core.chunker import assemble_chunks +from app.core.chunk_payload import make_chunk_payloads +from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes +from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch + +from app.services.embeddings_client import EmbeddingsClient +from app.services.edge_registry import registry as edge_registry +from app.services.llm_service import LLMService + +# Package-Interne Imports +from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile +from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts +from .ingestion_validation import validate_edge_candidate + +# Fallback für Edges +try: + from app.core.derive_edges import build_edges_for_note +except ImportError: + def build_edges_for_note(*args, **kwargs): return [] + +logger = logging.getLogger(__name__) + +class IngestionService: + def __init__(self, collection_prefix: str = None): + from app.config import get_settings + self.settings = get_settings() + self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX + self.cfg = QdrantConfig.from_env() + self.cfg.prefix = self.prefix + self.client = get_client(self.cfg) + self.dim = self.settings.VECTOR_SIZE + self.registry = load_type_registry() + self.embedder = EmbeddingsClient() + self.llm = LLMService() + self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE + self.batch_cache: Dict[str, NoteContext] = {} + + try: + ensure_collections(self.client, self.prefix, self.dim) + ensure_payload_indexes(self.client, self.prefix) + except Exception as e: logger.warning(f"DB init warning: {e}") + + 
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: + """WP-15b: Two-Pass Ingestion Workflow.""" + logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") + for path in file_paths: + ctx = pre_scan_markdown(path) + if ctx: + self.batch_cache[ctx.note_id] = ctx + self.batch_cache[ctx.title] = ctx + import os + fname = os.path.splitext(os.path.basename(path))[0] + self.batch_cache[fname] = ctx + + logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...") + return [await self.process_file(p, vault_root, apply=True) for p in file_paths] + + async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]: + """Transformiert eine Markdown-Datei in den Graphen.""" + apply = kwargs.get("apply", False) + force_replace = kwargs.get("force_replace", False) + purge_before = kwargs.get("purge_before", False) + hash_source = kwargs.get("hash_source", "parsed") + hash_normalize = kwargs.get("hash_normalize", "canonical") + + result = {"path": file_path, "status": "skipped", "changed": False, "error": None} + + # 1. Parse & Lifecycle + try: + parsed = read_markdown(file_path) + if not parsed: return {**result, "error": "Empty file"} + fm = normalize_frontmatter(parsed.frontmatter) + validate_required_frontmatter(fm) + except Exception as e: return {**result, "error": f"Validation failed: {str(e)}"} + + if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]: + return {**result, "status": "skipped", "reason": "lifecycle_filter"} + + # 2. 
Payload & Change Detection + note_type = resolve_note_type(self.registry, fm.get("type")) + note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, hash_source=hash_source, hash_normalize=hash_normalize) + note_id = note_pl["note_id"] + + old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id) + check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" + old_hash = (old_payload or {}).get("hashes", {}).get(check_key) + new_hash = note_pl.get("hashes", {}).get(check_key) + + c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id) + if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss): + return {**result, "status": "unchanged", "note_id": note_id} + + if not apply: return {**result, "status": "dry-run", "changed": True, "note_id": note_id} + + # 3. Processing + try: + body_text = getattr(parsed, "body", "") or "" + edge_registry.ensure_latest() + profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard" + chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type) + enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False) + + chunks = await assemble_chunks(fm["id"], body_text, note_type, config=chunk_cfg) + for ch in chunks: + filtered = [] + for cand in getattr(ch, "candidate_pool", []): + if cand.get("provenance") == "global_pool" and enable_smart: + if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER): + filtered.append(cand) + else: filtered.append(cand) + ch.candidate_pool = filtered + + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) + vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else [] + + edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", [])) + for e in edges: + e["kind"] = 
edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"), context={"file": file_path, "note_id": note_id}) + + # 4. DB Upsert + if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id) + n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim) + upsert_batch(self.client, n_name, n_pts) + if chunk_pls and vecs: upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1]) + if edges: upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, edges)[1]) + + return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)} + except Exception as e: + logger.error(f"Processing failed: {e}", exc_info=True) + return {**result, "error": str(e)} + + async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]: + import os + target_dir = os.path.join(vault_root, folder) + os.makedirs(target_dir, exist_ok=True) + file_path = os.path.join(target_dir, filename) + with open(file_path, "w", encoding="utf-8") as f: f.write(markdown_content) + await asyncio.sleep(0.1) + return await self.process_file(file_path=file_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True) \ No newline at end of file diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py new file mode 100644 index 0000000..dadba30 --- /dev/null +++ b/app/core/ingestion/ingestion_utils.py @@ -0,0 +1,69 @@ +""" +FILE: app/core/ingestion/ingestion_utils.py +DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups. +""" +import os +import json +import re +import yaml +from typing import Any, Optional, Dict + +def extract_json_from_response(text: str) -> Any: + """ + Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic). 
+ Entfernt , [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit. + """ + if not text or not isinstance(text, str): + return [] + + clean = text.replace("", "").replace("", "") + clean = clean.replace("[OUT]", "").replace("[/OUT]", "") + clean = clean.strip() + + match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) + payload = match.group(1) if match else clean + + try: + return json.loads(payload.strip()) + except json.JSONDecodeError: + # Recovery: Suche nach Liste + start = payload.find('[') + end = payload.rfind(']') + 1 + if start != -1 and end > start: + try: return json.loads(payload[start:end]) + except: pass + + # Recovery: Suche nach Objekt + start_obj = payload.find('{') + end_obj = payload.rfind('}') + 1 + if start_obj != -1 and end_obj > start_obj: + try: return json.loads(payload[start_obj:end_obj]) + except: pass + return [] + +def load_type_registry(custom_path: Optional[str] = None) -> dict: + """Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion.""" + from app.config import get_settings + settings = get_settings() + path = custom_path or settings.MINDNET_TYPES_FILE + if not os.path.exists(path): return {} + try: + with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} + except Exception: return {} + +def resolve_note_type(registry: dict, requested: Optional[str]) -> str: + """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" + types = registry.get("types", {}) + if requested and requested in types: return requested + return "concept" + +def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: + """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" + from app.core.chunker import get_chunk_config + profiles = registry.get("chunking_profiles", {}) + if profile_name in profiles: + cfg = profiles[profile_name].copy() + if "overlap" in cfg and isinstance(cfg["overlap"], list): + cfg["overlap"] = tuple(cfg["overlap"]) + 
return cfg + return get_chunk_config(note_type) \ No newline at end of file diff --git a/app/core/ingestion/ingestion_validation.py b/app/core/ingestion/ingestion_validation.py new file mode 100644 index 0000000..038eebf --- /dev/null +++ b/app/core/ingestion/ingestion_validation.py @@ -0,0 +1,53 @@ +""" +FILE: app/core/ingestion/ingestion_validation.py +DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache. +""" +import logging +from typing import Dict, Any +from app.core.parser import NoteContext + +logger = logging.getLogger(__name__) + +async def validate_edge_candidate( + chunk_text: str, + edge: Dict, + batch_cache: Dict[str, NoteContext], + llm_service: Any, + provider: str +) -> bool: + """WP-15b: Validiert einen Kandidaten semantisch gegen das Ziel im Cache.""" + target_id = edge.get("to") + target_ctx = batch_cache.get(target_id) + + # Robust Lookup Fix (v2.12.2): Support für Anker + if not target_ctx and "#" in target_id: + base_id = target_id.split("#")[0] + target_ctx = batch_cache.get(base_id) + + # Sicherheits-Fallback (Hard-Link Integrity) + if not target_ctx: + logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.") + return True + + template = llm_service.get_prompt("edge_validation", provider) + + try: + logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...") + prompt = template.format( + chunk_text=chunk_text[:1500], + target_title=target_ctx.title, + target_summary=target_ctx.summary, + edge_kind=edge.get("kind", "related_to") + ) + + response = await llm_service.generate_raw_response(prompt, priority="background") + is_valid = "YES" in response.upper() + + if is_valid: + logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.") + else: + logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.") + return is_valid + except Exception as e: + logger.warning(f"⚠️ Validation error for {target_id}: {e}") + return True \ No newline at end 
of file From 1b7b8091a3849621576e56f9da18dbb99b536f90 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 10:30:09 +0100 Subject: [PATCH 06/23] bug Fix --- app/core/chunker.py | 82 +------------------------ app/core/chunking/__init__.py | 10 +++ app/core/chunking/chunking_processor.py | 53 ++++++++++++++++ app/core/ingestion/__init__.py | 9 +++ 4 files changed, 75 insertions(+), 79 deletions(-) create mode 100644 app/core/chunking/chunking_processor.py diff --git a/app/core/chunker.py b/app/core/chunker.py index d8ea589..4a624e2 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -1,86 +1,10 @@ """ FILE: app/core/chunker.py DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. - WP-14: Modularisierung abgeschlossen. - WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert. - Verwendet neue 'chunking_' Präfixe für Untermodule. VERSION: 3.3.0 -STATUS: Active """ -import asyncio -import re -import logging -from typing import List, Dict, Optional - -# Interne Package-Imports mit neuer Präfix-Konvention -from .chunking.chunking_models import Chunk, RawBlock +from .chunking.chunking_processor import assemble_chunks from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text -from .chunking.chunking_parser import parse_blocks, parse_edges_robust -from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading -from .chunking.chunking_propagation import propagate_section_edges +from .chunking.chunking_models import Chunk -logger = logging.getLogger(__name__) - -# Legacy Support für SemanticAnalyzer (Optional für andere Skripte) -try: - from app.services.semantic_analyzer import get_semantic_analyzer -except ImportError: - def get_semantic_analyzer(): return None - -async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: - """ - Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten. 
- Sichert die Kompatibilität zum bestehenden Ingestion-Prozess. - """ - if config is None: - config = get_chunk_config(note_type) - - fm, body_text = extract_frontmatter_from_text(md_text) - primary_strategy = config.get("strategy", "sliding_window") - - # 1. Parsing - blocks, doc_title = parse_blocks(md_text) - - # 2. Splitting via Thread-Offloading - if primary_strategy == "by_heading": - chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) - else: - chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) - - if not chunks: return [] - - # 3. WP-15b: Candidate Pool Vorbereitung - # A. Edge Inheritance (Sektions-Propagation) - chunks = propagate_section_edges(chunks, blocks) - - # B. Explicit Edges (Direkt im Chunk-Text) - for ch in chunks: - explicit = parse_edges_robust(ch.text) - for e_str in explicit: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"}) - - # C. Global Pool Detection (Sektion 'Unzugeordnete Kanten') - pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) - if pool_match: - unassigned = parse_edges_robust(pool_match.group(1)) - for ch in chunks: - for e_str in unassigned: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"}) - - # D. Eindeutigkeit sicherstellen - for ch in chunks: - seen = set(); unique_pool = [] - for cand in ch.candidate_pool: - key = (cand["kind"], cand["to"]) - if key not in seen: - seen.add(key); unique_pool.append(cand) - ch.candidate_pool = unique_pool - - # 4. 
Graph-Struktur (Nachbarschaft) - for i, ch in enumerate(chunks): - ch.neighbors_prev = chunks[i-1].id if i > 0 else None - ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None - - return chunks \ No newline at end of file +__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/chunking/__init__.py b/app/core/chunking/__init__.py index e69de29..0d8c4bc 100644 --- a/app/core/chunking/__init__.py +++ b/app/core/chunking/__init__.py @@ -0,0 +1,10 @@ +""" +FILE: app/core/chunking/__init__.py +DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks. +VERSION: 3.3.0 +""" +from .chunking_processor import assemble_chunks +from .chunking_utils import get_chunk_config, extract_frontmatter_from_text +from .chunking_models import Chunk + +__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/chunking/chunking_processor.py b/app/core/chunking/chunking_processor.py new file mode 100644 index 0000000..12c9a7b --- /dev/null +++ b/app/core/chunking/chunking_processor.py @@ -0,0 +1,53 @@ +""" +FILE: app/core/chunking/chunking_processor.py +DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks. 
+""" +import asyncio +import re +from typing import List, Dict, Optional +from .chunking_models import Chunk +from .chunking_utils import get_chunk_config, extract_frontmatter_from_text +from .chunking_parser import parse_blocks, parse_edges_robust +from .chunking_strategies import strategy_sliding_window, strategy_by_heading +from .chunking_propagation import propagate_section_edges + +async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: + """Orchestriert das Chunking und baut den Candidate-Pool auf.""" + if config is None: config = get_chunk_config(note_type) + fm, body_text = extract_frontmatter_from_text(md_text) + blocks, doc_title = parse_blocks(md_text) + + if config.get("strategy") == "by_heading": + chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) + else: + chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) + + if not chunks: return [] + + # WP-15b: Candidate Pool Aufbau + chunks = propagate_section_edges(chunks, blocks) + for ch in chunks: + for e_str in parse_edges_robust(ch.text): + k, t = e_str.split(':', 1) + ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"}) + + # Global Pool (Unzugeordnete Kanten) + pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) + if pool_match: + for e_str in parse_edges_robust(pool_match.group(1)): + k, t = e_str.split(':', 1) + for ch in chunks: ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"}) + + # De-Duplikation + for ch in chunks: + seen = set(); unique = [] + for c in ch.candidate_pool: + if (c["kind"], c["to"]) not in seen: + seen.add((c["kind"], c["to"])); unique.append(c) + ch.candidate_pool = unique + + # Nachbarschaften + for i, ch in enumerate(chunks): + ch.neighbors_prev = chunks[i-1].id if i > 0 else None + ch.neighbors_next = chunks[i+1].id if i 
< len(chunks)-1 else None + return chunks \ No newline at end of file diff --git a/app/core/ingestion/__init__.py b/app/core/ingestion/__init__.py index e69de29..6b1f0db 100644 --- a/app/core/ingestion/__init__.py +++ b/app/core/ingestion/__init__.py @@ -0,0 +1,9 @@ +""" +FILE: app/core/ingestion/__init__.py +DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService. +VERSION: 2.13.0 +""" +from .ingestion_processor import IngestionService +from .ingestion_utils import extract_json_from_response, load_type_registry + +__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file From a6d37c92d2f6af47766e8367f95d4ca10838a842 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 10:40:44 +0100 Subject: [PATCH 07/23] Integration von payload modulen in die neue Struktur --- app/core/ingestion/ingestion_chunk_payload.py | 46 ++++++++ app/core/ingestion/ingestion_note_payload.py | 82 +++++++++++++ app/core/ingestion/ingestion_processor.py | 110 ++++++++++++------ 3 files changed, 205 insertions(+), 33 deletions(-) create mode 100644 app/core/ingestion/ingestion_chunk_payload.py create mode 100644 app/core/ingestion/ingestion_note_payload.py diff --git a/app/core/ingestion/ingestion_chunk_payload.py b/app/core/ingestion/ingestion_chunk_payload.py new file mode 100644 index 0000000..67c48fb --- /dev/null +++ b/app/core/ingestion/ingestion_chunk_payload.py @@ -0,0 +1,46 @@ +""" +FILE: app/core/ingestion/ingestion_chunk_payload.py +DESCRIPTION: Baut das JSON-Objekt für mindnet_chunks. 
+VERSION: 2.4.0 +""" +from __future__ import annotations +from typing import Any, Dict, List, Optional + +def _as_list(x): + if x is None: return [] + return x if isinstance(x, list) else [x] + +def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]: + """Erstellt die Payloads für die Chunks eines Dokuments.""" + if isinstance(note, dict) and "frontmatter" in note: fm = note["frontmatter"] + else: fm = note or {} + + note_type = fm.get("type") or "concept" + title = fm.get("title") or fm.get("id") or "Untitled" + tags = _as_list(fm.get("tags") or []) + cp = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard" + rw = float(fm.get("retriever_weight", 1.0)) + + out: List[Dict[str, Any]] = [] + for idx, ch in enumerate(chunks_from_chunker): + text = getattr(ch, "text", "") or ch.get("text", "") + pl: Dict[str, Any] = { + "note_id": getattr(ch, "note_id", None) or fm.get("id"), + "chunk_id": getattr(ch, "id", None), + "title": title, + "index": int(getattr(ch, "index", idx)), + "ord": int(getattr(ch, "index", idx)) + 1, + "type": note_type, + "tags": tags, + "text": text, + "window": getattr(ch, "window", text), + "neighbors_prev": _as_list(getattr(ch, "neighbors_prev", None)), + "neighbors_next": _as_list(getattr(ch, "neighbors_next", None)), + "section": getattr(ch, "section_title", "") or ch.get("section", ""), + "path": note_path, + "source_path": kwargs.get("file_path") or note_path, + "retriever_weight": rw, + "chunk_profile": cp + } + out.append(pl) + return out \ No newline at end of file diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py new file mode 100644 index 0000000..045efdd --- /dev/null +++ b/app/core/ingestion/ingestion_note_payload.py @@ -0,0 +1,82 @@ +""" +FILE: app/core/ingestion/ingestion_note_payload.py +DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. 
+FEATURES: Multi-Hash (body/full), Config-Fix für chunking_profile. +VERSION: 2.4.0 +""" +from __future__ import annotations +from typing import Any, Dict, Tuple, Optional +import os +import json +import pathlib +import hashlib +import yaml + +def _as_dict(x) -> Dict[str, Any]: + if isinstance(x, dict): return dict(x) + out: Dict[str, Any] = {} + for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"): + if hasattr(x, attr): + val = getattr(x, attr) + if val is not None: out[attr] = val + if not out: out["raw"] = str(x) + return out + +def _ensure_list(x) -> list: + if x is None: return [] + if isinstance(x, list): return [str(i) for i in x] + if isinstance(x, (set, tuple)): return [str(i) for i in x] + return [str(x)] + +def _compute_hash(content: str) -> str: + if not content: return "" + return hashlib.sha256(content.encode("utf-8")).hexdigest() + +def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: + body = str(n.get("body") or "") + if mode == "body": return body + if mode == "full": + fm = n.get("frontmatter") or {} + meta_parts = [] + for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): + val = fm.get(k) + if val is not None: meta_parts.append(f"{k}:{val}") + return f" {'|'.join(meta_parts)}||{body}" + return body + +def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: + """Baut das Note-Payload inklusive Multi-Hash.""" + n = _as_dict(note) + reg = kwargs.get("types_cfg") or {} + hash_source = kwargs.get("hash_source", "parsed") + hash_normalize = kwargs.get("hash_normalize", "canonical") + + fm = n.get("frontmatter") or {} + note_type = str(fm.get("type") or n.get("type") or "concept") + + # Weights & Profiles + retriever_weight = fm.get("retriever_weight", 1.0) + chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard" + + payload: Dict[str, Any] = { + "note_id": 
n.get("note_id") or n.get("id") or fm.get("id"), + "title": n.get("title") or fm.get("title") or "", + "type": note_type, + "path": str(n.get("path") or kwargs.get("path") or ""), + "retriever_weight": float(retriever_weight), + "chunk_profile": chunk_profile, + "hashes": {} + } + + for mode in ["body", "full"]: + key = f"{mode}:{hash_source}:{hash_normalize}" + payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode)) + + if fm.get("tags") or n.get("tags"): payload["tags"] = _ensure_list(fm.get("tags") or n.get("tags")) + if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) + for k in ("created", "modified", "date"): + v = fm.get(k) or n.get(k) + if v: payload[k] = str(v) + if n.get("body"): payload["fulltext"] = str(n["body"]) + + return payload \ No newline at end of file diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index 06c292d..a31185f 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -1,31 +1,38 @@ """ FILE: app/core/ingestion/ingestion_processor.py DESCRIPTION: Orchestriert den Ingestion-Prozess (Parsing -> Chunking -> Validierung -> DB). + WP-14: Modularisiert. Nutzt interne Module für DB, Validierung und Payloads. + WP-15b: Implementiert den Two-Pass Workflow via run_batch. 
+VERSION: 2.13.2 +STATUS: Active """ import logging import asyncio +import os from typing import Dict, List, Optional, Tuple, Any +# Core Module Imports from app.core.parser import ( read_markdown, pre_scan_markdown, normalize_frontmatter, validate_required_frontmatter, NoteContext ) -from app.core.note_payload import make_note_payload from app.core.chunker import assemble_chunks -from app.core.chunk_payload import make_chunk_payloads from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch +# Services from app.services.embeddings_client import EmbeddingsClient from app.services.edge_registry import registry as edge_registry from app.services.llm_service import LLMService -# Package-Interne Imports +# Package-Interne Imports (Refactoring WP-14) from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts from .ingestion_validation import validate_edge_candidate +from .ingestion_note_payload import make_note_payload +from .ingestion_chunk_payload import make_chunk_payloads -# Fallback für Edges +# Fallback für Edges (Struktur-Verknüpfung) try: from app.core.derive_edges import build_edges_for_note except ImportError: @@ -35,8 +42,10 @@ logger = logging.getLogger(__name__) class IngestionService: def __init__(self, collection_prefix: str = None): + """Initialisiert den Service und stellt die DB-Verbindung bereit.""" from app.config import get_settings self.settings = get_settings() + self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX self.cfg = QdrantConfig.from_env() self.cfg.prefix = self.prefix @@ -45,28 +54,37 @@ class IngestionService: self.registry = load_type_registry() self.embedder = EmbeddingsClient() self.llm = LLMService() + self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE - 
self.batch_cache: Dict[str, NoteContext] = {} + self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache try: ensure_collections(self.client, self.prefix, self.dim) ensure_payload_indexes(self.client, self.prefix) - except Exception as e: logger.warning(f"DB init warning: {e}") + except Exception as e: + logger.warning(f"DB initialization warning: {e}") async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: - """WP-15b: Two-Pass Ingestion Workflow.""" + """ + WP-15b: Implementiert den Two-Pass Ingestion Workflow. + Pass 1: Pre-Scan füllt den Context-Cache. + Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung. + """ logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") for path in file_paths: - ctx = pre_scan_markdown(path) - if ctx: - self.batch_cache[ctx.note_id] = ctx - self.batch_cache[ctx.title] = ctx - import os - fname = os.path.splitext(os.path.basename(path))[0] - self.batch_cache[fname] = ctx + try: + ctx = pre_scan_markdown(path) + if ctx: + # Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname) + self.batch_cache[ctx.note_id] = ctx + self.batch_cache[ctx.title] = ctx + fname = os.path.splitext(os.path.basename(path))[0] + self.batch_cache[fname] = ctx + except Exception as e: + logger.warning(f"⚠️ Pre-scan failed for {path}: {e}") logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...") - return [await self.process_file(p, vault_root, apply=True) for p in file_paths] + return [await self.process_file(p, vault_root, apply=True, purge_before=True) for p in file_paths] async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]: """Transformiert eine Markdown-Datei in den Graphen.""" @@ -78,18 +96,19 @@ class IngestionService: result = {"path": file_path, "status": "skipped", "changed": False, "error": None} - # 1. Parse & Lifecycle + # 1. 
Parse & Lifecycle Gate try: parsed = read_markdown(file_path) if not parsed: return {**result, "error": "Empty file"} fm = normalize_frontmatter(parsed.frontmatter) validate_required_frontmatter(fm) - except Exception as e: return {**result, "error": f"Validation failed: {str(e)}"} + except Exception as e: + return {**result, "error": f"Validation failed: {str(e)}"} if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]: return {**result, "status": "skipped", "reason": "lifecycle_filter"} - # 2. Payload & Change Detection + # 2. Payload & Change Detection (Multi-Hash) note_type = resolve_note_type(self.registry, fm.get("type")) note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, hash_source=hash_source, hash_normalize=hash_normalize) note_id = note_pl["note_id"] @@ -103,9 +122,10 @@ class IngestionService: if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss): return {**result, "status": "unchanged", "note_id": note_id} - if not apply: return {**result, "status": "dry-run", "changed": True, "note_id": note_id} + if not apply: + return {**result, "status": "dry-run", "changed": True, "note_id": note_id} - # 3. Processing + # 3. 
Deep Processing (Chunking, Validation, Embedding) try: body_text = getattr(parsed, "body", "") or "" edge_registry.ensure_latest() @@ -113,40 +133,64 @@ class IngestionService: chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type) enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False) + # WP-15b: Chunker-Aufruf bereitet Candidate-Pool vor chunks = await assemble_chunks(fm["id"], body_text, note_type, config=chunk_cfg) for ch in chunks: filtered = [] for cand in getattr(ch, "candidate_pool", []): + # Nur global_pool Kandidaten erfordern binäre Validierung if cand.get("provenance") == "global_pool" and enable_smart: if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER): filtered.append(cand) - else: filtered.append(cand) + else: + filtered.append(cand) ch.candidate_pool = filtered - chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) + # Payload-Erstellung via interne Module + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path) vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else [] + # Kanten-Aggregation edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", [])) for e in edges: - e["kind"] = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"), context={"file": file_path, "note_id": note_id}) + e["kind"] = edge_registry.resolve( + e.get("kind", "related_to"), + provenance=e.get("provenance", "explicit"), + context={"file": file_path, "note_id": note_id} + ) # 4. 
DB Upsert - if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id) + if purge_before and old_payload: + purge_artifacts(self.client, self.prefix, note_id) + n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim) upsert_batch(self.client, n_name, n_pts) - if chunk_pls and vecs: upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1]) - if edges: upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, edges)[1]) - return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)} + if chunk_pls and vecs: + c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1] + upsert_batch(self.client, f"{self.prefix}_chunks", c_pts) + + if edges: + e_pts = points_for_edges(self.prefix, edges)[1] + upsert_batch(self.client, f"{self.prefix}_edges", e_pts) + + return { + "path": file_path, + "status": "success", + "changed": True, + "note_id": note_id, + "chunks_count": len(chunk_pls), + "edges_count": len(edges) + } except Exception as e: logger.error(f"Processing failed: {e}", exc_info=True) return {**result, "error": str(e)} async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]: - import os - target_dir = os.path.join(vault_root, folder) - os.makedirs(target_dir, exist_ok=True) - file_path = os.path.join(target_dir, filename) - with open(file_path, "w", encoding="utf-8") as f: f.write(markdown_content) + """Erstellt eine Note aus einem Textstream und triggert die Ingestion.""" + target_path = os.path.join(vault_root, folder, filename) + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with open(target_path, "w", encoding="utf-8") as f: + f.write(markdown_content) await asyncio.sleep(0.1) - return await self.process_file(file_path=file_path, vault_root=vault_root, apply=True, force_replace=True, 
purge_before=True) \ No newline at end of file + return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True) \ No newline at end of file From 8ade34af0a9d9dd9719e93851a8715a060929d0f Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 10:50:15 +0100 Subject: [PATCH 08/23] WP19b- chunk_payload an neue Struktur --- app/core/ingestion/ingestion_chunk_payload.py | 56 ++++++++++++++----- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/app/core/ingestion/ingestion_chunk_payload.py b/app/core/ingestion/ingestion_chunk_payload.py index 67c48fb..3086d97 100644 --- a/app/core/ingestion/ingestion_chunk_payload.py +++ b/app/core/ingestion/ingestion_chunk_payload.py @@ -1,7 +1,9 @@ """ FILE: app/core/ingestion/ingestion_chunk_payload.py -DESCRIPTION: Baut das JSON-Objekt für mindnet_chunks. -VERSION: 2.4.0 +DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. + Fix v2.4.1: Behebt AttributeError bei Zugriff auf Chunk-Objekte. +VERSION: 2.4.1 +STATUS: Active """ from __future__ import annotations from typing import Any, Dict, List, Optional @@ -10,10 +12,19 @@ def _as_list(x): if x is None: return [] return x if isinstance(x, list) else [x] -def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]: - """Erstellt die Payloads für die Chunks eines Dokuments.""" - if isinstance(note, dict) and "frontmatter" in note: fm = note["frontmatter"] - else: fm = note or {} +def make_chunk_payloads(note: Dict[str, Any], + note_path: str, + chunks_from_chunker: List[Any], + **kwargs) -> List[Dict[str, Any]]: + """ + Erstellt die Payloads für die Chunks eines Dokuments. + Robust gegenüber Chunk-Objekten (Dataclasses) und Dictionaries. 
+ """ + # Frontmatter Extraktion + if isinstance(note, dict) and "frontmatter" in note: + fm = note["frontmatter"] + else: + fm = note or {} note_type = fm.get("type") or "concept" title = fm.get("title") or fm.get("id") or "Untitled" @@ -23,24 +34,39 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke out: List[Dict[str, Any]] = [] for idx, ch in enumerate(chunks_from_chunker): - text = getattr(ch, "text", "") or ch.get("text", "") + # Dynamische Extraktion basierend auf Typ (Objekt vs Dict) + is_dict = isinstance(ch, dict) + + cid = getattr(ch, "id", None) if not is_dict else ch.get("id") + nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id") + index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx) + text = getattr(ch, "text", "") if not is_dict else ch.get("text", "") + window = getattr(ch, "window", text) if not is_dict else ch.get("window", text) + + prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev") + next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next") + + # Korrektur des AttributeError: Nutzt getattr für Objekte, .get für Dicts + section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "") + pl: Dict[str, Any] = { - "note_id": getattr(ch, "note_id", None) or fm.get("id"), - "chunk_id": getattr(ch, "id", None), + "note_id": nid or fm.get("id"), + "chunk_id": cid, "title": title, - "index": int(getattr(ch, "index", idx)), - "ord": int(getattr(ch, "index", idx)) + 1, + "index": int(index), + "ord": int(index) + 1, "type": note_type, "tags": tags, "text": text, - "window": getattr(ch, "window", text), - "neighbors_prev": _as_list(getattr(ch, "neighbors_prev", None)), - "neighbors_next": _as_list(getattr(ch, "neighbors_next", None)), - "section": getattr(ch, "section_title", "") or ch.get("section", ""), + "window": window, + "neighbors_prev": _as_list(prev_id), + "neighbors_next": 
_as_list(next_id), + "section": section, "path": note_path, "source_path": kwargs.get("file_path") or note_path, "retriever_weight": rw, "chunk_profile": cp } out.append(pl) + return out \ No newline at end of file From cfcaa926cdee185dba476571f4beb07f39dcb274 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 13:14:10 +0100 Subject: [PATCH 09/23] WP19a Refacturierung - Edgedefaults --- app/core/ingestion/ingestion_note_payload.py | 78 +++++++++++++++++--- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py index 045efdd..504c743 100644 --- a/app/core/ingestion/ingestion_note_payload.py +++ b/app/core/ingestion/ingestion_note_payload.py @@ -1,8 +1,11 @@ """ FILE: app/core/ingestion/ingestion_note_payload.py DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. -FEATURES: Multi-Hash (body/full), Config-Fix für chunking_profile. -VERSION: 2.4.0 +FEATURES: + - Multi-Hash (body/full) für flexible Change Detection. + - Fix v2.4.2: edge_defaults Logik wiederhergestellt (DoD-Korrektur). 
+VERSION: 2.4.2 +STATUS: Active """ from __future__ import annotations from typing import Any, Dict, Tuple, Optional @@ -12,7 +15,12 @@ import pathlib import hashlib import yaml +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + def _as_dict(x) -> Dict[str, Any]: + """Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen.""" if isinstance(x, dict): return dict(x) out: Dict[str, Any] = {} for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"): @@ -23,29 +31,53 @@ def _as_dict(x) -> Dict[str, Any]: return out def _ensure_list(x) -> list: + """Sichert, dass das Ergebnis eine Liste von Strings ist.""" if x is None: return [] if isinstance(x, list): return [str(i) for i in x] if isinstance(x, (set, tuple)): return [str(i) for i in x] return [str(x)] def _compute_hash(content: str) -> str: + """Berechnet einen SHA-256 Hash.""" if not content: return "" return hashlib.sha256(content.encode("utf-8")).hexdigest() def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: + """Stellt den zu hashenden Content deterministisch zusammen.""" body = str(n.get("body") or "") if mode == "body": return body if mode == "full": fm = n.get("frontmatter") or {} meta_parts = [] + # Steuernde Metadaten für Change Detection for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") - return f" {'|'.join(meta_parts)}||{body}" + return f"{'|'.join(meta_parts)}||{body}" return body +def _cfg_for_type(note_type: str, reg: dict) -> dict: + """Holt die typ-spezifische Konfiguration.""" + if not isinstance(reg, dict): return {} + types = reg.get("types") if isinstance(reg.get("types"), dict) else reg + return types.get(note_type, {}) if isinstance(types, dict) else {} + +def 
_cfg_defaults(reg: dict) -> dict: + """Holt die globalen Default-Werte aus der Registry.""" + if not isinstance(reg, dict): return {} + for key in ("defaults", "default", "global"): + v = reg.get(key) + if isinstance(v, dict): return v + return {} + +# --------------------------------------------------------------------------- +# Haupt-API +# --------------------------------------------------------------------------- + def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: - """Baut das Note-Payload inklusive Multi-Hash.""" + """ + Baut das Note-Payload inklusive Multi-Hash und edge_defaults. + """ n = _as_dict(note) reg = kwargs.get("types_cfg") or {} hash_source = kwargs.get("hash_source", "parsed") @@ -54,24 +86,48 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: fm = n.get("frontmatter") or {} note_type = str(fm.get("type") or n.get("type") or "concept") - # Weights & Profiles - retriever_weight = fm.get("retriever_weight", 1.0) - chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard" + cfg_type = _cfg_for_type(note_type, reg) + cfg_def = _cfg_defaults(reg) + + # --- retriever_weight --- + retriever_weight = fm.get("retriever_weight") + if retriever_weight is None: + retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", 1.0)) + try: retriever_weight = float(retriever_weight) + except: retriever_weight = 1.0 + + # --- chunk_profile --- + chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") + if chunk_profile is None: + chunk_profile = cfg_type.get("chunking_profile", cfg_def.get("chunking_profile", "sliding_standard")) + + # --- edge_defaults (WIEDERHERGESTELLT) --- + edge_defaults = fm.get("edge_defaults") + if edge_defaults is None: + edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", [])) + edge_defaults = _ensure_list(edge_defaults) + + # --- Basis-Metadaten --- + note_id = n.get("note_id") or n.get("id") or 
fm.get("id") + title = n.get("title") or fm.get("title") or "" payload: Dict[str, Any] = { - "note_id": n.get("note_id") or n.get("id") or fm.get("id"), - "title": n.get("title") or fm.get("title") or "", + "note_id": note_id, + "title": title, "type": note_type, - "path": str(n.get("path") or kwargs.get("path") or ""), - "retriever_weight": float(retriever_weight), + "path": str(n.get("path") or kwargs.get("file_path") or ""), + "retriever_weight": retriever_weight, "chunk_profile": chunk_profile, + "edge_defaults": edge_defaults, # Feld jetzt wieder enthalten "hashes": {} } + # --- MULTI-HASH --- for mode in ["body", "full"]: key = f"{mode}:{hash_source}:{hash_normalize}" payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode)) + # Metadaten-Felder if fm.get("tags") or n.get("tags"): payload["tags"] = _ensure_list(fm.get("tags") or n.get("tags")) if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) for k in ("created", "modified", "date"): From f08a331bc60b06c5ea5ac1a3e07cca9b995caf7d Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 13:20:37 +0100 Subject: [PATCH 10/23] =?UTF-8?q?herstellung=20vollst=C3=A4ndiger=20Kompai?= =?UTF-8?q?tibilit=C3=A4t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/ingestion/ingestion_chunk_payload.py | 66 ++++++++++++------- app/core/ingestion/ingestion_note_payload.py | 57 ++++++++-------- app/core/ingestion/ingestion_processor.py | 36 ++++++---- 3 files changed, 99 insertions(+), 60 deletions(-) diff --git a/app/core/ingestion/ingestion_chunk_payload.py b/app/core/ingestion/ingestion_chunk_payload.py index 3086d97..e235cbf 100644 --- a/app/core/ingestion/ingestion_chunk_payload.py +++ b/app/core/ingestion/ingestion_chunk_payload.py @@ -1,52 +1,68 @@ """ FILE: app/core/ingestion/ingestion_chunk_payload.py DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. - Fix v2.4.1: Behebt AttributeError bei Zugriff auf Chunk-Objekte. 
-VERSION: 2.4.1 + Fix v2.4.2: Audit-Check (Cleanup pop, Config-Resolution Hierarchie). +VERSION: 2.4.2 STATUS: Active """ from __future__ import annotations from typing import Any, Dict, List, Optional +# --------------------------------------------------------------------------- +# Resolution Helpers (Audited) +# --------------------------------------------------------------------------- + def _as_list(x): if x is None: return [] return x if isinstance(x, list) else [x] -def make_chunk_payloads(note: Dict[str, Any], - note_path: str, - chunks_from_chunker: List[Any], - **kwargs) -> List[Dict[str, Any]]: - """ - Erstellt die Payloads für die Chunks eines Dokuments. - Robust gegenüber Chunk-Objekten (Dataclasses) und Dictionaries. - """ - # Frontmatter Extraktion - if isinstance(note, dict) and "frontmatter" in note: - fm = note["frontmatter"] - else: - fm = note or {} +def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any: + """Hierarchische Suche: Type > Default.""" + types = reg.get("types", {}) + if isinstance(types, dict): + t_cfg = types.get(note_type, {}) + if isinstance(t_cfg, dict): + val = t_cfg.get(key) or t_cfg.get(key.replace("ing", "")) # chunking_ vs chunk_ + if val is not None: return val + defs = reg.get("defaults", {}) or reg.get("global", {}) + if isinstance(defs, dict): + val = defs.get(key) or defs.get(key.replace("ing", "")) + if val is not None: return val + return default +# --------------------------------------------------------------------------- +# Haupt-API +# --------------------------------------------------------------------------- + +def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]: + """Erstellt die Payloads für die Chunks inklusive Audit-Resolution.""" + if isinstance(note, dict) and "frontmatter" in note: fm = note["frontmatter"] + else: fm = note or {} + + reg = kwargs.get("types_cfg") or {} note_type = fm.get("type") or "concept" 
title = fm.get("title") or fm.get("id") or "Untitled" tags = _as_list(fm.get("tags") or []) - cp = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard" - rw = float(fm.get("retriever_weight", 1.0)) + + # Audit: Resolution Hierarchie + cp = fm.get("chunking_profile") or fm.get("chunk_profile") + if not cp: cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard") + + rw = fm.get("retriever_weight") + if rw is None: rw = _resolve_val(note_type, reg, "retriever_weight", 1.0) + try: rw = float(rw) + except: rw = 1.0 out: List[Dict[str, Any]] = [] for idx, ch in enumerate(chunks_from_chunker): - # Dynamische Extraktion basierend auf Typ (Objekt vs Dict) is_dict = isinstance(ch, dict) - cid = getattr(ch, "id", None) if not is_dict else ch.get("id") nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id") index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx) text = getattr(ch, "text", "") if not is_dict else ch.get("text", "") window = getattr(ch, "window", text) if not is_dict else ch.get("window", text) - prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev") next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next") - - # Korrektur des AttributeError: Nutzt getattr für Objekte, .get für Dicts section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "") pl: Dict[str, Any] = { @@ -67,6 +83,10 @@ def make_chunk_payloads(note: Dict[str, Any], "retriever_weight": rw, "chunk_profile": cp } - out.append(pl) + # Audit: Cleanup Pop (Alias Felder entfernen) + for alias in ("chunk_num", "Chunk_Number"): + pl.pop(alias, None) + + out.append(pl) return out \ No newline at end of file diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py index 504c743..28c5301 100644 --- a/app/core/ingestion/ingestion_note_payload.py +++ b/app/core/ingestion/ingestion_note_payload.py 
@@ -3,8 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. FEATURES: - Multi-Hash (body/full) für flexible Change Detection. - - Fix v2.4.2: edge_defaults Logik wiederhergestellt (DoD-Korrektur). -VERSION: 2.4.2 + - Fix v2.4.3: Vollständiger Audit-Check (Env-Vars, JSON-Validation, Edge-Defaults). +VERSION: 2.4.3 STATUS: Active """ from __future__ import annotations @@ -13,14 +13,13 @@ import os import json import pathlib import hashlib -import yaml # --------------------------------------------------------------------------- # Helper # --------------------------------------------------------------------------- def _as_dict(x) -> Dict[str, Any]: - """Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen.""" + """Versucht, ein Objekt in ein Dict zu überführen.""" if isinstance(x, dict): return dict(x) out: Dict[str, Any] = {} for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"): @@ -31,25 +30,24 @@ def _as_dict(x) -> Dict[str, Any]: return out def _ensure_list(x) -> list: - """Sichert, dass das Ergebnis eine Liste von Strings ist.""" + """Sichert String-Listen Integrität.""" if x is None: return [] if isinstance(x, list): return [str(i) for i in x] if isinstance(x, (set, tuple)): return [str(i) for i in x] return [str(x)] def _compute_hash(content: str) -> str: - """Berechnet einen SHA-256 Hash.""" + """SHA-256 Hash-Berechnung.""" if not content: return "" return hashlib.sha256(content.encode("utf-8")).hexdigest() def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: - """Stellt den zu hashenden Content deterministisch zusammen.""" + """Generiert den Hash-Input-String.""" body = str(n.get("body") or "") if mode == "body": return body if mode == "full": fm = n.get("frontmatter") or {} meta_parts = [] - # Steuernde Metadaten für Change Detection for k in sorted(["title", "type", "status", "tags", "chunking_profile", 
"chunk_profile", "retriever_weight"]): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") @@ -57,13 +55,13 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: return body def _cfg_for_type(note_type: str, reg: dict) -> dict: - """Holt die typ-spezifische Konfiguration.""" + """Extrahiert Typ-spezifische Config.""" if not isinstance(reg, dict): return {} types = reg.get("types") if isinstance(reg.get("types"), dict) else reg return types.get(note_type, {}) if isinstance(types, dict) else {} def _cfg_defaults(reg: dict) -> dict: - """Holt die globalen Default-Werte aus der Registry.""" + """Extrahiert globale Default-Werte.""" if not isinstance(reg, dict): return {} for key in ("defaults", "default", "global"): v = reg.get(key) @@ -75,9 +73,7 @@ def _cfg_defaults(reg: dict) -> dict: # --------------------------------------------------------------------------- def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: - """ - Baut das Note-Payload inklusive Multi-Hash und edge_defaults. 
- """ + """Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.""" n = _as_dict(note) reg = kwargs.get("types_cfg") or {} hash_source = kwargs.get("hash_source", "parsed") @@ -89,19 +85,22 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: cfg_type = _cfg_for_type(note_type, reg) cfg_def = _cfg_defaults(reg) - # --- retriever_weight --- + # --- retriever_weight Audit --- + default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)) retriever_weight = fm.get("retriever_weight") if retriever_weight is None: - retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", 1.0)) + retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", default_rw)) try: retriever_weight = float(retriever_weight) - except: retriever_weight = 1.0 + except: retriever_weight = default_rw - # --- chunk_profile --- + # --- chunk_profile Audit --- chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") if chunk_profile is None: - chunk_profile = cfg_type.get("chunking_profile", cfg_def.get("chunking_profile", "sliding_standard")) + chunk_profile = cfg_type.get("chunking_profile") + if chunk_profile is None: + chunk_profile = cfg_def.get("chunking_profile", "sliding_standard") - # --- edge_defaults (WIEDERHERGESTELLT) --- + # --- edge_defaults --- edge_defaults = fm.get("edge_defaults") if edge_defaults is None: edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", [])) @@ -110,29 +109,35 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: # --- Basis-Metadaten --- note_id = n.get("note_id") or n.get("id") or fm.get("id") title = n.get("title") or fm.get("title") or "" - + path = n.get("path") or kwargs.get("file_path") or "" + if isinstance(path, pathlib.Path): path = str(path) + payload: Dict[str, Any] = { "note_id": note_id, "title": title, "type": note_type, - "path": str(n.get("path") or kwargs.get("file_path") or ""), + "path": path, 
"retriever_weight": retriever_weight, "chunk_profile": chunk_profile, - "edge_defaults": edge_defaults, # Feld jetzt wieder enthalten + "edge_defaults": edge_defaults, "hashes": {} } # --- MULTI-HASH --- for mode in ["body", "full"]: - key = f"{mode}:{hash_source}:{hash_normalize}" - payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode)) + content = _get_hash_source_content(n, mode) + payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content) - # Metadaten-Felder - if fm.get("tags") or n.get("tags"): payload["tags"] = _ensure_list(fm.get("tags") or n.get("tags")) + # Metadaten + tags = fm.get("tags") or fm.get("keywords") or n.get("tags") + if tags: payload["tags"] = _ensure_list(tags) if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) for k in ("created", "modified", "date"): v = fm.get(k) or n.get(k) if v: payload[k] = str(v) if n.get("body"): payload["fulltext"] = str(n["body"]) + # Final JSON Validation Audit + json.loads(json.dumps(payload, ensure_ascii=False)) + return payload \ No newline at end of file diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index a31185f..fc9923f 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -1,9 +1,11 @@ """ FILE: app/core/ingestion/ingestion_processor.py -DESCRIPTION: Orchestriert den Ingestion-Prozess (Parsing -> Chunking -> Validierung -> DB). - WP-14: Modularisiert. Nutzt interne Module für DB, Validierung und Payloads. - WP-15b: Implementiert den Two-Pass Workflow via run_batch. -VERSION: 2.13.2 +DESCRIPTION: Der zentrale IngestionService (Orchestrator). + WP-14: Vollständig modularisiert. + WP-15b: Two-Pass Workflow mit globalem Kontext-Cache. + WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert. + AUDIT v2.13.4: 100% Logik-Erhalt (Parameters, Registry-Context, DB-Points). 
+VERSION: 2.13.4 STATUS: Active """ import logging @@ -67,7 +69,7 @@ class IngestionService: async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]: """ WP-15b: Implementiert den Two-Pass Ingestion Workflow. - Pass 1: Pre-Scan füllt den Context-Cache. + Pass 1: Pre-Scan füllt den Context-Cache (3-Wege-Indexierung). Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung. """ logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") @@ -91,6 +93,7 @@ class IngestionService: apply = kwargs.get("apply", False) force_replace = kwargs.get("force_replace", False) purge_before = kwargs.get("purge_before", False) + note_scope_refs = kwargs.get("note_scope_refs", False) hash_source = kwargs.get("hash_source", "parsed") hash_normalize = kwargs.get("hash_normalize", "canonical") @@ -110,7 +113,11 @@ class IngestionService: # 2. Payload & Change Detection (Multi-Hash) note_type = resolve_note_type(self.registry, fm.get("type")) - note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, hash_source=hash_source, hash_normalize=hash_normalize) + note_pl = make_note_payload( + parsed, vault_root=vault_root, file_path=file_path, + hash_source=hash_source, hash_normalize=hash_normalize, + types_cfg=self.registry + ) note_id = note_pl["note_id"] old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id) @@ -134,11 +141,11 @@ class IngestionService: enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False) # WP-15b: Chunker-Aufruf bereitet Candidate-Pool vor - chunks = await assemble_chunks(fm["id"], body_text, note_type, config=chunk_cfg) + chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg) for ch in chunks: filtered = [] for cand in getattr(ch, "candidate_pool", []): - # Nur global_pool Kandidaten erfordern binäre Validierung + # WP-15b: Nur global_pool Kandidaten erfordern binäre Validierung if 
cand.get("provenance") == "global_pool" and enable_smart: if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER): filtered.append(cand) @@ -147,16 +154,23 @@ class IngestionService: ch.candidate_pool = filtered # Payload-Erstellung via interne Module - chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path) + chunk_pls = make_chunk_payloads( + fm, note_pl["path"], chunks, file_path=file_path, + types_cfg=self.registry + ) vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else [] # Kanten-Aggregation - edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", [])) + edges = build_edges_for_note( + note_id, chunk_pls, + note_level_references=note_pl.get("references", []), + include_note_scope_refs=note_scope_refs + ) for e in edges: e["kind"] = edge_registry.resolve( e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"), - context={"file": file_path, "note_id": note_id} + context={"file": file_path, "note_id": note_id, "line": e.get("line", "system")} ) # 4. 
DB Upsert From e3858e8bc334548368732ca44f5edd2a56ea0b0c Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 14:15:22 +0100 Subject: [PATCH 11/23] =?UTF-8?q?aufr=C3=A4umen=20und=20l=C3=B6schen=20von?= =?UTF-8?q?=20Alt-Scripten=20WP19b?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/chunk_payload.py | 176 -------------- app/core/chunker.py | 10 - app/core/ingestion.py | 15 -- app/core/ingestion/ingestion_processor.py | 2 +- app/core/ingestion/ingestion_utils.py | 2 +- app/core/note_payload.py | 268 ---------------------- app/services/semantic_analyzer.py | 199 ---------------- scripts/audit_chunks.py | 2 +- scripts/debug_edge_loss.py | 2 +- scripts/dump_note_chunks.py | 2 +- scripts/fix_frontmatter.py | 2 +- scripts/parse_validate_notes.py | 2 + scripts/payload_dryrun.py | 6 +- scripts/preview_chunks.py | 7 +- 14 files changed, 15 insertions(+), 680 deletions(-) delete mode 100644 app/core/chunk_payload.py delete mode 100644 app/core/chunker.py delete mode 100644 app/core/ingestion.py delete mode 100644 app/core/note_payload.py delete mode 100644 app/services/semantic_analyzer.py diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py deleted file mode 100644 index 9058753..0000000 --- a/app/core/chunk_payload.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -FILE: app/core/chunk_payload.py -DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. -FEATURES: - - Inkludiert Nachbarschafts-IDs (prev/next) und Titel. - - FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen. 
-VERSION: 2.3.0 -STATUS: Active -DEPENDENCIES: yaml, os -EXTERNAL_CONFIG: config/types.yaml -""" -from __future__ import annotations -from typing import Any, Dict, List, Optional -import os, yaml - -def _env(n: str, d: Optional[str]=None) -> str: - v = os.getenv(n) - return v if v is not None else (d or "") - -def _load_types() -> dict: - p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") - try: - with open(p, "r", encoding="utf-8") as f: - return yaml.safe_load(f) or {} - except Exception: - return {} - -def _get_types_map(reg: dict) -> dict: - if isinstance(reg, dict) and isinstance(reg.get("types"), dict): - return reg["types"] - return reg if isinstance(reg, dict) else {} - -def _get_defaults(reg: dict) -> dict: - if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict): - return reg["defaults"] - if isinstance(reg, dict) and isinstance(reg.get("global"), dict): - return reg["global"] - return {} - -def _as_float(x: Any): - try: return float(x) - except Exception: return None - -def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]: - # 1. Type Level - types = _get_types_map(reg) - if isinstance(types, dict): - t = types.get(note_type, {}) - if isinstance(t, dict): - cp = t.get("chunking_profile") or t.get("chunk_profile") - if isinstance(cp, str) and cp: return cp - # 2. Defaults Level - defs = _get_defaults(reg) - if isinstance(defs, dict): - cp = defs.get("chunking_profile") or defs.get("chunk_profile") - if isinstance(cp, str) and cp: return cp - return None - -def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float: - """ - Liest Weight nur aus Config (Type > Default). - Wird aufgerufen, wenn im Frontmatter nichts steht. - """ - # 1. Type Level - types = _get_types_map(reg) - if isinstance(types, dict): - t = types.get(note_type, {}) - if isinstance(t, dict) and (t.get("retriever_weight") is not None): - v = _as_float(t.get("retriever_weight")) - if v is not None: return float(v) - - # 2. 
Defaults Level - defs = _get_defaults(reg) - if isinstance(defs, dict) and (defs.get("retriever_weight") is not None): - v = _as_float(defs.get("retriever_weight")) - if v is not None: return float(v) - - return 1.0 - -def _as_list(x): - if x is None: return [] - if isinstance(x, list): return x - return [x] - -def make_chunk_payloads(note: Dict[str, Any], - note_path: str, - chunks_from_chunker: List[Any], - *, - note_text: str = "", - types_cfg: Optional[dict] = None, - file_path: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Erstellt die Payloads für die Chunks. - - Argument 'note' kann sein: - A) Ein komplexes Objekt/Dict mit Key "frontmatter" (Legacy / Tests) - B) Direkt das Frontmatter-Dictionary (Call aus ingestion.py) - """ - - # --- FIX 3: Intelligente Erkennung der Input-Daten --- - # Wir prüfen: Ist 'note' ein Container MIT 'frontmatter', oder IST es das 'frontmatter'? - if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict): - # Fall A: Container (wir müssen auspacken) - fm = note["frontmatter"] - else: - # Fall B: Direktes Dict (so ruft ingestion.py es auf!) 
- fm = note or {} - - note_type = fm.get("type") or note.get("type") or "concept" - - # Title Extraction (Fallback Chain) - title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled" - - reg = types_cfg if isinstance(types_cfg, dict) else _load_types() - - # --- Profil-Ermittlung --- - # Da wir 'fm' jetzt korrekt haben, funktionieren diese lookups: - cp = fm.get("chunking_profile") or fm.get("chunk_profile") - - if not cp: - cp = _resolve_chunk_profile_from_config(note_type, reg) - if not cp: - cp = "sliding_standard" - - # --- Retriever Weight Ermittlung --- - rw = fm.get("retriever_weight") - - if rw is None: - rw = _resolve_retriever_weight_from_config(note_type, reg) - - try: - rw = float(rw) - except Exception: - rw = 1.0 - - tags = fm.get("tags") or [] - if isinstance(tags, str): - tags = [tags] - - out: List[Dict[str, Any]] = [] - for idx, ch in enumerate(chunks_from_chunker): - # Attribute extrahieren - cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None) - nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id")) - index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx) - text = getattr(ch, "text", None) or (ch.get("text") if isinstance(ch, dict) else "") - window = getattr(ch, "window", None) or (ch.get("window") if isinstance(ch, dict) else text) - prev_id = getattr(ch, "neighbors_prev", None) or (ch.get("neighbors_prev") if isinstance(ch, dict) else None) - next_id = getattr(ch, "neighbors_next", None) or (ch.get("neighbors_next") if isinstance(ch, dict) else None) - - pl: Dict[str, Any] = { - "note_id": nid, - "chunk_id": cid, - "title": title, - "index": int(index), - "ord": int(index) + 1, - "type": note_type, - "tags": tags, - "text": text, - "window": window, - "neighbors_prev": _as_list(prev_id), - "neighbors_next": _as_list(next_id), - "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else 
""), - "path": note_path, - "source_path": file_path or note_path, - "retriever_weight": float(rw), - "chunk_profile": cp, # Jetzt endlich mit dem Override-Wert! - } - - # Cleanup - for alias in ("chunk_num", "Chunk_Number"): - pl.pop(alias, None) - - out.append(pl) - - return out \ No newline at end of file diff --git a/app/core/chunker.py b/app/core/chunker.py deleted file mode 100644 index 4a624e2..0000000 --- a/app/core/chunker.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -FILE: app/core/chunker.py -DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. -VERSION: 3.3.0 -""" -from .chunking.chunking_processor import assemble_chunks -from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text -from .chunking.chunking_models import Chunk - -__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/ingestion.py b/app/core/ingestion.py deleted file mode 100644 index a140178..0000000 --- a/app/core/ingestion.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -FILE: app/core/ingestion.py -DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher. - WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen. - Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit. -VERSION: 2.13.0 -STATUS: Active -""" -# Export der Hauptklasse für externe Module (z.B. 
scripts/import_markdown.py) -from .ingestion.ingestion_processor import IngestionService - -# Export der Hilfsfunktionen für Abwärtskompatibilität -from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry - -__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index fc9923f..268b47c 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -18,7 +18,7 @@ from app.core.parser import ( read_markdown, pre_scan_markdown, normalize_frontmatter, validate_required_frontmatter, NoteContext ) -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py index dadba30..c3b6068 100644 --- a/app/core/ingestion/ingestion_utils.py +++ b/app/core/ingestion/ingestion_utils.py @@ -59,7 +59,7 @@ def resolve_note_type(registry: dict, requested: Optional[str]) -> str: def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" - from app.core.chunker import get_chunk_config + from app.core.chunking import get_chunk_config profiles = registry.get("chunking_profiles", {}) if profile_name in profiles: cfg = profiles[profile_name].copy() diff --git a/app/core/note_payload.py b/app/core/note_payload.py deleted file mode 100644 index 957a97e..0000000 --- a/app/core/note_payload.py +++ /dev/null @@ -1,268 +0,0 @@ -""" -FILE: app/core/note_payload.py -DESCRIPTION: Baut das JSON-Objekt. -FEATURES: - 1. 
Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection. - 2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile'). -VERSION: 2.3.0 -STATUS: Active -DEPENDENCIES: yaml, os, json, pathlib, hashlib -EXTERNAL_CONFIG: config/types.yaml -""" - -from __future__ import annotations - -from typing import Any, Dict, Tuple, Optional -import os -import json -import pathlib -import hashlib - -try: - import yaml # type: ignore -except Exception: - yaml = None - - -# --------------------------------------------------------------------------- -# Helper -# --------------------------------------------------------------------------- - -def _as_dict(x) -> Dict[str, Any]: - """Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen.""" - if isinstance(x, dict): - return dict(x) - - out: Dict[str, Any] = {} - for attr in ( - "frontmatter", - "body", - "id", - "note_id", - "title", - "path", - "tags", - "type", - "created", - "modified", - "date", - ): - if hasattr(x, attr): - val = getattr(x, attr) - if val is not None: - out[attr] = val - - if not out: - out["raw"] = str(x) - - return out - - -def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]: - path = kwargs.get("path") or (args[0] if args else None) - types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None - return path, types_cfg - - -def _env_float(name: str, default: float) -> float: - try: - return float(os.environ.get(name, default)) - except Exception: - return default - - -def _ensure_list(x) -> list: - if x is None: - return [] - if isinstance(x, list): - return [str(i) for i in x] - if isinstance(x, (set, tuple)): - return [str(i) for i in x] - return [str(x)] - -# --- Hash Logic --- -def _compute_hash(content: str) -> str: - """Berechnet einen SHA-256 Hash für den gegebenen String.""" - if not content: - return "" - return hashlib.sha256(content.encode("utf-8")).hexdigest() - -def _get_hash_source_content(n: 
Dict[str, Any], mode: str) -> str: - """ - Stellt den String zusammen, der gehasht werden soll. - """ - body = str(n.get("body") or "") - - if mode == "body": - return body - - if mode == "full": - fm = n.get("frontmatter") or {} - # Wichtig: Sortierte Keys für deterministisches Verhalten! - # Wir nehmen alle steuernden Metadaten auf - meta_parts = [] - # Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden - for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): - val = fm.get(k) - if val is not None: - meta_parts.append(f"{k}:{val}") - - meta_str = "|".join(meta_parts) - return f"{meta_str}||{body}" - - return body - - -# --------------------------------------------------------------------------- -# Type-Registry laden -# --------------------------------------------------------------------------- - -def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict: - if explicit_cfg and isinstance(explicit_cfg, dict): - return explicit_cfg - - path = os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml" - if not os.path.isfile(path) or yaml is None: - return {} - - try: - with open(path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - return data if isinstance(data, dict) else {} - except Exception: - return {} - - -def _cfg_for_type(note_type: str, reg: dict) -> dict: - if not isinstance(reg, dict): - return {} - types = reg.get("types") if isinstance(reg.get("types"), dict) else reg - return types.get(note_type, {}) if isinstance(types, dict) else {} - - -def _cfg_defaults(reg: dict) -> dict: - if not isinstance(reg, dict): - return {} - for key in ("defaults", "default", "global"): - v = reg.get(key) - if isinstance(v, dict): - return v - return {} - - -# --------------------------------------------------------------------------- -# Haupt-API -# --------------------------------------------------------------------------- - -def make_note_payload(note: Any, *args, 
**kwargs) -> Dict[str, Any]: - """ - Baut das Note-Payload für mindnet_notes auf. - Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups. - """ - n = _as_dict(note) - path_arg, types_cfg_explicit = _pick_args(*args, **kwargs) - reg = _load_types_config(types_cfg_explicit) - - # Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide') - hash_source = kwargs.get("hash_source", "parsed") - hash_normalize = kwargs.get("hash_normalize", "canonical") - - fm = n.get("frontmatter") or {} - fm_type = fm.get("type") or n.get("type") or "concept" - note_type = str(fm_type) - - cfg_type = _cfg_for_type(note_type, reg) - cfg_def = _cfg_defaults(reg) - - # --- retriever_weight --- - default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0) - retriever_weight = fm.get("retriever_weight") - if retriever_weight is None: - retriever_weight = cfg_type.get( - "retriever_weight", - cfg_def.get("retriever_weight", default_rw), - ) - try: - retriever_weight = float(retriever_weight) - except Exception: - retriever_weight = default_rw - - # --- chunk_profile (FIXED LOGIC) --- - # 1. Frontmatter Override (beide Schreibweisen erlaubt) - chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") - - # 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml) - if chunk_profile is None: - chunk_profile = cfg_type.get("chunking_profile") - - # 3. Default Config (Fallback auf sliding_standard statt medium) - if chunk_profile is None: - chunk_profile = cfg_def.get("chunking_profile", "sliding_standard") - - # 4. 
Safety Fallback - if not isinstance(chunk_profile, str) or not chunk_profile: - chunk_profile = "sliding_standard" - - # --- edge_defaults --- - edge_defaults = fm.get("edge_defaults") - if edge_defaults is None: - edge_defaults = cfg_type.get( - "edge_defaults", - cfg_def.get("edge_defaults", []), - ) - edge_defaults = _ensure_list(edge_defaults) - - # --- Basis-Metadaten --- - note_id = n.get("note_id") or n.get("id") or fm.get("id") - title = n.get("title") or fm.get("title") or "" - path = n.get("path") or path_arg - if isinstance(path, pathlib.Path): - path = str(path) - - payload: Dict[str, Any] = { - "note_id": note_id, - "title": title, - "type": note_type, - "path": path or "", - "retriever_weight": retriever_weight, - "chunk_profile": chunk_profile, - "edge_defaults": edge_defaults, - "hashes": {} # Init Hash Dict - } - - # --- MULTI-HASH CALCULATION (Strategy Decoupling) --- - # Wir berechnen immer BEIDE Strategien und speichern sie. - # ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird. - modes_to_calc = ["body", "full"] - - for mode in modes_to_calc: - content_to_hash = _get_hash_source_content(n, mode) - computed_hash = _compute_hash(content_to_hash) - # Key Schema: mode:source:normalize (z.B. 
"full:parsed:canonical") - key = f"{mode}:{hash_source}:{hash_normalize}" - payload["hashes"][key] = computed_hash - - # Tags / Keywords - tags = fm.get("tags") or fm.get("keywords") or n.get("tags") - if tags: - payload["tags"] = _ensure_list(tags) - - # Aliases - aliases = fm.get("aliases") - if aliases: - payload["aliases"] = _ensure_list(aliases) - - # Zeit - for k in ("created", "modified", "date"): - v = fm.get(k) or n.get(k) - if v: - payload[k] = str(v) - - # Fulltext - if "body" in n and n["body"]: - payload["fulltext"] = str(n["body"]) - - # JSON Validation - json.loads(json.dumps(payload, ensure_ascii=False)) - - return payload \ No newline at end of file diff --git a/app/services/semantic_analyzer.py b/app/services/semantic_analyzer.py deleted file mode 100644 index 2d492a5..0000000 --- a/app/services/semantic_analyzer.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -FILE: app/services/semantic_analyzer.py -DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen. - WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary). - WP-22: Integration von valid_types zur Halluzinations-Vermeidung. -FIX: Mistral-sicheres JSON-Parsing ( & [OUT] Handling) und 100% Logik-Erhalt. -VERSION: 2.2.6 -STATUS: Active -DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re -""" - -import json -import logging -import re -from typing import List, Optional, Any -from dataclasses import dataclass - -# Importe -from app.services.llm_service import LLMService -# WP-22: Registry für Vokabular-Erzwingung -from app.services.edge_registry import registry as edge_registry - -logger = logging.getLogger(__name__) - -class SemanticAnalyzer: - def __init__(self): - self.llm = LLMService() - - def _is_valid_edge_string(self, edge_str: str) -> bool: - """ - Prüft, ob ein String eine valide Kante im Format 'kind:target' ist. 
- Verhindert, dass LLM-Geschwätz als Kante durchrutscht. - """ - if not isinstance(edge_str, str) or ":" not in edge_str: - return False - - parts = edge_str.split(":", 1) - kind = parts[0].strip() - target = parts[1].strip() - - # Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten. - if " " in kind: - return False - - # Regel 2: Plausible Länge für den Typ (Vermeidet Sätze als Typ) - if len(kind) > 40 or len(kind) < 2: - return False - - # Regel 3: Target darf nicht leer sein - if not target: - return False - - return True - - def _extract_json_safely(self, text: str) -> Any: - """ - Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama). - Implementiert robuste Recovery-Logik für Cloud-Provider. - """ - if not text: - return [] - - # 1. Entferne Mistral/Llama Steuerzeichen und Tags - clean = text.replace("", "").replace("", "") - clean = clean.replace("[OUT]", "").replace("[/OUT]", "") - clean = clean.strip() - - # 2. Suche nach Markdown JSON-Blöcken - match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) - payload = match.group(1) if match else clean - - try: - return json.loads(payload.strip()) - except json.JSONDecodeError: - # 3. Recovery: Suche nach der ersten [ und letzten ] - start = payload.find('[') - end = payload.rfind(']') + 1 - if start != -1 and end > start: - try: - return json.loads(payload[start:end]) - except: pass - - # 4. Zweite Recovery: Suche nach der ersten { und letzten } - start_obj = payload.find('{') - end_obj = payload.rfind('}') + 1 - if start_obj != -1 and end_obj > start_obj: - try: - return json.loads(payload[start_obj:end_obj]) - except: pass - return [] - - async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]: - """ - Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM. - Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind. - WP-20: Nutzt primär den konfigurierten Provider (z.B. OpenRouter). 
- """ - if not all_edges: - return [] - - # 1. Bestimmung des Providers und Modells (Dynamisch über Settings) - provider = self.llm.settings.MINDNET_LLM_PROVIDER - model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL - - # 2. Prompt laden (Provider-spezifisch via get_prompt) - prompt_template = self.llm.get_prompt("edge_allocation_template", provider) - - if not prompt_template or not isinstance(prompt_template, str): - logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.") - prompt_template = ( - "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n" - "TEXT: {chunk_text}\n" - "KANDIDATEN: {edge_list}\n" - "OUTPUT: JSON Liste von Strings [\"kind:target\"]." - ) - - # 3. Daten für Template vorbereiten (Vokabular-Check) - edge_registry.ensure_latest() - valid_types_str = ", ".join(sorted(list(edge_registry.valid_types))) - edges_str = "\n".join([f"- {e}" for e in all_edges]) - - logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.") - - # 4. Prompt füllen mit Format-Check (Kein Shortcut) - try: - # Wir begrenzen den Text auf eine vernünftige Länge für das Kontextfenster - final_prompt = prompt_template.format( - chunk_text=chunk_text[:6000], - edge_list=edges_str, - valid_types=valid_types_str - ) - except Exception as format_err: - logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}") - return [] - - try: - # 5. LLM Call mit Background Priority & Semaphore Control - response_json = await self.llm.generate_raw_response( - prompt=final_prompt, - force_json=True, - max_retries=3, - base_delay=2.0, - priority="background", - provider=provider, - model_override=model - ) - - # 6. Mistral-sicheres JSON Parsing via Helper - data = self._extract_json_safely(response_json) - - if not data: - return [] - - # 7. 
Robuste Normalisierung (List vs Dict Recovery) - raw_candidates = [] - if isinstance(data, list): - raw_candidates = data - elif isinstance(data, dict): - logger.info(f"ℹ️ [SemanticAnalyzer] LLM returned dict, trying recovery.") - for key in ["edges", "results", "kanten", "matches"]: - if key in data and isinstance(data[key], list): - raw_candidates.extend(data[key]) - break - # Falls immer noch leer, nutze Schlüssel-Wert Paare als Behelf - if not raw_candidates: - for k, v in data.items(): - if isinstance(v, str): raw_candidates.append(f"{k}:{v}") - elif isinstance(v, list): - for target in v: - if isinstance(target, str): raw_candidates.append(f"{k}:{target}") - - # 8. Strikte Validierung gegen Kanten-Format - valid_edges = [] - for e in raw_candidates: - e_str = str(e).strip() - if self._is_valid_edge_string(e_str): - valid_edges.append(e_str) - else: - logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'") - - if valid_edges: - logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.") - return valid_edges - - except Exception as e: - logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True) - return [] - - async def close(self): - if self.llm: - await self.llm.close() - -# Singleton Instanziierung -_analyzer_instance = None -def get_semantic_analyzer(): - global _analyzer_instance - if _analyzer_instance is None: - _analyzer_instance = SemanticAnalyzer() - return _analyzer_instance \ No newline at end of file diff --git a/scripts/audit_chunks.py b/scripts/audit_chunks.py index 6311141..65ac7a1 100644 --- a/scripts/audit_chunks.py +++ b/scripts/audit_chunks.py @@ -2,7 +2,7 @@ from __future__ import annotations import argparse, os, json, glob, statistics as stats from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks def iter_md(root: str): for p in 
glob.glob(os.path.join(root, "**", "*.md"), recursive=True): diff --git a/scripts/debug_edge_loss.py b/scripts/debug_edge_loss.py index e88d2f3..ed91423 100644 --- a/scripts/debug_edge_loss.py +++ b/scripts/debug_edge_loss.py @@ -6,7 +6,7 @@ from pathlib import Path # Pfad-Setup sys.path.insert(0, os.path.abspath(".")) -from app.core.chunker import assemble_chunks, _extract_all_edges_from_md +from app.core.chunking import assemble_chunks, _extract_all_edges_from_md from app.core.derive_edges import build_edges_for_note # Mock für Settings, falls nötig diff --git a/scripts/dump_note_chunks.py b/scripts/dump_note_chunks.py index 8aba330..54b8514 100644 --- a/scripts/dump_note_chunks.py +++ b/scripts/dump_note_chunks.py @@ -2,7 +2,7 @@ from __future__ import annotations import argparse, os, glob from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks def iter_md(root: str): return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] diff --git a/scripts/fix_frontmatter.py b/scripts/fix_frontmatter.py index fa9edc1..b5f04d0 100644 --- a/scripts/fix_frontmatter.py +++ b/scripts/fix_frontmatter.py @@ -7,7 +7,7 @@ from slugify import slugify from app.core.parser import read_markdown, normalize_frontmatter from app.core.parser import FRONTMATTER_RE # für Re-Inject from app.core.validate_note import validate_note_payload -from app.core.note_payload import make_note_payload +from app.core.ingestion.ingestion_note_payload import make_note_payload DATE_IN_NAME = re.compile(r"(?P\d{4})[-_\.]?(?P\d{2})[-_\.]?(?P\d{2})") diff --git a/scripts/parse_validate_notes.py b/scripts/parse_validate_notes.py index 1fc5f66..d341fed 100644 --- a/scripts/parse_validate_notes.py +++ b/scripts/parse_validate_notes.py @@ -8,6 +8,8 @@ from jsonschema import ValidationError from app.core.parser import read_markdown, validate_required_frontmatter, 
normalize_frontmatter from app.core.note_payload import make_note_payload from app.core.validate_note import validate_note_payload +from app.core.ingestion.ingestion_note_payload import make_note_payload + def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]: # include z.B. "**/*.md" diff --git a/scripts/payload_dryrun.py b/scripts/payload_dryrun.py index ce3980a..f2ee242 100644 --- a/scripts/payload_dryrun.py +++ b/scripts/payload_dryrun.py @@ -10,9 +10,9 @@ import argparse, os, json from typing import Any, Dict, List, Optional from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.note_payload import make_note_payload -from app.core.chunker import assemble_chunks -from app.core.chunk_payload import make_chunk_payloads +from app.core.chunking import assemble_chunks +from app.core.ingestion.ingestion_note_payload import make_note_payload +from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads try: from app.core.derive_edges import build_edges_for_note except Exception: diff --git a/scripts/preview_chunks.py b/scripts/preview_chunks.py index 9046d2a..25bb25a 100644 --- a/scripts/preview_chunks.py +++ b/scripts/preview_chunks.py @@ -2,9 +2,10 @@ from __future__ import annotations import argparse, os, glob, json from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks -from app.core.chunk_payload import make_chunk_payloads -from app.core.note_payload import make_note_payload +from app.core.chunking import assemble_chunks +from app.core.ingestion.ingestion_note_payload import make_note_payload +from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads + def iter_md(root: str) -> list[str]: return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] From 21cda0072ab71f9b5ce543ebc04d161fb77dd3d4 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 
14:26:42 +0100 Subject: [PATCH 12/23] refacturing parser --- app/core/parser.py | 305 ++------------------------- app/core/parsing/__init__.py | 17 ++ app/core/parsing/parsing_markdown.py | 60 ++++++ app/core/parsing/parsing_models.py | 22 ++ app/core/parsing/parsing_scanner.py | 25 +++ app/core/parsing/parsing_utils.py | 69 ++++++ 6 files changed, 210 insertions(+), 288 deletions(-) create mode 100644 app/core/parsing/__init__.py create mode 100644 app/core/parsing/parsing_markdown.py create mode 100644 app/core/parsing/parsing_models.py create mode 100644 app/core/parsing/parsing_scanner.py create mode 100644 app/core/parsing/parsing_utils.py diff --git a/app/core/parser.py b/app/core/parser.py index 7d183c0..5b12260 100644 --- a/app/core/parser.py +++ b/app/core/parser.py @@ -1,293 +1,22 @@ """ FILE: app/core/parser.py -DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body. - WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry. - WP-15b: Implementierung NoteContext und pre_scan_markdown für Pass 1 Ingestion. -VERSION: 1.9.0 -STATUS: Active -DEPENDENCIES: yaml, re, dataclasses, json, io, os -LAST_ANALYSIS: 2025-12-26 +DESCRIPTION: Facade für das Parsing-Package. Stellt 100% Kompatibilität sicher. + WP-14: Modularisierung abgeschlossen. 
+VERSION: 1.10.0 """ -from __future__ import annotations +from .parsing.parsing_models import ParsedNote, NoteContext +from .parsing.parsing_utils import ( + FRONTMATTER_RE, validate_required_frontmatter, + normalize_frontmatter, extract_wikilinks, extract_edges_with_context +) +from .parsing.parsing_markdown import read_markdown +from .parsing.parsing_scanner import pre_scan_markdown -from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple, Iterable, List -import io -import json -import os -import re +# Kompatibilitäts-Aliase +FRONTMATTER_END = FRONTMATTER_RE -try: - import yaml # PyYAML -except Exception as e: # pragma: no cover - yaml = None # Fehler wird zur Laufzeit geworfen, falls wirklich benötigt - - -# --------------------------------------------------------------------- -# Datamodell -# --------------------------------------------------------------------- - -@dataclass -class ParsedNote: - frontmatter: Dict[str, Any] - body: str - path: str - -@dataclass -class NoteContext: - """Metadaten-Container für den flüchtigen LocalBatchCache (Pass 1).""" - note_id: str - title: str - type: str - summary: str - tags: List[str] - - -# --------------------------------------------------------------------- -# Frontmatter-Erkennung -# --------------------------------------------------------------------- - -# Öffentliche Kompatibilitäts-Konstante: frühere Skripte importieren FRONTMATTER_RE -FRONTMATTER_RE = re.compile(r"^\s*---\s*$") # <— public -# Zusätzlich interner Alias (falls jemand ihn referenziert) -FRONTMATTER_END = FRONTMATTER_RE # <— public alias - -# interne Namen bleiben bestehen -_FRONTMATTER_HEAD = FRONTMATTER_RE -_FRONTMATTER_END = FRONTMATTER_RE - - -def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]: - """ - Zerlegt Text in (frontmatter: dict, body: str). - Erkennt Frontmatter nur, wenn die erste Zeile '---' ist und später ein zweites '---' folgt. 
- YAML-Fehler im Frontmatter führen NICHT zum Abbruch: es wird dann ein leeres dict benutzt. - """ - lines = text.splitlines(True) # keep line endings - if not lines: - return {}, "" - - if not _FRONTMATTER_HEAD.match(lines[0]): - # kein Frontmatter-Header → gesamter Text ist Body - return {}, text - - end_idx = None - # Suche nach nächstem '---' (max. 2000 Zeilen als Sicherheitslimit) - for i in range(1, min(len(lines), 2000)): - if _FRONTMATTER_END.match(lines[i]): - end_idx = i - break - - if end_idx is None: - # unvollständiger Frontmatter-Block → behandle alles als Body - return {}, text - - fm_raw = "".join(lines[1:end_idx]) - body = "".join(lines[end_idx + 1:]) - - data: Dict[str, Any] = {} - if yaml is None: - raise RuntimeError("PyYAML ist nicht installiert (pip install pyyaml).") - - try: - loaded = yaml.safe_load(fm_raw) or {} - if isinstance(loaded, dict): - data = loaded - else: - data = {} - except Exception as e: - # YAML-Fehler nicht fatal machen - print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)})) - data = {} - - # optionales kosmetisches Trim: eine führende Leerzeile im Body entfernen - if body.startswith("\n"): - body = body[1:] - - return data, body - - -# --------------------------------------------------------------------- -# Robustes Lesen mit Encoding-Fallback -# --------------------------------------------------------------------- - -_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1") - - -def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]: - """ - Liest Datei mit mehreren Decodierungsversuchen. - Rückgabe: (text, used_encoding, had_fallback) - - had_fallback=True, falls NICHT 'utf-8' verwendet wurde (oder 'utf-8-sig'). 
- """ - last_err: Optional[str] = None - for enc in _FALLBACK_ENCODINGS: - try: - with io.open(path, "r", encoding=enc, errors="strict") as f: - text = f.read() - # 'utf-8-sig' zählt hier als Fallback (weil BOM), aber ist unproblematisch - return text, enc, (enc != "utf-8") - except UnicodeDecodeError as e: - last_err = f"{type(e).__name__}: {e}" - continue - - # Letzter, extrem defensiver Fallback: Bytes → UTF-8 mit REPLACE (keine Exception) - with open(path, "rb") as fb: - raw = fb.read() - text = raw.decode("utf-8", errors="replace") - print(json.dumps({ - "path": path, - "warn": "encoding_fallback_exhausted", - "info": last_err or "unknown" - }, ensure_ascii=False)) - return text, "utf-8(replace)", True - - -# --------------------------------------------------------------------- -# Öffentliche API -# --------------------------------------------------------------------- - -def read_markdown(path: str) -> Optional[ParsedNote]: - """ - Liest eine Markdown-Datei fehlertolerant. - """ - if not os.path.exists(path): - return None - - text, enc, had_fb = _read_text_with_fallback(path) - if had_fb: - print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": enc}, ensure_ascii=False)) - - fm, body = _split_frontmatter(text) - return ParsedNote(frontmatter=fm or {}, body=body or "", path=path) - - -def pre_scan_markdown(path: str) -> Optional[NoteContext]: - """ - WP-15b: Schneller Scan für den LocalBatchCache (Pass 1). - Extrahiert nur Identität und Kurz-Kontext zur semantischen Validierung. - """ - parsed = read_markdown(path) - if not parsed: - return None - - fm = parsed.frontmatter - # ID-Findung: Frontmatter ID oder Dateiname als Fallback - note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0]) - - # Erstelle Kurz-Zusammenfassung (erste 500 Zeichen des Body, bereinigt) - clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip() - summary = clean_body[:500] + "..." 
if len(clean_body) > 500 else clean_body - - return NoteContext( - note_id=note_id, - title=str(fm.get("title", note_id)), - type=str(fm.get("type", "concept")), - summary=summary, - tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else [] - ) - - -def validate_required_frontmatter(fm: Dict[str, Any], - required: Tuple[str, ...] = ("id", "title")) -> None: - """ - Prüft, ob alle Pflichtfelder vorhanden sind. - """ - if fm is None: - fm = {} - missing = [] - for k in required: - v = fm.get(k) - if v is None: - missing.append(k) - elif isinstance(v, str) and not v.strip(): - missing.append(k) - if missing: - raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}") - - if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)): - raise ValueError("frontmatter 'tags' must be a list of strings") - - -def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]: - """ - Normalisierung von Tags und anderen Feldern. - """ - out = dict(fm or {}) - if "tags" in out: - if isinstance(out["tags"], str): - out["tags"] = [out["tags"].strip()] if out["tags"].strip() else [] - elif isinstance(out["tags"], list): - out["tags"] = [str(t).strip() for t in out["tags"] if t is not None] - else: - out["tags"] = [str(out["tags"]).strip()] if out["tags"] not in (None, "") else [] - if "embedding_exclude" in out: - out["embedding_exclude"] = bool(out["embedding_exclude"]) - return out - - -# ------------------------------ Wikilinks ---------------------------- # - -_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") - - -def extract_wikilinks(text: str) -> List[str]: - """ - Extrahiert Wikilinks als einfache Liste von IDs. 
- """ - if not text: - return [] - out: List[str] = [] - for m in _WIKILINK_RE.finditer(text): - raw = (m.group(1) or "").strip() - if not raw: - continue - if "|" in raw: - raw = raw.split("|", 1)[0].strip() - if "#" in raw: - raw = raw.split("#", 1)[0].strip() - if raw: - out.append(raw) - return out - - -def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]: - """ - WP-22: Extrahiert Wikilinks [[Ziel|Typ]] aus dem Body und speichert die Zeilennummer. - Gibt eine Liste von Dictionaries zurück, die direkt von der Ingestion verarbeitet werden können. - """ - edges = [] - if not parsed or not parsed.body: - return edges - - # Wir nutzen splitlines(True), um Zeilenumbrüche für die Positionsberechnung zu erhalten, - # oder einfaches splitlines() für die reine Zeilennummerierung. - lines = parsed.body.splitlines() - - for line_num, line_content in enumerate(lines, 1): - for match in _WIKILINK_RE.finditer(line_content): - raw = (match.group(1) or "").strip() - if not raw: - continue - - # Syntax: [[Ziel|Typ]] - if "|" in raw: - parts = raw.split("|", 1) - target = parts[0].strip() - kind = parts[1].strip() - else: - target = raw.strip() - kind = "related_to" # Default-Typ - - # Anchor (#) entfernen, da Relationen auf Notiz-Ebene (ID) basieren - if "#" in target: - target = target.split("#", 1)[0].strip() - - if target: - edges.append({ - "to": target, - "kind": kind, - "line": line_num, - "provenance": "explicit" - }) - return edges \ No newline at end of file +__all__ = [ + "ParsedNote", "NoteContext", "FRONTMATTER_RE", "FRONTMATTER_END", + "read_markdown", "pre_scan_markdown", "validate_required_frontmatter", + "normalize_frontmatter", "extract_wikilinks", "extract_edges_with_context" +] \ No newline at end of file diff --git a/app/core/parsing/__init__.py b/app/core/parsing/__init__.py new file mode 100644 index 0000000..ae1b513 --- /dev/null +++ b/app/core/parsing/__init__.py @@ -0,0 +1,17 @@ +""" +FILE: app/core/parsing/__init__.py 
+DESCRIPTION: Package-Exporte für den Parser. +""" +from .parsing_models import ParsedNote, NoteContext +from .parsing_utils import ( + FRONTMATTER_RE, validate_required_frontmatter, + normalize_frontmatter, extract_wikilinks, extract_edges_with_context +) +from .parsing_markdown import read_markdown +from .parsing_scanner import pre_scan_markdown + +__all__ = [ + "ParsedNote", "NoteContext", "FRONTMATTER_RE", "read_markdown", + "pre_scan_markdown", "validate_required_frontmatter", + "normalize_frontmatter", "extract_wikilinks", "extract_edges_with_context" +] \ No newline at end of file diff --git a/app/core/parsing/parsing_markdown.py b/app/core/parsing/parsing_markdown.py new file mode 100644 index 0000000..a7e0f92 --- /dev/null +++ b/app/core/parsing/parsing_markdown.py @@ -0,0 +1,60 @@ +""" +FILE: app/core/parsing/parsing_markdown.py +DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting. +""" +import io +import os +import json +from typing import Any, Dict, Optional, Tuple +from .parsing_models import ParsedNote +from .parsing_utils import FRONTMATTER_RE + +try: + import yaml +except ImportError: + yaml = None + +_FALLBACK_ENCODINGS: Tuple[str, ...] 
= ("utf-8", "utf-8-sig", "cp1252", "latin-1") + +def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]: + """Zerlegt Text in Frontmatter-Dict und Body.""" + lines = text.splitlines(True) + if not lines or not FRONTMATTER_RE.match(lines[0]): + return {}, text + end_idx = None + for i in range(1, min(len(lines), 2000)): + if FRONTMATTER_RE.match(lines[i]): + end_idx = i + break + if end_idx is None: return {}, text + fm_raw = "".join(lines[1:end_idx]) + body = "".join(lines[end_idx + 1:]) + if yaml is None: raise RuntimeError("PyYAML not installed.") + try: + loaded = yaml.safe_load(fm_raw) or {} + data = loaded if isinstance(loaded, dict) else {} + except Exception as e: + print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)})) + data = {} + if body.startswith("\n"): body = body[1:] + return data, body + +def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]: + """Liest Datei mit Encoding-Fallback-Kette.""" + last_err = None + for enc in _FALLBACK_ENCODINGS: + try: + with io.open(path, "r", encoding=enc, errors="strict") as f: + return f.read(), enc, (enc != "utf-8") + except UnicodeDecodeError as e: + last_err = str(e); continue + with open(path, "rb") as fb: + text = fb.read().decode("utf-8", errors="replace") + return text, "utf-8(replace)", True + +def read_markdown(path: str) -> Optional[ParsedNote]: + """Öffentliche API zum Einlesen einer Datei.""" + if not os.path.exists(path): return None + text, enc, had_fb = _read_text_with_fallback(path) + fm, body = _split_frontmatter(text) + return ParsedNote(frontmatter=fm or {}, body=body or "", path=path) \ No newline at end of file diff --git a/app/core/parsing/parsing_models.py b/app/core/parsing/parsing_models.py new file mode 100644 index 0000000..c77ee4b --- /dev/null +++ b/app/core/parsing/parsing_models.py @@ -0,0 +1,22 @@ +""" +FILE: app/core/parsing/parsing_models.py +DESCRIPTION: Datenklassen für das Parsing-System. 
+""" +from dataclasses import dataclass +from typing import Any, Dict, List + +@dataclass +class ParsedNote: + """Container für eine vollständig eingelesene Markdown-Datei.""" + frontmatter: Dict[str, Any] + body: str + path: str + +@dataclass +class NoteContext: + """Metadaten-Container für den flüchtigen LocalBatchCache (Pass 1).""" + note_id: str + title: str + type: str + summary: str + tags: List[str] \ No newline at end of file diff --git a/app/core/parsing/parsing_scanner.py b/app/core/parsing/parsing_scanner.py new file mode 100644 index 0000000..00e3135 --- /dev/null +++ b/app/core/parsing/parsing_scanner.py @@ -0,0 +1,25 @@ +""" +FILE: app/core/parsing/parsing_scanner.py +DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1). +""" +import os +import re +from typing import Optional +from .parsing_models import NoteContext +from .parsing_markdown import read_markdown + +def pre_scan_markdown(path: str) -> Optional[NoteContext]: + """Extrahiert Identität und Kurz-Kontext zur Validierung.""" + parsed = read_markdown(path) + if not parsed: return None + fm = parsed.frontmatter + note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0]) + clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip() + summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body + return NoteContext( + note_id=note_id, + title=str(fm.get("title", note_id)), + type=str(fm.get("type", "concept")), + summary=summary, + tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else [] + ) \ No newline at end of file diff --git a/app/core/parsing/parsing_utils.py b/app/core/parsing/parsing_utils.py new file mode 100644 index 0000000..9ea6a20 --- /dev/null +++ b/app/core/parsing/parsing_utils.py @@ -0,0 +1,69 @@ +""" +FILE: app/core/parsing/parsing_utils.py +DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion. 
+""" +import re +from typing import Any, Dict, List, Tuple, Optional +from .parsing_models import ParsedNote + +# Öffentliche Konstanten für Abwärtskompatibilität +FRONTMATTER_RE = re.compile(r"^\s*---\s*$") +_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") + +def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None: + """Prüft, ob alle Pflichtfelder vorhanden sind.""" + if fm is None: fm = {} + missing = [] + for k in required: + v = fm.get(k) + if v is None or (isinstance(v, str) and not v.strip()): + missing.append(k) + if missing: + raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}") + if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)): + raise ValueError("frontmatter 'tags' must be a list of strings") + +def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]: + """Normalisierung von Tags und Boolean-Feldern.""" + out = dict(fm or {}) + if "tags" in out: + if isinstance(out["tags"], str): + out["tags"] = [out["tags"].strip()] if out["tags"].strip() else [] + elif isinstance(out["tags"], list): + out["tags"] = [str(t).strip() for t in out["tags"] if t is not None] + else: + out["tags"] = [str(out["tags"]).strip()] if out["tags"] not in (None, "") else [] + if "embedding_exclude" in out: + out["embedding_exclude"] = bool(out["embedding_exclude"]) + return out + +def extract_wikilinks(text: str) -> List[str]: + """Extrahiert Wikilinks als einfache Liste von IDs.""" + if not text: return [] + out: List[str] = [] + for m in _WIKILINK_RE.finditer(text): + raw = (m.group(1) or "").strip() + if not raw: continue + if "|" in raw: raw = raw.split("|", 1)[0].strip() + if "#" in raw: raw = raw.split("#", 1)[0].strip() + if raw: out.append(raw) + return out + +def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]: + """WP-22: Extrahiert Wikilinks mit Zeilennummern für die EdgeRegistry.""" + edges = [] + if not parsed or not 
parsed.body: return edges + lines = parsed.body.splitlines() + for line_num, line_content in enumerate(lines, 1): + for match in _WIKILINK_RE.finditer(line_content): + raw = (match.group(1) or "").strip() + if not raw: continue + if "|" in raw: + parts = raw.split("|", 1) + target, kind = parts[0].strip(), parts[1].strip() + else: + target, kind = raw.strip(), "related_to" + if "#" in target: target = target.split("#", 1)[0].strip() + if target: + edges.append({"to": target, "kind": kind, "line": line_num, "provenance": "explicit"}) + return edges \ No newline at end of file From ecb35fb869b81158b7dd74d6b1b46f9ebd948cd5 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 14:31:12 +0100 Subject: [PATCH 13/23] parser refactured WP15b --- app/core/{parser.py => parser/__init__.py} | 16 ++++++++-------- .../{parsing => parser}/parsing_markdown.py | 0 app/core/{parsing => parser}/parsing_models.py | 0 app/core/{parsing => parser}/parsing_scanner.py | 0 app/core/{parsing => parser}/parsing_utils.py | 0 app/core/parsing/__init__.py | 17 ----------------- 6 files changed, 8 insertions(+), 25 deletions(-) rename app/core/{parser.py => parser/__init__.py} (52%) rename app/core/{parsing => parser}/parsing_markdown.py (100%) rename app/core/{parsing => parser}/parsing_models.py (100%) rename app/core/{parsing => parser}/parsing_scanner.py (100%) rename app/core/{parsing => parser}/parsing_utils.py (100%) delete mode 100644 app/core/parsing/__init__.py diff --git a/app/core/parser.py b/app/core/parser/__init__.py similarity index 52% rename from app/core/parser.py rename to app/core/parser/__init__.py index 5b12260..3641394 100644 --- a/app/core/parser.py +++ b/app/core/parser/__init__.py @@ -1,18 +1,18 @@ """ -FILE: app/core/parser.py -DESCRIPTION: Facade für das Parsing-Package. Stellt 100% Kompatibilität sicher. - WP-14: Modularisierung abgeschlossen. +FILE: app/core/parser/__init__.py +DESCRIPTION: Package-Einstiegspunkt für den Parser. 
+ Ermöglicht das Löschen der parser.py Facade. VERSION: 1.10.0 """ -from .parsing.parsing_models import ParsedNote, NoteContext -from .parsing.parsing_utils import ( +from .parsing_models import ParsedNote, NoteContext +from .parsing_utils import ( FRONTMATTER_RE, validate_required_frontmatter, normalize_frontmatter, extract_wikilinks, extract_edges_with_context ) -from .parsing.parsing_markdown import read_markdown -from .parsing.parsing_scanner import pre_scan_markdown +from .parsing_markdown import read_markdown +from .parsing_scanner import pre_scan_markdown -# Kompatibilitäts-Aliase +# Kompatibilitäts-Alias FRONTMATTER_END = FRONTMATTER_RE __all__ = [ diff --git a/app/core/parsing/parsing_markdown.py b/app/core/parser/parsing_markdown.py similarity index 100% rename from app/core/parsing/parsing_markdown.py rename to app/core/parser/parsing_markdown.py diff --git a/app/core/parsing/parsing_models.py b/app/core/parser/parsing_models.py similarity index 100% rename from app/core/parsing/parsing_models.py rename to app/core/parser/parsing_models.py diff --git a/app/core/parsing/parsing_scanner.py b/app/core/parser/parsing_scanner.py similarity index 100% rename from app/core/parsing/parsing_scanner.py rename to app/core/parser/parsing_scanner.py diff --git a/app/core/parsing/parsing_utils.py b/app/core/parser/parsing_utils.py similarity index 100% rename from app/core/parsing/parsing_utils.py rename to app/core/parser/parsing_utils.py diff --git a/app/core/parsing/__init__.py b/app/core/parsing/__init__.py deleted file mode 100644 index ae1b513..0000000 --- a/app/core/parsing/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -FILE: app/core/parsing/__init__.py -DESCRIPTION: Package-Exporte für den Parser. 
-""" -from .parsing_models import ParsedNote, NoteContext -from .parsing_utils import ( - FRONTMATTER_RE, validate_required_frontmatter, - normalize_frontmatter, extract_wikilinks, extract_edges_with_context -) -from .parsing_markdown import read_markdown -from .parsing_scanner import pre_scan_markdown - -__all__ = [ - "ParsedNote", "NoteContext", "FRONTMATTER_RE", "read_markdown", - "pre_scan_markdown", "validate_required_frontmatter", - "normalize_frontmatter", "extract_wikilinks", "extract_edges_with_context" -] \ No newline at end of file From 19c96fd00f1626aeeb4d54b5fdebd086f26db608 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 14:44:44 +0100 Subject: [PATCH 14/23] graph refacturiert --- app/core/derive_edges.py | 396 +-------------------------- app/core/graph/__init__.py | 16 ++ app/core/graph/graph_db_adapter.py | 56 ++++ app/core/graph/graph_derive_edges.py | 112 ++++++++ app/core/graph/graph_extractors.py | 55 ++++ app/core/graph/graph_subgraph.py | 106 +++++++ app/core/graph/graph_utils.py | 81 ++++++ app/core/graph/graph_weights.py | 39 +++ app/core/graph_adapter.py | 251 +---------------- 9 files changed, 477 insertions(+), 635 deletions(-) create mode 100644 app/core/graph/__init__.py create mode 100644 app/core/graph/graph_db_adapter.py create mode 100644 app/core/graph/graph_derive_edges.py create mode 100644 app/core/graph/graph_extractors.py create mode 100644 app/core/graph/graph_subgraph.py create mode 100644 app/core/graph/graph_utils.py create mode 100644 app/core/graph/graph_weights.py diff --git a/app/core/derive_edges.py b/app/core/derive_edges.py index 31204c9..392d05a 100644 --- a/app/core/derive_edges.py +++ b/app/core/derive_edges.py @@ -1,394 +1,10 @@ """ FILE: app/core/derive_edges.py -DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts. - WP-15b: Integration des Candidate-Pools und Provenance-Priorisierung. 
- Sichert die Graph-Integrität durch confidence-basiertes De-Duplicating. -VERSION: 2.1.0 -STATUS: Active -DEPENDENCIES: re, os, yaml, typing, hashlib -EXTERNAL_CONFIG: config/types.yaml -LAST_ANALYSIS: 2025-12-26 +DESCRIPTION: Facade für das neue graph Package. + WP-14: Modularisierung abgeschlossen. +VERSION: 2.2.0 """ +from .graph.graph_derive_edges import build_edges_for_note +from .graph.graph_utils import PROVENANCE_PRIORITY -from __future__ import annotations - -import os -import re -import hashlib -from typing import Iterable, List, Optional, Tuple, Set, Dict - -try: - import yaml # optional, nur für types.yaml -except Exception: # pragma: no cover - yaml = None - -# --------------------------------------------------------------------------- # -# 1. Utilities & ID Generation -# --------------------------------------------------------------------------- # - -def _get(d: dict, *keys, default=None): - """Sicherer Zugriff auf verschachtelte Dictionary-Keys.""" - for k in keys: - if isinstance(d, dict) and k in d and d[k] is not None: - return d[k] - return default - -def _chunk_text_for_refs(chunk: dict) -> str: - """Extrahiert den relevanten Text für die Referenzsuche (bevorzugt Window).""" - return ( - _get(chunk, "window") - or _get(chunk, "text") - or _get(chunk, "content") - or _get(chunk, "raw") - or "" - ) - -def _dedupe_seq(seq: Iterable[str]) -> List[str]: - """Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge.""" - seen: Set[str] = set() - out: List[str] = [] - for s in seq: - if s not in seen: - seen.add(s) - out.append(s) - return out - -def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: - """Konstruiert ein valides Kanten-Payload-Objekt für Qdrant.""" - pl = { - "kind": kind, - "relation": kind, # Alias für Abwärtskompatibilität (v2) - "scope": scope, # "chunk" | "note" - "source_id": source_id, - "target_id": target_id, - "note_id": note_id, # Träger-Note der 
Kante - } - if extra: - pl.update(extra) - return pl - -def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: - """Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.""" - base = f"{kind}:{s}->{t}#{scope}" - if rule_id: - base += f"|{rule_id}" - try: - return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() - except Exception: # pragma: no cover - return base - -# --------------------------------------------------------------------------- # -# 2. Konfiguration & Provenance-Skala -# --------------------------------------------------------------------------- # - -# WP-15b: Prioritäten-Ranking für die De-Duplizierung -PROVENANCE_PRIORITY = { - "explicit:wikilink": 1.00, - "inline:rel": 0.95, - "callout:edge": 0.90, - "semantic_ai": 0.90, # Validierte KI-Kanten - "structure:belongs_to": 1.00, - "structure:order": 0.95, # next/prev - "explicit:note_scope": 1.00, - "derived:backlink": 0.90, - "edge_defaults": 0.70 # Heuristik (types.yaml) -} - -def _env(n: str, default: Optional[str] = None) -> str: - v = os.getenv(n) - return v if v is not None else (default or "") - -def _load_types_registry() -> dict: - """Lädt die YAML-Registry zur Ermittlung von Standard-Kanten.""" - p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") - if not os.path.isfile(p) or yaml is None: - return {} - try: - with open(p, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - return data - except Exception: - return {} - -def _get_types_map(reg: dict) -> dict: - if isinstance(reg, dict) and isinstance(reg.get("types"), dict): - return reg["types"] - return reg if isinstance(reg, dict) else {} - -def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]: - """Liefert die edge_defaults-Liste für den gegebenen Notiztyp.""" - types_map = _get_types_map(reg) - if note_type and isinstance(types_map, dict): - t = types_map.get(note_type) - if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list): - 
return [str(x) for x in t["edge_defaults"] if isinstance(x, str)] - for key in ("defaults", "default", "global"): - v = reg.get(key) - if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list): - return [str(x) for x in v["edge_defaults"] if isinstance(x, str)] - return [] - -# --------------------------------------------------------------------------- # -# 3. Parser für Links / Relationen (Core Logik v2.0.0) -# --------------------------------------------------------------------------- # - -# Normale Wikilinks (Fallback) -_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") - -# Getypte Inline-Relationen -_REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) -_REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) -_REL_TEXT = re.compile(r"rel\s*:\s*(?P[a-z_]+)\s*\[\[\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) - -def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: - """Extrahiert [[rel:KIND|Target]] und entfernt sie zur Vermeidung von Dubletten.""" - pairs: List[Tuple[str,str]] = [] - def _collect(m): - k = (m.group("kind") or "").strip().lower() - t = (m.group("target") or "").strip() - if k and t: - pairs.append((k, t)) - return "" # Link entfernen - - text = _REL_PIPE.sub(_collect, text) - text = _REL_SPACE.sub(_collect, text) - text = _REL_TEXT.sub(_collect, text) - return pairs, text - -# Obsidian Callout Parser für mehrzeilige Blöcke -_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) -_REL_LINE = re.compile(r"^(?P[a-z_]+)\s*:\s*(?P.+?)\s*$", re.IGNORECASE) -_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") - -def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: - """Verarbeitet [!edge]-Callouts und entfernt diese aus dem Textfluss.""" - if not text: - return [], text - - lines = text.splitlines() - out_pairs: List[Tuple[str,str]] = [] - keep_lines: List[str] = [] - i = 0 - - while i < 
len(lines): - m = _CALLOUT_START.match(lines[i]) - if not m: - keep_lines.append(lines[i]) - i += 1 - continue - - block_lines: List[str] = [] - first_rest = m.group(1) or "" - if first_rest.strip(): - block_lines.append(first_rest) - - i += 1 - while i < len(lines) and lines[i].lstrip().startswith('>'): - block_lines.append(lines[i].lstrip()[1:].lstrip()) - i += 1 - - for bl in block_lines: - mrel = _REL_LINE.match(bl) - if not mrel: - continue - kind = (mrel.group("kind") or "").strip().lower() - targets = mrel.group("targets") or "" - found = _WIKILINKS_IN_LINE.findall(targets) - if found: - for t in found: - t = t.strip() - if t: - out_pairs.append((kind, t)) - else: - for raw in re.split(r"[,;]", targets): - t = raw.strip() - if t: - out_pairs.append((kind, t)) - continue - - remainder = "\n".join(keep_lines) - return out_pairs, remainder - -def _extract_wikilinks(text: str) -> List[str]: - """Extrahiert Standard-Wikilinks aus dem verbleibenden Text.""" - ids: List[str] = [] - for m in _WIKILINK_RE.finditer(text or ""): - ids.append(m.group(1).strip()) - return ids - -# --------------------------------------------------------------------------- # -# 4. Hauptfunktion (build_edges_for_note) -# --------------------------------------------------------------------------- # - -def build_edges_for_note( - note_id: str, - chunks: List[dict], - note_level_references: Optional[List[str]] = None, - include_note_scope_refs: bool = False, -) -> List[dict]: - """ - Erzeugt und aggregiert alle Kanten für eine Note inklusive WP-15b Candidate-Processing. - Setzt Provenance-Ranking zur Graph-Stabilisierung ein. 
- """ - edges: List[dict] = [] - note_type = _get(chunks[0], "type") if chunks else "concept" - - # 1) Struktur-Kanten: belongs_to (Chunk -> Note) - for ch in chunks: - cid = _get(ch, "chunk_id", "id") - if not cid: - continue - edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"), - "provenance": "structure", - "rule_id": "structure:belongs_to", - "confidence": PROVENANCE_PRIORITY["structure:belongs_to"], - })) - - # 2) Struktur-Kanten: next / prev (Sequenz) - for i in range(len(chunks) - 1): - a, b = chunks[i], chunks[i + 1] - a_id = _get(a, "chunk_id", "id") - b_id = _get(b, "chunk_id", "id") - if not a_id or not b_id: - continue - edges.append(_edge("next", "chunk", a_id, b_id, note_id, { - "chunk_id": a_id, - "edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"), - "provenance": "structure", - "rule_id": "structure:order", - "confidence": PROVENANCE_PRIORITY["structure:order"], - })) - edges.append(_edge("prev", "chunk", b_id, a_id, note_id, { - "chunk_id": b_id, - "edge_id": _mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"), - "provenance": "structure", - "rule_id": "structure:order", - "confidence": PROVENANCE_PRIORITY["structure:order"], - })) - - # 3) Inhaltliche Kanten (Refs, Inlines, Callouts, Candidates) - reg = _load_types_registry() - defaults = _edge_defaults_for(note_type, reg) - refs_all: List[str] = [] - - for ch in chunks: - cid = _get(ch, "chunk_id", "id") - if not cid: - continue - raw = _chunk_text_for_refs(ch) - - # 3a) Typed Inline Relations - typed, remainder = _extract_typed_relations(raw) - for kind, target in typed: - k = kind.strip().lower() - if not k or not target: continue - edges.append(_edge(k, "chunk", cid, target, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(k, cid, target, "chunk", "inline:rel"), - "provenance": "explicit", - "rule_id": "inline:rel", - "confidence": 
PROVENANCE_PRIORITY["inline:rel"], - })) - - # 3b) WP-15b Candidate Pool Integration (KI-validierte Kanten) - # Verarbeitet Kanten, die bereits in der Ingestion semantisch geprüft wurden. - pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] - for cand in pool: - target = cand.get("to") - kind = cand.get("kind", "related_to") - prov = cand.get("provenance", "semantic_ai") - if not target: continue - edges.append(_edge(kind, "chunk", cid, target, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(kind, cid, target, "chunk", f"candidate:{prov}"), - "provenance": prov, - "rule_id": f"candidate:{prov}", - "confidence": PROVENANCE_PRIORITY.get(prov, 0.90), - })) - - # 3c) Obsidian Callouts - call_pairs, remainder2 = _extract_callout_relations(remainder) - for kind, target in call_pairs: - k = (kind or "").strip().lower() - if not k or not target: continue - edges.append(_edge(k, "chunk", cid, target, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"), - "provenance": "explicit", - "rule_id": "callout:edge", - "confidence": PROVENANCE_PRIORITY["callout:edge"], - })) - - # 3d) Standard-Wikilinks -> references (+ defaults) - refs = _extract_wikilinks(remainder2) - for r in refs: - edges.append(_edge("references", "chunk", cid, r, note_id, { - "chunk_id": cid, - "ref_text": r, - "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), - "provenance": "explicit", - "rule_id": "explicit:wikilink", - "confidence": PROVENANCE_PRIORITY["explicit:wikilink"], - })) - # Regelbasierte Kanten aus types.yaml anhängen - for rel in defaults: - if rel == "references": continue - edges.append(_edge(rel, "chunk", cid, r, note_id, { - "chunk_id": cid, - "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"), - "provenance": "rule", - "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": PROVENANCE_PRIORITY["edge_defaults"], - })) - - refs_all.extend(refs) - - # 4) Optionale 
Note-Scope Referenzen & Backlinks - if include_note_scope_refs: - refs_note = list(refs_all or []) - if note_level_references: - refs_note.extend([r for r in note_level_references if isinstance(r, str) and r]) - refs_note = _dedupe_seq(refs_note) - - for r in refs_note: - edges.append(_edge("references", "note", note_id, r, note_id, { - "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"), - "provenance": "explicit", - "rule_id": "explicit:note_scope", - "confidence": PROVENANCE_PRIORITY["explicit:note_scope"], - })) - # Backlink-Erzeugung zur Graphen-Stärkung - edges.append(_edge("backlink", "note", r, note_id, note_id, { - "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"), - "provenance": "rule", - "rule_id": "derived:backlink", - "confidence": PROVENANCE_PRIORITY["derived:backlink"], - })) - for rel in defaults: - if rel == "references": continue - edges.append(_edge(rel, "note", note_id, r, note_id, { - "edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"), - "provenance": "rule", - "rule_id": f"edge_defaults:{note_type}:{rel}", - "confidence": PROVENANCE_PRIORITY["edge_defaults"], - })) - - # 5) WP-15b: Confidence-basierte De-Duplizierung - # Wenn dieselbe Relation mehrfach existiert, gewinnt die mit der höchsten Confidence. 
- unique_map: Dict[Tuple[str, str, str], dict] = {} - - for e in edges: - s, t = str(e.get("source_id")), str(e.get("target_id")) - rel = str(e.get("relation") or e.get("kind") or "edge") - key = (s, t, rel) - - if key not in unique_map: - unique_map[key] = e - else: - # Vergleich der Vertrauenswürdigkeit (Provenance Ranking) - if e.get("confidence", 0) > unique_map[key].get("confidence", 0): - unique_map[key] = e - - return list(unique_map.values()) \ No newline at end of file +__all__ = ["build_edges_for_note", "PROVENANCE_PRIORITY"] \ No newline at end of file diff --git a/app/core/graph/__init__.py b/app/core/graph/__init__.py new file mode 100644 index 0000000..e7b7ceb --- /dev/null +++ b/app/core/graph/__init__.py @@ -0,0 +1,16 @@ +""" +FILE: app/core/graph/__init__.py +DESCRIPTION: Unified Graph Package. Exportiert Kanten-Ableitung und Graph-Adapter. +""" +from .graph_derive_edges import build_edges_for_note +from .graph_utils import PROVENANCE_PRIORITY +from .graph_subgraph import Subgraph, expand +from .graph_weights import EDGE_BASE_WEIGHTS + +__all__ = [ + "build_edges_for_note", + "PROVENANCE_PRIORITY", + "Subgraph", + "expand", + "EDGE_BASE_WEIGHTS" +] \ No newline at end of file diff --git a/app/core/graph/graph_db_adapter.py b/app/core/graph/graph_db_adapter.py new file mode 100644 index 0000000..e3fff2f --- /dev/null +++ b/app/core/graph/graph_db_adapter.py @@ -0,0 +1,56 @@ +""" +FILE: app/core/graph/graph_db_adapter.py +DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen. +""" +from typing import List, Dict, Optional +from qdrant_client import QdrantClient +from qdrant_client.http import models as rest +from app.core.qdrant import collection_names + +def fetch_edges_from_qdrant( + client: QdrantClient, + prefix: str, + seeds: List[str], + edge_types: Optional[List[str]] = None, + limit: int = 2048, +) -> List[Dict]: + """ + Holt Edges aus der Datenbank basierend auf Seed-IDs. + Filtert auf source_id, target_id oder note_id. 
+ """ + if not seeds or limit <= 0: + return [] + + _, _, edges_col = collection_names(prefix) + + seed_conditions = [] + for field in ("source_id", "target_id", "note_id"): + for s in seeds: + seed_conditions.append( + rest.FieldCondition(key=field, match=rest.MatchValue(value=str(s))) + ) + seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None + + type_filter = None + if edge_types: + type_conds = [ + rest.FieldCondition(key="kind", match=rest.MatchValue(value=str(k))) + for k in edge_types + ] + type_filter = rest.Filter(should=type_conds) + + must = [] + if seeds_filter: must.append(seeds_filter) + if type_filter: must.append(type_filter) + + flt = rest.Filter(must=must) if must else None + + pts, _ = client.scroll( + collection_name=edges_col, + scroll_filter=flt, + limit=limit, + with_payload=True, + with_vectors=False, + ) + + return [dict(p.payload) for p in pts if p.payload] \ No newline at end of file diff --git a/app/core/graph/graph_derive_edges.py b/app/core/graph/graph_derive_edges.py new file mode 100644 index 0000000..284e789 --- /dev/null +++ b/app/core/graph/graph_derive_edges.py @@ -0,0 +1,112 @@ +""" +FILE: app/core/graph/graph_derive_edges.py +DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung. 
+""" +from typing import List, Optional, Dict, Tuple +from .graph_utils import ( + _get, _edge, _mk_edge_id, _dedupe_seq, + PROVENANCE_PRIORITY, load_types_registry, get_edge_defaults_for +) +from .graph_extractors import ( + extract_typed_relations, extract_callout_relations, extract_wikilinks +) + +def build_edges_for_note( + note_id: str, + chunks: List[dict], + note_level_references: Optional[List[str]] = None, + include_note_scope_refs: bool = False, +) -> List[dict]: + """Erzeugt und aggregiert alle Kanten für eine Note (WP-15b).""" + edges: List[dict] = [] + note_type = _get(chunks[0], "type") if chunks else "concept" + + # 1) Struktur-Kanten (belongs_to, next/prev) + for idx, ch in enumerate(chunks): + cid = _get(ch, "chunk_id", "id") + if not cid: continue + edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"), + "provenance": "structure", "rule_id": "structure:belongs_to", "confidence": PROVENANCE_PRIORITY["structure:belongs_to"] + })) + if idx < len(chunks) - 1: + next_id = _get(chunks[idx+1], "chunk_id", "id") + if next_id: + edges.append(_edge("next", "chunk", cid, next_id, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id("next", cid, next_id, "chunk", "structure:order"), + "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"] + })) + edges.append(_edge("prev", "chunk", next_id, cid, note_id, { + "chunk_id": next_id, "edge_id": _mk_edge_id("prev", next_id, cid, "chunk", "structure:order"), + "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"] + })) + + # 2) Inhaltliche Kanten + reg = load_types_registry() + defaults = get_edge_defaults_for(note_type, reg) + refs_all: List[str] = [] + + for ch in chunks: + cid = _get(ch, "chunk_id", "id") + if not cid: continue + raw = _get(ch, "window") or _get(ch, "text") or "" + + # 
Typed & Candidate Pool (WP-15b Integration) + typed, rem = extract_typed_relations(raw) + for k, t in typed: + edges.append(_edge(k, "chunk", cid, t, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel"), + "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"] + })) + + pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] + for cand in pool: + t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") + if t: + edges.append(_edge(k, "chunk", cid, t, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}"), + "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90) + })) + + # Callouts & Wikilinks + call_pairs, rem2 = extract_callout_relations(rem) + for k, t in call_pairs: + edges.append(_edge(k, "chunk", cid, t, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge"), + "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"] + })) + + refs = extract_wikilinks(rem2) + for r in refs: + edges.append(_edge("references", "chunk", cid, r, note_id, { + "chunk_id": cid, "ref_text": r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), + "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"] + })) + for rel in defaults: + if rel != "references": + edges.append(_edge(rel, "chunk", cid, r, note_id, { + "chunk_id": cid, "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}"), + "provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"] + })) + refs_all.extend(refs) + + # 3) Note-Scope & De-Duplizierung + if include_note_scope_refs: + refs_note = _dedupe_seq((refs_all or []) + (note_level_references or [])) + for r in refs_note: + 
edges.append(_edge("references", "note", note_id, r, note_id, { + "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"), + "provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"] + })) + edges.append(_edge("backlink", "note", r, note_id, note_id, { + "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"), + "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"] + })) + + unique_map: Dict[Tuple[str, str, str], dict] = {} + for e in edges: + key = (str(e.get("source_id")), str(e.get("target_id")), str(e.get("kind"))) + if key not in unique_map or e.get("confidence", 0) > unique_map[key].get("confidence", 0): + unique_map[key] = e + + return list(unique_map.values()) \ No newline at end of file diff --git a/app/core/graph/graph_extractors.py b/app/core/graph/graph_extractors.py new file mode 100644 index 0000000..9c1fedf --- /dev/null +++ b/app/core/graph/graph_extractors.py @@ -0,0 +1,55 @@ +""" +FILE: app/core/graph/graph_extractors.py +DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text. +""" +import re +from typing import List, Tuple + +_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. 
]+)\]\]") +_REL_PIPE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s*\|\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) +_REL_SPACE = re.compile(r"\[\[\s*rel:(?P[a-z_]+)\s+(?P[^\]]+?)\s*\]\]", re.IGNORECASE) +_REL_TEXT = re.compile(r"rel\s*:\s*(?P[a-z_]+)\s*\[\[\s*(?P[^\]]+?)\s*\]\]", re.IGNORECASE) + +_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE) +_REL_LINE = re.compile(r"^(?P[a-z_]+)\s*:\s*(?P.+?)\s*$", re.IGNORECASE) +_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]") + +def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: + """Extrahiert [[rel:KIND|Target]].""" + pairs = [] + def _collect(m): + k, t = (m.group("kind") or "").strip().lower(), (m.group("target") or "").strip() + if k and t: pairs.append((k, t)) + return "" + text = _REL_PIPE.sub(_collect, text) + text = _REL_SPACE.sub(_collect, text) + text = _REL_TEXT.sub(_collect, text) + return pairs, text + +def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]: + """Verarbeitet Obsidian [!edge]-Callouts.""" + if not text: return [], text + lines = text.splitlines(); out_pairs, keep_lines, i = [], [], 0 + while i < len(lines): + m = _CALLOUT_START.match(lines[i]) + if not m: + keep_lines.append(lines[i]); i += 1; continue + block_lines = [m.group(1)] if m.group(1).strip() else [] + i += 1 + while i < len(lines) and lines[i].lstrip().startswith('>'): + block_lines.append(lines[i].lstrip()[1:].lstrip()); i += 1 + for bl in block_lines: + mrel = _REL_LINE.match(bl) + if not mrel: continue + kind, targets = mrel.group("kind").strip().lower(), mrel.group("targets") or "" + found = _WIKILINKS_IN_LINE.findall(targets) + if found: + for t in found: out_pairs.append((kind, t.strip())) + else: + for raw in re.split(r"[,;]", targets): + if raw.strip(): out_pairs.append((kind, raw.strip())) + return out_pairs, "\n".join(keep_lines) + +def extract_wikilinks(text: str) -> List[str]: + """Extrahiert Standard-Wikilinks.""" + return [m.group(1).strip() for m in 
_WIKILINK_RE.finditer(text or "")] \ No newline at end of file diff --git a/app/core/graph/graph_subgraph.py b/app/core/graph/graph_subgraph.py new file mode 100644 index 0000000..593b09e --- /dev/null +++ b/app/core/graph/graph_subgraph.py @@ -0,0 +1,106 @@ +""" +FILE: app/core/graph/graph_subgraph.py +DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse. +""" +import math +from collections import defaultdict +from typing import Dict, List, Optional, DefaultDict, Any, Set +from qdrant_client import QdrantClient +from .graph_weights import EDGE_BASE_WEIGHTS, calculate_edge_weight +from .graph_db_adapter import fetch_edges_from_qdrant + +class Subgraph: + """Leichtgewichtiger Subgraph mit Adjazenzlisten & Kennzahlen.""" + + def __init__(self) -> None: + self.adj: DefaultDict[str, List[Dict]] = defaultdict(list) + self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list) + self.in_degree: DefaultDict[str, int] = defaultdict(int) + self.out_degree: DefaultDict[str, int] = defaultdict(int) + + def add_edge(self, e: Dict) -> None: + """Fügt eine Kante hinzu und aktualisiert Indizes.""" + src = e.get("source") + tgt = e.get("target") + kind = e.get("kind") + weight = e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0)) + owner = e.get("note_id") + + if not src or not tgt: + return + + # 1. Forward + self.adj[src].append({"target": tgt, "kind": kind, "weight": weight}) + self.out_degree[src] += 1 + self.in_degree[tgt] += 1 + + # 2. Reverse (WP-04b Explanation) + self.reverse_adj[tgt].append({"source": src, "kind": kind, "weight": weight}) + + # 3. 
Kontext-Note Handling + if owner and owner != src: + self.adj[owner].append({"target": tgt, "kind": kind, "weight": weight}) + self.out_degree[owner] += 1 + if owner != tgt: + self.reverse_adj[tgt].append({"source": owner, "kind": kind, "weight": weight, "via_context": True}) + self.in_degree[owner] += 1 + + def aggregate_edge_bonus(self, node_id: str) -> float: + """Summe der ausgehenden Kantengewichte (Hub-Score).""" + return sum(edge["weight"] for edge in self.adj.get(node_id, [])) + + def edge_bonus(self, node_id: str) -> float: + """API für Retriever (WP-04a Kompatibilität).""" + return self.aggregate_edge_bonus(node_id) + + def centrality_bonus(self, node_id: str) -> float: + """Log-gedämpfte Zentralität (In-Degree).""" + indeg = self.in_degree.get(node_id, 0) + if indeg <= 0: + return 0.0 + return min(math.log1p(indeg) / 10.0, 0.15) + + def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]: + return self.adj.get(node_id, []) + + def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]: + return self.reverse_adj.get(node_id, []) + + +def expand( + client: QdrantClient, + prefix: str, + seeds: List[str], + depth: int = 1, + edge_types: Optional[List[str]] = None, +) -> Subgraph: + """Expandiert ab Seeds entlang von Edges bis zu einer bestimmten Tiefe.""" + sg = Subgraph() + frontier = set(seeds) + visited = set() + + for _ in range(max(depth, 0)): + if not frontier: + break + + payloads = fetch_edges_from_qdrant(client, prefix, list(frontier), edge_types) + next_frontier: Set[str] = set() + + for pl in payloads: + src, tgt = pl.get("source_id"), pl.get("target_id") + if not src or not tgt: continue + + sg.add_edge({ + "source": src, "target": tgt, + "kind": pl.get("kind", "edge"), + "weight": calculate_edge_weight(pl), + "note_id": pl.get("note_id"), + }) + + if tgt not in visited: + next_frontier.add(str(tgt)) + + visited |= frontier + frontier = next_frontier - visited + + return sg \ No newline at end of file diff --git 
a/app/core/graph/graph_utils.py b/app/core/graph/graph_utils.py new file mode 100644 index 0000000..5f295ed --- /dev/null +++ b/app/core/graph/graph_utils.py @@ -0,0 +1,81 @@ +""" +FILE: app/core/graph/graph_utils.py +DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen. +""" +import os +import hashlib +from typing import Iterable, List, Optional, Set, Any + +try: + import yaml +except ImportError: + yaml = None + +# WP-15b: Prioritäten-Ranking für die De-Duplizierung +PROVENANCE_PRIORITY = { + "explicit:wikilink": 1.00, + "inline:rel": 0.95, + "callout:edge": 0.90, + "semantic_ai": 0.90, # Validierte KI-Kanten + "structure:belongs_to": 1.00, + "structure:order": 0.95, # next/prev + "explicit:note_scope": 1.00, + "derived:backlink": 0.90, + "edge_defaults": 0.70 # Heuristik (types.yaml) +} + +def _get(d: dict, *keys, default=None): + """Sicherer Zugriff auf verschachtelte Keys.""" + for k in keys: + if isinstance(d, dict) and k in d and d[k] is not None: + return d[k] + return default + +def _dedupe_seq(seq: Iterable[str]) -> List[str]: + """Dedupliziert Strings unter Beibehaltung der Reihenfolge.""" + seen: Set[str] = set() + out: List[str] = [] + for s in seq: + if s not in seen: + seen.add(s); out.append(s) + return out + +def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: + """Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.""" + base = f"{kind}:{s}->{t}#{scope}" + if rule_id: base += f"|{rule_id}" + return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() + +def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: + """Konstruiert ein Kanten-Payload für Qdrant.""" + pl = { + "kind": kind, + "relation": kind, + "scope": scope, + "source_id": source_id, + "target_id": target_id, + "note_id": note_id, + } + if extra: pl.update(extra) + return pl + +def load_types_registry() -> dict: + """Lädt die 
YAML-Registry.""" + p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml") + if not os.path.isfile(p) or yaml is None: return {} + try: + with open(p, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} + except Exception: return {} + +def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]: + """Ermittelt Standard-Kanten für einen Typ.""" + types_map = reg.get("types", reg) if isinstance(reg, dict) else {} + if note_type and isinstance(types_map, dict): + t = types_map.get(note_type) + if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list): + return [str(x) for x in t["edge_defaults"] if isinstance(x, str)] + for key in ("defaults", "default", "global"): + v = reg.get(key) + if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list): + return [str(x) for x in v["edge_defaults"] if isinstance(x, str)] + return [] \ No newline at end of file diff --git a/app/core/graph/graph_weights.py b/app/core/graph/graph_weights.py new file mode 100644 index 0000000..5fc2f68 --- /dev/null +++ b/app/core/graph/graph_weights.py @@ -0,0 +1,39 @@ +""" +FILE: app/core/graph/graph_weights.py +DESCRIPTION: Definition der Basisgewichte und Berechnung der Kanteneffektivität. 
+""" +from typing import Dict + +# Basisgewichte je Edge-Typ (WP-04a Config) +EDGE_BASE_WEIGHTS: Dict[str, float] = { + # Struktur + "belongs_to": 0.10, + "next": 0.06, + "prev": 0.06, + "backlink": 0.04, + "references_at": 0.08, + + # Wissen + "references": 0.20, + "depends_on": 0.18, + "related_to": 0.15, + "similar_to": 0.12, +} + +def calculate_edge_weight(pl: Dict) -> float: + """Berechnet das effektive Edge-Gewicht aus kind + confidence.""" + kind = pl.get("kind", "edge") + base = EDGE_BASE_WEIGHTS.get(kind, 0.0) + + conf_raw = pl.get("confidence", None) + try: + conf = float(conf_raw) if conf_raw is not None else None + except Exception: + conf = None + + if conf is None: + return base + + # Clamp confidence 0.0 - 1.0 + conf = max(0.0, min(1.0, conf)) + return base * conf \ No newline at end of file diff --git a/app/core/graph_adapter.py b/app/core/graph_adapter.py index e4b2cb7..ee36f9e 100644 --- a/app/core/graph_adapter.py +++ b/app/core/graph_adapter.py @@ -1,249 +1,10 @@ """ FILE: app/core/graph_adapter.py -DESCRIPTION: Lädt Kanten aus Qdrant und baut einen In-Memory Subgraphen für Scoring (Centrality) und Explanation. -VERSION: 0.4.0 -STATUS: Active -DEPENDENCIES: qdrant_client, app.core.qdrant -LAST_ANALYSIS: 2025-12-15 +DESCRIPTION: Facade für das neue graph Package (Adapter-Teil). + WP-14: Modularisierung abgeschlossen. 
+VERSION: 0.5.0 """ +from .graph.graph_subgraph import Subgraph, expand +from .graph.graph_weights import EDGE_BASE_WEIGHTS -from __future__ import annotations - -from typing import Dict, List, Optional, DefaultDict, Any -from collections import defaultdict - -from qdrant_client import QdrantClient -from qdrant_client.http import models as rest - -from app.core.qdrant import collection_names - -# Legacy-Import Fallback -try: # pragma: no cover - from app.core.qdrant_points import get_edges_for_sources # type: ignore -except Exception: # pragma: no cover - get_edges_for_sources = None # type: ignore - - -# Basisgewichte je Edge-Typ (WP-04a Config) -EDGE_BASE_WEIGHTS: Dict[str, float] = { - # Struktur - "belongs_to": 0.10, - "next": 0.06, - "prev": 0.06, - "backlink": 0.04, - "references_at": 0.08, - - # Wissen - "references": 0.20, - "depends_on": 0.18, - "related_to": 0.15, - "similar_to": 0.12, -} - - -def _edge_weight(pl: Dict) -> float: - """Berechnet das effektive Edge-Gewicht aus kind + confidence.""" - kind = pl.get("kind", "edge") - base = EDGE_BASE_WEIGHTS.get(kind, 0.0) - - conf_raw = pl.get("confidence", None) - try: - conf = float(conf_raw) if conf_raw is not None else None - except Exception: - conf = None - - if conf is None: - return base - - if conf < 0.0: conf = 0.0 - if conf > 1.0: conf = 1.0 - - return base * conf - - -def _fetch_edges( - client: QdrantClient, - prefix: str, - seeds: List[str], - edge_types: Optional[List[str]] = None, - limit: int = 2048, -) -> List[Dict]: - """ - Holt Edges direkt aus der *_edges Collection. 
- Filter: source_id IN seeds OR target_id IN seeds OR note_id IN seeds - """ - if not seeds or limit <= 0: - return [] - - _, _, edges_col = collection_names(prefix) - - seed_conditions = [] - for field in ("source_id", "target_id", "note_id"): - for s in seeds: - seed_conditions.append( - rest.FieldCondition(key=field, match=rest.MatchValue(value=str(s))) - ) - seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None - - type_filter = None - if edge_types: - type_conds = [ - rest.FieldCondition(key="kind", match=rest.MatchValue(value=str(k))) - for k in edge_types - ] - type_filter = rest.Filter(should=type_conds) - - must = [] - if seeds_filter: must.append(seeds_filter) - if type_filter: must.append(type_filter) - - flt = rest.Filter(must=must) if must else None - - pts, _ = client.scroll( - collection_name=edges_col, - scroll_filter=flt, - limit=limit, - with_payload=True, - with_vectors=False, - ) - - out: List[Dict] = [] - for p in pts or []: - pl = dict(p.payload or {}) - if pl: - out.append(pl) - return out - - -class Subgraph: - """Leichtgewichtiger Subgraph mit Adjazenzlisten & Kennzahlen.""" - - def __init__(self) -> None: - # Forward: source -> [targets] - self.adj: DefaultDict[str, List[Dict]] = defaultdict(list) - # Reverse: target -> [sources] (Neu für WP-04b Explanation) - self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list) - - self.in_degree: DefaultDict[str, int] = defaultdict(int) - self.out_degree: DefaultDict[str, int] = defaultdict(int) - - def add_edge(self, e: Dict) -> None: - """ - Fügt eine Kante hinzu und aktualisiert Forward/Reverse Indizes. - e muss enthalten: source, target, kind, weight. - """ - src = e.get("source") - tgt = e.get("target") - kind = e.get("kind") - weight = e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0)) - owner = e.get("note_id") - - if not src or not tgt: - return - - # 1. 
Primäre Adjazenz (Forward) - edge_data = {"target": tgt, "kind": kind, "weight": weight} - self.adj[src].append(edge_data) - self.out_degree[src] += 1 - self.in_degree[tgt] += 1 - - # 2. Reverse Adjazenz (Neu für Explanation) - # Wir speichern, woher die Kante kam. - rev_data = {"source": src, "kind": kind, "weight": weight} - self.reverse_adj[tgt].append(rev_data) - - # 3. Kontext-Note Handling (Forward & Reverse) - # Wenn eine Kante "im Kontext einer Note" (owner) definiert ist, - # schreiben wir sie der Note gut, damit der Retriever Scores auf Note-Ebene findet. - if owner and owner != src: - # Forward: Owner -> Target - self.adj[owner].append(edge_data) - self.out_degree[owner] += 1 - - # Reverse: Target wird vom Owner referenziert (indirekt) - if owner != tgt: - rev_owner_data = {"source": owner, "kind": kind, "weight": weight, "via_context": True} - self.reverse_adj[tgt].append(rev_owner_data) - self.in_degree[owner] += 1 # Leichter Centrality Boost für den Owner - - def aggregate_edge_bonus(self, node_id: str) -> float: - """Summe der ausgehenden Kantengewichte (Hub-Score).""" - return sum(edge["weight"] for edge in self.adj.get(node_id, [])) - - def edge_bonus(self, node_id: str) -> float: - """API für Retriever (WP-04a Kompatibilität).""" - return self.aggregate_edge_bonus(node_id) - - def centrality_bonus(self, node_id: str) -> float: - """Log-gedämpfte Zentralität (In-Degree).""" - import math - indeg = self.in_degree.get(node_id, 0) - if indeg <= 0: - return 0.0 - return min(math.log1p(indeg) / 10.0, 0.15) - - # --- WP-04b Explanation Helpers --- - - def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]: - """Liefert Liste aller Ziele, auf die dieser Knoten zeigt.""" - return self.adj.get(node_id, []) - - def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]: - """Liefert Liste aller Quellen, die auf diesen Knoten zeigen.""" - return self.reverse_adj.get(node_id, []) - - -def expand( - client: QdrantClient, - prefix: str, - 
seeds: List[str], - depth: int = 1, - edge_types: Optional[List[str]] = None, -) -> Subgraph: - """ - Expandiert ab Seeds entlang von Edges (bis `depth`). - """ - sg = Subgraph() - frontier = set(seeds) - visited = set() - - max_depth = max(depth, 0) - - for _ in range(max_depth): - if not frontier: - break - - edges_payloads = _fetch_edges( - client=client, - prefix=prefix, - seeds=list(frontier), - edge_types=edge_types, - limit=2048, - ) - - next_frontier = set() - for pl in edges_payloads: - src = pl.get("source_id") - tgt = pl.get("target_id") - - # Skip invalid edges - if not src or not tgt: - continue - - e = { - "source": src, - "target": tgt, - "kind": pl.get("kind", "edge"), - "weight": _edge_weight(pl), - "note_id": pl.get("note_id"), - } - sg.add_edge(e) - - # Nur weitersuchen, wenn Target noch nicht besucht - if tgt and tgt not in visited: - next_frontier.add(tgt) - - visited |= frontier - frontier = next_frontier - visited - - return sg \ No newline at end of file +__all__ = ["Subgraph", "expand", "EDGE_BASE_WEIGHTS"] \ No newline at end of file From 386fa3ef0cbf8f22e1ececc85c4392531544ade5 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 18:17:13 +0100 Subject: [PATCH 15/23] =?UTF-8?q?WP15b=20vollst=C3=A4ndieg=20chunking=20st?= =?UTF-8?q?rategien?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/chunking/chunking_processor.py | 79 +++++++++++---- app/core/chunking/chunking_propagation.py | 66 +++++++++---- app/core/chunking/chunking_strategies.py | 112 +++++++++++++++++----- 3 files changed, 200 insertions(+), 57 deletions(-) diff --git a/app/core/chunking/chunking_processor.py b/app/core/chunking/chunking_processor.py index 12c9a7b..1a17acb 100644 --- a/app/core/chunking/chunking_processor.py +++ b/app/core/chunking/chunking_processor.py @@ -1,9 +1,14 @@ """ FILE: app/core/chunking/chunking_processor.py -DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks. 
+DESCRIPTION: Der zentrale Orchestrator für das Chunking-System. + AUDIT v3.3.3: Wiederherstellung der "Gold-Standard" Qualität. + - Integriert physikalische Kanten-Injektion (Propagierung). + - Stellt H1-Kontext-Fenster sicher. + - Baut den Candidate-Pool für die WP-15b Ingestion auf. """ import asyncio import re +import logging from typing import List, Dict, Optional from .chunking_models import Chunk from .chunking_utils import get_chunk_config, extract_frontmatter_from_text @@ -11,43 +16,79 @@ from .chunking_parser import parse_blocks, parse_edges_robust from .chunking_strategies import strategy_sliding_window, strategy_by_heading from .chunking_propagation import propagate_section_edges +logger = logging.getLogger(__name__) + async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: - """Orchestriert das Chunking und baut den Candidate-Pool auf.""" - if config is None: config = get_chunk_config(note_type) + """ + Hauptfunktion zur Zerlegung einer Note. + Verbindet Strategien mit physikalischer Kontext-Anreicherung. + """ + # 1. Konfiguration & Parsing + if config is None: + config = get_chunk_config(note_type) + fm, body_text = extract_frontmatter_from_text(md_text) blocks, doc_title = parse_blocks(md_text) + # Vorbereitung des H1-Präfix für die Embedding-Fenster + h1_prefix = f"# {doc_title}" if doc_title else "" + + # 2. Anwendung der Splitting-Strategie + # Wir übergeben den Dokument-Titel/Präfix für die Window-Bildung. if config.get("strategy") == "by_heading": chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) else: - chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) + # sliding_window nutzt nun den context_prefix für das Window-Feld. 
+ chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id, context_prefix=h1_prefix) - if not chunks: return [] + if not chunks: + return [] - # WP-15b: Candidate Pool Aufbau - chunks = propagate_section_edges(chunks, blocks) + # 3. Physikalische Kontext-Anreicherung (Der Qualitäts-Fix) + # Schreibt Kanten aus Callouts/Inlines hart in den Text für Qdrant. + chunks = propagate_section_edges(chunks) + + # 4. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService) + # Zuerst die explizit im Text vorhandenen Kanten sammeln. for ch in chunks: + # Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text. for e_str in parse_edges_robust(ch.text): - k, t = e_str.split(':', 1) - ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"}) + parts = e_str.split(':', 1) + if len(parts) == 2: + k, t = parts + ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"}) - # Global Pool (Unzugeordnete Kanten) - pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) + # 5. Global Pool (Unzugeordnete Kanten aus dem Dokument-Ende) + # Sucht nach dem Edge-Pool Block im Original-Markdown. + pool_match = re.search( + r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', + body_text, + re.DOTALL | re.IGNORECASE + ) if pool_match: - for e_str in parse_edges_robust(pool_match.group(1)): - k, t = e_str.split(':', 1) - for ch in chunks: ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"}) + global_edges = parse_edges_robust(pool_match.group(1)) + for e_str in global_edges: + parts = e_str.split(':', 1) + if len(parts) == 2: + k, t = parts + # Diese Kanten werden als "Global Pool" markiert für die spätere KI-Prüfung. + for ch in chunks: + ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"}) - # De-Duplikation + # 6. 
De-Duplikation des Pools & Linking for ch in chunks: - seen = set(); unique = [] + seen = set() + unique = [] for c in ch.candidate_pool: - if (c["kind"], c["to"]) not in seen: - seen.add((c["kind"], c["to"])); unique.append(c) + key = (c["kind"], c["to"], c["provenance"]) + if key not in seen: + seen.add(key) + unique.append(c) ch.candidate_pool = unique - # Nachbarschaften + # Verknüpfung der Nachbarschaften für Graph-Traversierung for i, ch in enumerate(chunks): ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None + return chunks \ No newline at end of file diff --git a/app/core/chunking/chunking_propagation.py b/app/core/chunking/chunking_propagation.py index 1aeb361..099d075 100644 --- a/app/core/chunking/chunking_propagation.py +++ b/app/core/chunking/chunking_propagation.py @@ -1,25 +1,59 @@ """ FILE: app/core/chunking/chunking_propagation.py -DESCRIPTION: Vererbung von Kanten (Inheritance) über Sektions-Pfade. +DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment). + Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her. +VERSION: 3.3.1 +STATUS: Active """ from typing import List, Dict, Set -from .chunking_models import Chunk, RawBlock +from .chunking_models import Chunk from .chunking_parser import parse_edges_robust -def propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]: - """WP-15b: Kanten aus Headings werden an Sub-Chunks vererbt.""" - section_inheritance: Dict[str, Set[str]] = {} - for b in blocks: - if b.kind == "heading": - edges = parse_edges_robust(b.text) - if edges: - if b.section_path not in section_inheritance: - section_inheritance[b.section_path] = set() - section_inheritance[b.section_path].update(edges) +def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: + """ + Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window. 
+ Dies ist essenziell für die Vektorisierung der Beziehungen. + """ + # 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren + section_map: Dict[str, Set[str]] = {} # path -> set(kind:target) for ch in chunks: - inherited = section_inheritance.get(ch.section_path, set()) - for e_str in inherited: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"}) + # Root-Level "/" ignorieren (zu global), Fokus auf spezifische Kapitel + if not ch.section_path or ch.section_path == "/": + continue + + # Nutzt den robusten Parser aus dem Package + edges = parse_edges_robust(ch.text) + if edges: + if ch.section_path not in section_map: + section_map[ch.section_path] = set() + section_map[ch.section_path].update(edges) + + # 2. Injizieren: Kanten in jeden Chunk der Sektion zurückschreiben (Broadcasting) + for ch in chunks: + if ch.section_path in section_map: + edges_to_add = section_map[ch.section_path] + if not edges_to_add: + continue + + injections = [] + for e_str in edges_to_add: + kind, target = e_str.split(':', 1) + # Nur injizieren, wenn die Kante nicht bereits im Text steht + token = f"[[rel:{kind}|{target}]]" + if token not in ch.text: + injections.append(token) + + if injections: + # Physische Anreicherung (Der v3.1.0 Qualitäts-Fix) + # Triple-Newline für saubere Trennung im Embedding-Fenster + block = "\n\n\n" + " ".join(injections) + ch.text += block + + # ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht! + if ch.window: + ch.window += block + else: + ch.window = ch.text + return chunks \ No newline at end of file diff --git a/app/core/chunking/chunking_strategies.py b/app/core/chunking/chunking_strategies.py index 7684bd5..8945fee 100644 --- a/app/core/chunking/chunking_strategies.py +++ b/app/core/chunking/chunking_strategies.py @@ -1,29 +1,59 @@ """ FILE: app/core/chunking/chunking_strategies.py -DESCRIPTION: Implementierung der mathematischen Splitting-Strategien. 
+DESCRIPTION: Mathematische Splitting-Strategien. + AUDIT v3.3.2: 100% Konformität zur 'by_heading' Spezifikation. + - Implementiert Hybrid-Safety-Net (Sliding Window für Übergrößen). + - Breadcrumb-Kontext im Window (H1 > H2). + - Sliding Window mit H1-Kontext (Gold-Standard v3.1.0). """ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional from .chunking_models import RawBlock, Chunk from .chunking_utils import estimate_tokens from .chunking_parser import split_sentences -def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]: - """Fasst Blöcke zusammen und schneidet bei 'target' Tokens.""" - target = config.get("target", 400); max_tokens = config.get("max", 600) +def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) -> str: + """Baut den Breadcrumb-Kontext für das Embedding-Fenster.""" + parts = [] + if doc_title: parts.append(doc_title) + if sec_title and sec_title != doc_title: parts.append(sec_title) + prefix = " > ".join(parts) + return f"{prefix}\n{text}".strip() if prefix else text + +def strategy_sliding_window(blocks: List[RawBlock], + config: Dict[str, Any], + note_id: str, + context_prefix: str = "") -> List[Chunk]: + """ + Fasst Blöcke zusammen und schneidet bei 'target' Tokens. + Ignoriert H2-Überschriften beim Splitting, um Kontext zu wahren. 
+ """ + target = config.get("target", 400) + max_tokens = config.get("max", 600) overlap_val = config.get("overlap", (50, 80)) overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val - chunks = []; buf = [] + + chunks: List[Chunk] = [] + buf: List[RawBlock] = [] def _add(txt, sec, path): - idx = len(chunks); win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt - chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None)) + idx = len(chunks) + # H1-Kontext Präfix für das Window-Feld + win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt + chunks.append(Chunk( + id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, + text=txt, window=win, token_count=estimate_tokens(txt), + section_title=sec, section_path=path, + neighbors_prev=None, neighbors_next=None + )) def flush(): nonlocal buf if not buf: return text_body = "\n\n".join([b.text for b in buf]) sec_title = buf[-1].section_title; sec_path = buf[-1].section_path - if estimate_tokens(text_body) <= max_tokens: _add(text_body, sec_title, sec_path) + + if estimate_tokens(text_body) <= max_tokens: + _add(text_body, sec_title, sec_path) else: sents = split_sentences(text_body); cur_sents = []; cur_len = 0 for s in sents: @@ -32,33 +62,69 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note _add(" ".join(cur_sents), sec_title, sec_path) ov_s = []; ov_l = 0 for os in reversed(cur_sents): - if ov_l + estimate_tokens(os) < overlap: ov_s.insert(0, os); ov_l += estimate_tokens(os) + if ov_l + estimate_tokens(os) < overlap: + ov_s.insert(0, os); ov_l += estimate_tokens(os) else: break cur_sents = list(ov_s); cur_sents.append(s); cur_len = ov_l + slen - else: cur_sents.append(s); cur_len += slen - if cur_sents: _add(" ".join(cur_sents), sec_title, sec_path) + else: + cur_sents.append(s); 
cur_len += slen + if cur_sents: + _add(" ".join(cur_sents), sec_title, sec_path) buf = [] for b in blocks: + # H2-Überschriften werden ignoriert, um den Zusammenhang zu wahren if b.kind == "heading": continue - if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target: flush() + if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target: + flush() buf.append(b) - if estimate_tokens(b.text) >= target: flush() flush() return chunks def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]: - """Splittet Text basierend auf Markdown-Überschriften.""" - strict = config.get("strict_heading_split", False); target = config.get("target", 400) - max_tokens = config.get("max", 600); split_level = config.get("split_level", 2) - chunks = []; buf = []; cur_tokens = 0 + """ + Splittet Text basierend auf Markdown-Überschriften mit Hybrid-Safety-Net. + """ + strict = config.get("strict_heading_split", False) + target = config.get("target", 400) + max_tokens = config.get("max", 600) + split_level = config.get("split_level", 2) + overlap = sum(config.get("overlap", (50, 80))) // 2 + + chunks: List[Chunk] = [] + buf: List[str] = [] + cur_tokens = 0 + + def _add_to_chunks(txt, title, path): + idx = len(chunks) + win = _create_context_win(doc_title, title, txt) + chunks.append(Chunk( + id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, + text=txt, window=win, token_count=estimate_tokens(txt), + section_title=title, section_path=path, + neighbors_prev=None, neighbors_next=None + )) def _flush(title, path): nonlocal buf, cur_tokens if not buf: return - txt = "\n\n".join(buf); win = f"# {doc_title}\n## {title}\n{txt}".strip() if title else txt - idx = len(chunks) - chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=title, section_path=path, neighbors_prev=None, 
neighbors_next=None)) + full_text = "\n\n".join(buf) + if estimate_tokens(full_text) <= max_tokens: + _add_to_chunks(full_text, title, path) + else: + sents = split_sentences(full_text); cur_sents = []; sub_len = 0 + for s in sents: + slen = estimate_tokens(s) + if sub_len + slen > target and cur_sents: + _add_to_chunks(" ".join(cur_sents), title, path) + ov_s = []; ov_l = 0 + for os in reversed(cur_sents): + if ov_l + estimate_tokens(os) < overlap: + ov_s.insert(0, os); ov_l += estimate_tokens(os) + else: break + cur_sents = list(ov_s); cur_sents.append(s); sub_len = ov_l + slen + else: cur_sents.append(s); sub_len += slen + if cur_sents: _add_to_chunks(" ".join(cur_sents), title, path) buf = []; cur_tokens = 0 for b in blocks: @@ -70,5 +136,7 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: bt = estimate_tokens(b.text) if cur_tokens + bt > max_tokens and buf: _flush(b.section_title, b.section_path) buf.append(b.text); cur_tokens += bt - if buf: _flush(blocks[-1].section_title if blocks else None, blocks[-1].section_path if blocks else "/") + if buf: + last_b = blocks[-1] if blocks else None + _flush(last_b.section_title if last_b else None, last_b.section_path if last_b else "/") return chunks \ No newline at end of file From 8b8baa27b34989eb29f4a3eeb510f7aa6ee81c78 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 18:31:00 +0100 Subject: [PATCH 16/23] =?UTF-8?q?W19b=20flexible=20Level=20=C3=9Cberschrif?= =?UTF-8?q?ten?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/chunking/chunking_parser.py | 31 ++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/app/core/chunking/chunking_parser.py b/app/core/chunking/chunking_parser.py index 0524484..3d56f55 100644 --- a/app/core/chunking/chunking_parser.py +++ b/app/core/chunking/chunking_parser.py @@ -25,23 +25,42 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: if 
h1_match: h1_title = h1_match.group(1).strip() lines = text_without_fm.split('\n') buffer = [] + for line in lines: stripped = line.strip() - if stripped.startswith('# '): continue - elif stripped.startswith('## '): + + # H1 ignorieren (ist Doc Title) + if stripped.startswith('# '): + continue + + # Generische Heading-Erkennung (H2 bis H6) für flexible Split-Levels + heading_match = re.match(r'^(#{2,6})\s+(.*)', stripped) + if heading_match: + # Buffer leeren (vorherigen Text abschließen) if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) buffer = [] - current_h2 = stripped[3:].strip() - section_path = f"/{current_h2}" - blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2)) + + level = len(heading_match.group(1)) + title = heading_match.group(2).strip() + + # Pfad-Logik: H2 setzt den Haupt-Pfad + if level == 2: + current_h2 = title + section_path = f"/{current_h2}" + # Bei H3+ bleibt der section_path beim Parent, aber das Level wird korrekt gesetzt + + blocks.append(RawBlock("heading", stripped, level, section_path, current_h2)) + elif not stripped: if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) buffer = [] - else: buffer.append(line) + else: + buffer.append(line) + if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) From cd5383432ead3adf89f522456d7029e9acc19787 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 18:45:15 +0100 Subject: [PATCH 17/23] Parametrisierung der wesentliche Einstellwerte in der types.yaml --- app/core/ingestion/ingestion_processor.py | 15 ++++++++--- app/core/ingestion/ingestion_utils.py | 33 ++++++++++++++++++----- app/core/parser/parsing_scanner.py | 25 +++++++++++++---- config/types.yaml | 29 ++++++++++++++++++-- 4 files changed, 84 insertions(+), 18 
deletions(-) diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index 268b47c..009f1fb 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -4,8 +4,8 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator). WP-14: Vollständig modularisiert. WP-15b: Two-Pass Workflow mit globalem Kontext-Cache. WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert. - AUDIT v2.13.4: 100% Logik-Erhalt (Parameters, Registry-Context, DB-Points). -VERSION: 2.13.4 + AUDIT v2.13.7: Synchronisierung des Context-Scanners mit der Registry (WP-14). +VERSION: 2.13.7 STATUS: Active """ import logging @@ -75,7 +75,9 @@ class IngestionService: logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") for path in file_paths: try: - ctx = pre_scan_markdown(path) + # ANPASSUNG: Übergabe der Registry für dynamische Scan-Parameter (WP-14) + # Ermöglicht die Nutzung von summary_settings aus types.yaml + ctx = pre_scan_markdown(path, registry=self.registry) if ctx: # Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname) self.batch_cache[ctx.note_id] = ctx @@ -108,7 +110,12 @@ class IngestionService: except Exception as e: return {**result, "error": f"Validation failed: {str(e)}"} - if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]: + # Dynamischer Lifecycle-Filter aus der Registry + ingest_cfg = self.registry.get("ingestion_settings", {}) + ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"]) + + current_status = fm.get("status", "draft").lower().strip() + if current_status in ignore_list: return {**result, "status": "skipped", "reason": "lifecycle_filter"} # 2. 
Payload & Change Detection (Multi-Hash) diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py index c3b6068..f8af8ff 100644 --- a/app/core/ingestion/ingestion_utils.py +++ b/app/core/ingestion/ingestion_utils.py @@ -1,6 +1,7 @@ """ FILE: app/core/ingestion/ingestion_utils.py DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups. + AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14). """ import os import json @@ -8,16 +9,27 @@ import re import yaml from typing import Any, Optional, Dict -def extract_json_from_response(text: str) -> Any: +def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any: """ Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic). - Entfernt , [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit. + WP-14: Nutzt nun dynamische cleanup_patterns aus der Registry. """ if not text or not isinstance(text, str): return [] - clean = text.replace("", "").replace("", "") - clean = clean.replace("[OUT]", "").replace("[/OUT]", "") + # Fallback-Patterns für die Bereinigung + patterns = ["", "", "[OUT]", "[/OUT]"] + + # Falls keine Registry übergeben wurde, versuchen wir sie zu laden + reg = registry or load_type_registry() + if reg: + # Lade Patterns aus llm_settings (WP-14 Erweiterung) + patterns = reg.get("llm_settings", {}).get("cleanup_patterns", patterns) + + clean = text + for p in patterns: + clean = clean.replace(p, "") + clean = clean.strip() match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) @@ -52,10 +64,17 @@ def load_type_registry(custom_path: Optional[str] = None) -> dict: except Exception: return {} def resolve_note_type(registry: dict, requested: Optional[str]) -> str: - """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" + """ + Bestimmt den finalen Notiz-Typ. + WP-14: Fallback wird nun über ingestion_settings.default_note_type gesteuert. 
+ """ types = registry.get("types", {}) - if requested and requested in types: return requested - return "concept" + if requested and requested in types: + return requested + + # Dynamischer Fallback aus der Registry (Standard: 'concept') + ingest_cfg = registry.get("ingestion_settings", {}) + return ingest_cfg.get("default_note_type", "concept") def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" diff --git a/app/core/parser/parsing_scanner.py b/app/core/parser/parsing_scanner.py index 00e3135..d7eef8f 100644 --- a/app/core/parser/parsing_scanner.py +++ b/app/core/parser/parsing_scanner.py @@ -1,21 +1,36 @@ """ FILE: app/core/parsing/parsing_scanner.py DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1). + AUDIT v1.1.0: Dynamisierung der Scan-Parameter (WP-14). """ import os import re -from typing import Optional +from typing import Optional, Dict, Any from .parsing_models import NoteContext from .parsing_markdown import read_markdown -def pre_scan_markdown(path: str) -> Optional[NoteContext]: - """Extrahiert Identität und Kurz-Kontext zur Validierung.""" +def pre_scan_markdown(path: str, registry: Optional[Dict[str, Any]] = None) -> Optional[NoteContext]: + """ + Extrahiert Identität und Kurz-Kontext zur Validierung. + WP-14: Scan-Tiefe und Summary-Länge sind nun über die Registry steuerbar. 
+ """ parsed = read_markdown(path) if not parsed: return None + + # WP-14: Konfiguration laden oder Standardwerte nutzen + reg = registry or {} + summary_cfg = reg.get("summary_settings", {}) + scan_depth = summary_cfg.get("pre_scan_depth", 600) + max_len = summary_cfg.get("max_summary_length", 500) + fm = parsed.frontmatter + # ID-Findung: Frontmatter ID oder Dateiname als Fallback note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0]) - clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip() - summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body + + # Erstelle Kurz-Zusammenfassung mit dynamischen Limits + clean_body = re.sub(r'[#*`>]', '', parsed.body[:scan_depth]).strip() + summary = clean_body[:max_len] + "..." if len(clean_body) > max_len else clean_body + return NoteContext( note_id=note_id, title=str(fm.get("title", note_id)), diff --git a/config/types.yaml b/config/types.yaml index bc447e6..6169649 100644 --- a/config/types.yaml +++ b/config/types.yaml @@ -1,4 +1,4 @@ -version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking +version: 2.7.0 # WP-14 Update: Dynamisierung der Ingestion-Pipeline # ============================================================================== # 1. CHUNKING PROFILES @@ -76,7 +76,32 @@ defaults: edge_defaults: [] # ============================================================================== -# 3. TYPE DEFINITIONS +# 3. INGESTION SETTINGS (WP-14 Dynamization) +# ============================================================================== +# Steuert, welche Notizen verarbeitet werden und wie Fallbacks aussehen. +ingestion_settings: + # Liste der Status-Werte, die beim Import ignoriert werden sollen. + ignore_statuses: ["system", "template", "archive", "hidden"] + # Standard-Typ, falls kein Typ im Frontmatter angegeben ist. + default_note_type: "concept" + +# ============================================================================== +# 4. 
SUMMARY & SCAN SETTINGS +# ============================================================================== +# Steuert die Tiefe des Pre-Scans für den Context-Cache. +summary_settings: + max_summary_length: 500 + pre_scan_depth: 600 + +# ============================================================================== +# 5. LLM SETTINGS +# ============================================================================== +# Steuerzeichen und Patterns zur Bereinigung der LLM-Antworten. +llm_settings: + cleanup_patterns: ["", "", "[OUT]", "[/OUT]", "```json", "```"] + +# ============================================================================== +# 6. TYPE DEFINITIONS # ============================================================================== types: From e0453719691ee13972dcb9c9d23e7c71a9965c28 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 18:59:38 +0100 Subject: [PATCH 18/23] Anpassung der Textausgabe zur Filterung der Steuerzeichen --- app/core/ingestion/ingestion_utils.py | 40 +++++++++++++++++---------- app/services/llm_service.py | 20 ++++++++++---- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py index f8af8ff..74cb1e6 100644 --- a/app/core/ingestion/ingestion_utils.py +++ b/app/core/ingestion/ingestion_utils.py @@ -1,37 +1,49 @@ """ FILE: app/core/ingestion/ingestion_utils.py DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups. - AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14). + AUDIT v2.13.8: Zentralisierung der Text-Bereinigung für LLM-Antworten. 
""" import os import json import re import yaml -from typing import Any, Optional, Dict +from typing import Any, Optional, Dict, List -def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any: +def clean_llm_text(text: str, registry: Optional[dict] = None) -> str: """ - Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic). - WP-14: Nutzt nun dynamische cleanup_patterns aus der Registry. + Entfernt LLM-Steuerzeichen und Artefakte aus einem Text. + Nutzt die cleanup_patterns aus der Registry oder Standardwerte. """ - if not text or not isinstance(text, str): - return [] - - # Fallback-Patterns für die Bereinigung - patterns = ["", "", "[OUT]", "[/OUT]"] + if not text or not isinstance(text, str): + return "" + + # Fallback-Patterns, falls die Registry nicht greift + default_patterns = ["", "", "[OUT]", "[/OUT]"] # Falls keine Registry übergeben wurde, versuchen wir sie zu laden reg = registry or load_type_registry() - if reg: - # Lade Patterns aus llm_settings (WP-14 Erweiterung) - patterns = reg.get("llm_settings", {}).get("cleanup_patterns", patterns) + + # Lade Patterns aus llm_settings (WP-14 Erweiterung) + patterns: List[str] = reg.get("llm_settings", {}).get("cleanup_patterns", default_patterns) clean = text for p in patterns: clean = clean.replace(p, "") - clean = clean.strip() + return clean.strip() + +def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any: + """ + Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen. + WP-14: Nutzt nun die zentrale clean_llm_text Funktion. + """ + if not text: + return [] + # 1. Text zentral bereinigen + clean = clean_llm_text(text, registry) + + # 2. 
Markdown-Code-Blöcke extrahieren match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) payload = match.group(1) if match else clean diff --git a/app/services/llm_service.py b/app/services/llm_service.py index 17ecea6..b5ce923 100644 --- a/app/services/llm_service.py +++ b/app/services/llm_service.py @@ -6,12 +6,11 @@ DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter. WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe. WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs). FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts. -VERSION: 3.3.7 +VERSION: 3.3.8 STATUS: Active FIX: -- Implementiert striktes max_retries Handling für alle Provider (v.a. für Chat-Stabilität). -- Synchronisiert Rate-Limit Retries mit dem max_retries Parameter. -- Optimiert Logging für sofortige Fehlererkennung. +- Integriert clean_llm_text zur Entfernung von Steuerzeichen (, [OUT] etc.) in Antworten. +- Stellt sicher, dass Chat-Antworten sauber formatiert ausgegeben werden. """ import httpx import yaml @@ -25,6 +24,9 @@ from pathlib import Path from typing import Optional, Dict, Any, Literal from app.config import get_settings +# Import der zentralen Bereinigungs-Logik (WP-14 Fix) +from app.core.ingestion.ingestion_utils import clean_llm_text + logger = logging.getLogger(__name__) @@ -119,22 +121,26 @@ class LLMService: ) -> str: """ Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung. + Wendet die Bereinigung auf Text-Antworten an. 
""" target_provider = provider or self.settings.MINDNET_LLM_PROVIDER if priority == "background": async with LLMService._background_semaphore: - return await self._dispatch( + res = await self._dispatch( target_provider, prompt, system, force_json, max_retries, base_delay, model_override, json_schema, json_schema_name, strict_json_schema ) + # WP-14 Fix: Bereinige Text-Antworten vor Rückgabe + return clean_llm_text(res) if not force_json else res - return await self._dispatch( + res = await self._dispatch( target_provider, prompt, system, force_json, max_retries, base_delay, model_override, json_schema, json_schema_name, strict_json_schema ) + return clean_llm_text(res) if not force_json else res async def _dispatch( self, @@ -206,6 +212,7 @@ class LLMService: config = types.GenerateContentConfig( system_instruction=system, + # WICHTIG: Gemini 1.5+ unterstützt response_mime_type nativ response_mime_type="application/json" if force_json else "text/plain" ) response = await asyncio.wait_for( @@ -297,6 +304,7 @@ class LLMService: final_prompt = rag_template.format(context_str=context_str, query=query) # RAG Aufrufe im Chat nutzen nun standardmäßig max_retries=2 (überschreibbar) + # Durch den Aufruf von generate_raw_response wird die Bereinigung automatisch angewendet. 
return await self.generate_raw_response( final_prompt, system=system_prompt, From 37ec8b614eda2d5fafec92a8f7e04281b57e85b2 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 19:12:14 +0100 Subject: [PATCH 19/23] bug fix --- app/core/ingestion/ingestion_utils.py | 49 ++++++--------------------- app/core/registry.py | 43 +++++++++++++++++++++++ app/services/llm_service.py | 12 +++---- 3 files changed, 59 insertions(+), 45 deletions(-) create mode 100644 app/core/registry.py diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py index 74cb1e6..f4b9324 100644 --- a/app/core/ingestion/ingestion_utils.py +++ b/app/core/ingestion/ingestion_utils.py @@ -1,46 +1,25 @@ """ FILE: app/core/ingestion/ingestion_utils.py DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups. - AUDIT v2.13.8: Zentralisierung der Text-Bereinigung für LLM-Antworten. + AUDIT v2.13.9: Behebung des Circular Imports durch Nutzung der app.core.registry. """ -import os import json import re -import yaml -from typing import Any, Optional, Dict, List +from typing import Any, Optional, Dict -def clean_llm_text(text: str, registry: Optional[dict] = None) -> str: - """ - Entfernt LLM-Steuerzeichen und Artefakte aus einem Text. - Nutzt die cleanup_patterns aus der Registry oder Standardwerte. - """ - if not text or not isinstance(text, str): - return "" - - # Fallback-Patterns, falls die Registry nicht greift - default_patterns = ["", "", "[OUT]", "[/OUT]"] - - # Falls keine Registry übergeben wurde, versuchen wir sie zu laden - reg = registry or load_type_registry() - - # Lade Patterns aus llm_settings (WP-14 Erweiterung) - patterns: List[str] = reg.get("llm_settings", {}).get("cleanup_patterns", default_patterns) - - clean = text - for p in patterns: - clean = clean.replace(p, "") - - return clean.strip() +# ENTSCHEIDENDER FIX: Import der Basis-Logik aus dem neutralen Registry-Modul. 
+# Dies bricht den Zirkelbezug auf, da dieses Modul keine Services mehr importiert. +from app.core.registry import load_type_registry, clean_llm_text def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any: """ Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen. - WP-14: Nutzt nun die zentrale clean_llm_text Funktion. + WP-14: Nutzt nun die zentrale clean_llm_text Funktion aus app.core.registry. """ if not text: return [] - # 1. Text zentral bereinigen + # 1. Text zentral bereinigen via neutralem Modul clean = clean_llm_text(text, registry) # 2. Markdown-Code-Blöcke extrahieren @@ -65,16 +44,6 @@ def extract_json_from_response(text: str, registry: Optional[dict] = None) -> An except: pass return [] -def load_type_registry(custom_path: Optional[str] = None) -> dict: - """Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion.""" - from app.config import get_settings - settings = get_settings() - path = custom_path or settings.MINDNET_TYPES_FILE - if not os.path.exists(path): return {} - try: - with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} - except Exception: return {} - def resolve_note_type(registry: dict, requested: Optional[str]) -> str: """ Bestimmt den finalen Notiz-Typ. @@ -89,7 +58,9 @@ def resolve_note_type(registry: dict, requested: Optional[str]) -> str: return ingest_cfg.get("default_note_type", "concept") def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: - """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" + """ + Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry. 
+ """ from app.core.chunking import get_chunk_config profiles = registry.get("chunking_profiles", {}) if profile_name in profiles: diff --git a/app/core/registry.py b/app/core/registry.py new file mode 100644 index 0000000..7b6a285 --- /dev/null +++ b/app/core/registry.py @@ -0,0 +1,43 @@ +""" +FILE: app/core/registry.py +DESCRIPTION: Zentraler Base-Layer für Konfigurations-Loading und Text-Bereinigung. + Bricht Zirkelbezüge zwischen Ingestion und LLMService auf. +VERSION: 1.0.0 +""" +import os +import yaml +from typing import Optional, List + +def load_type_registry(custom_path: Optional[str] = None) -> dict: + """Lädt die types.yaml zur Steuerung der typ-spezifischen Logik.""" + # Wir nutzen hier einen direkten Import von Settings, um Zyklen zu vermeiden + from app.config import get_settings + settings = get_settings() + path = custom_path or settings.MINDNET_TYPES_FILE + if not os.path.exists(path): + return {} + try: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + except Exception: + return {} + +def clean_llm_text(text: str, registry: Optional[dict] = None) -> str: + """ + Entfernt LLM-Steuerzeichen (, [OUT] etc.) aus einem Text. + Wird sowohl für JSON-Parsing als auch für Chat-Antworten genutzt. + """ + if not text or not isinstance(text, str): + return "" + + default_patterns = ["", "", "[OUT]", "[/OUT]"] + reg = registry or load_type_registry() + + # Lade Patterns aus llm_settings (WP-14) + patterns: List[str] = reg.get("llm_settings", {}).get("cleanup_patterns", default_patterns) + + clean = text + for p in patterns: + clean = clean.replace(p, "") + + return clean.strip() \ No newline at end of file diff --git a/app/services/llm_service.py b/app/services/llm_service.py index b5ce923..8027c3c 100644 --- a/app/services/llm_service.py +++ b/app/services/llm_service.py @@ -6,11 +6,11 @@ DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter. WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe. 
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs). FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts. -VERSION: 3.3.8 +VERSION: 3.3.9 STATUS: Active FIX: -- Integriert clean_llm_text zur Entfernung von Steuerzeichen (, [OUT] etc.) in Antworten. -- Stellt sicher, dass Chat-Antworten sauber formatiert ausgegeben werden. +- Importiert clean_llm_text von app.core.registry zur Vermeidung von Circular Imports. +- Wendet clean_llm_text auf Text-Antworten in generate_raw_response an. """ import httpx import yaml @@ -24,8 +24,8 @@ from pathlib import Path from typing import Optional, Dict, Any, Literal from app.config import get_settings -# Import der zentralen Bereinigungs-Logik (WP-14 Fix) -from app.core.ingestion.ingestion_utils import clean_llm_text +# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik (WP-14) +from app.core.registry import clean_llm_text logger = logging.getLogger(__name__) @@ -140,6 +140,7 @@ class LLMService: max_retries, base_delay, model_override, json_schema, json_schema_name, strict_json_schema ) + # WP-14 Fix: Bereinige Text-Antworten vor Rückgabe return clean_llm_text(res) if not force_json else res async def _dispatch( @@ -212,7 +213,6 @@ class LLMService: config = types.GenerateContentConfig( system_instruction=system, - # WICHTIG: Gemini 1.5+ unterstützt response_mime_type nativ response_mime_type="application/json" if force_json else "text/plain" ) response = await asyncio.wait_for( From 19d899b2770ebe85ac557ce0985647bc2027a317 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 19:47:23 +0100 Subject: [PATCH 20/23] =?UTF-8?q?Gro=C3=9Fe=20Modularisierung=20WP19b?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/core/database/__init__.py | 35 ++ app/core/database/qdrant.py | 169 ++++++++++ app/core/database/qdrant_points.py | 296 +++++++++++++++++ app/core/ingestion/__init__.py | 23 +- 
app/core/ingestion/ingestion_chunk_payload.py | 50 ++- app/core/ingestion/ingestion_db.py | 18 +- app/core/ingestion/ingestion_note_payload.py | 41 ++- app/core/ingestion/ingestion_processor.py | 23 +- app/core/ingestion/ingestion_validation.py | 18 +- app/core/qdrant.py | 163 +-------- app/core/qdrant_points.py | 308 ++---------------- 11 files changed, 659 insertions(+), 485 deletions(-) create mode 100644 app/core/database/__init__.py create mode 100644 app/core/database/qdrant.py create mode 100644 app/core/database/qdrant_points.py diff --git a/app/core/database/__init__.py b/app/core/database/__init__.py new file mode 100644 index 0000000..a6c42b3 --- /dev/null +++ b/app/core/database/__init__.py @@ -0,0 +1,35 @@ +""" +PACKAGE: app.core.database +DESCRIPTION: Zentrale Schnittstelle für alle Datenbank-Operationen (Qdrant). + Bündelt Client-Initialisierung und Point-Konvertierung. +""" +from .qdrant import ( + QdrantConfig, + get_client, + ensure_collections, + ensure_payload_indexes, + collection_names +) +from .qdrant_points import ( + points_for_note, + points_for_chunks, + points_for_edges, + upsert_batch, + get_edges_for_sources, + search_chunks_by_vector +) + +# Öffentlicher Export für das Gesamtsystem +__all__ = [ + "QdrantConfig", + "get_client", + "ensure_collections", + "ensure_payload_indexes", + "collection_names", + "points_for_note", + "points_for_chunks", + "points_for_edges", + "upsert_batch", + "get_edges_for_sources", + "search_chunks_by_vector" +] \ No newline at end of file diff --git a/app/core/database/qdrant.py b/app/core/database/qdrant.py new file mode 100644 index 0000000..163c210 --- /dev/null +++ b/app/core/database/qdrant.py @@ -0,0 +1,169 @@ +""" +FILE: app/core/database/qdrant.py +DESCRIPTION: Qdrant-Client Factory und Schema-Management. + Erstellt Collections und Payload-Indizes. + MODULARISIERUNG: Verschoben in das database-Paket für WP-14. 
+VERSION: 2.2.1 +STATUS: Active +DEPENDENCIES: qdrant_client, dataclasses, os +""" +from __future__ import annotations + +import os +import logging +from dataclasses import dataclass +from typing import Optional, Tuple, Dict, List + +from qdrant_client import QdrantClient +from qdrant_client.http import models as rest + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Konfiguration +# --------------------------------------------------------------------------- + +@dataclass +class QdrantConfig: + """Konfigurationsobjekt für den Qdrant-Verbindungsaufbau.""" + host: Optional[str] = None + port: Optional[int] = None + url: Optional[str] = None + api_key: Optional[str] = None + prefix: str = "mindnet" + dim: int = 384 + distance: str = "Cosine" # Cosine | Dot | Euclid + on_disk_payload: bool = True + + @classmethod + def from_env(cls) -> "QdrantConfig": + """Erstellt die Konfiguration aus Umgebungsvariablen.""" + # Entweder URL ODER Host/Port, API-Key optional + url = os.getenv("QDRANT_URL") or None + host = os.getenv("QDRANT_HOST") or None + port = os.getenv("QDRANT_PORT") + port = int(port) if port else None + api_key = os.getenv("QDRANT_API_KEY") or None + prefix = os.getenv("COLLECTION_PREFIX") or "mindnet" + dim = int(os.getenv("VECTOR_DIM") or 384) + distance = os.getenv("DISTANCE", "Cosine") + on_disk_payload = (os.getenv("ON_DISK_PAYLOAD", "true").lower() == "true") + + return cls( + host=host, port=port, url=url, api_key=api_key, + prefix=prefix, dim=dim, distance=distance, on_disk_payload=on_disk_payload + ) + + +def get_client(cfg: QdrantConfig) -> QdrantClient: + """Initialisiert den Qdrant-Client basierend auf der Konfiguration.""" + # QdrantClient akzeptiert entweder url=... 
oder host/port + if cfg.url: + return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=60.0) + return QdrantClient(host=cfg.host or "127.0.0.1", port=cfg.port or 6333, api_key=cfg.api_key, timeout=60.0) + + +# --------------------------------------------------------------------------- +# Collections +# --------------------------------------------------------------------------- + +def collection_names(prefix: str) -> Tuple[str, str, str]: + """Gibt die standardisierten Collection-Namen zurück.""" + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + + +def _vector_params(dim: int, distance: str) -> rest.VectorParams: + """Erstellt Vektor-Parameter für das Collection-Schema.""" + # Distance: "Cosine" | "Dot" | "Euclid" + dist = getattr(rest.Distance, distance.capitalize(), rest.Distance.COSINE) + return rest.VectorParams(size=dim, distance=dist) + + +def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None: + """Legt notes, chunks und edges Collections an, falls nicht vorhanden.""" + notes, chunks, edges = collection_names(prefix) + + # notes + if not client.collection_exists(notes): + client.create_collection( + collection_name=notes, + vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")), + on_disk_payload=True, + ) + # chunks + if not client.collection_exists(chunks): + client.create_collection( + collection_name=chunks, + vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")), + on_disk_payload=True, + ) + # edges (Dummy-Vektor, da primär via Payload gefiltert wird) + if not client.collection_exists(edges): + client.create_collection( + collection_name=edges, + vectors_config=_vector_params(1, "Dot"), + on_disk_payload=True, + ) + + +# --------------------------------------------------------------------------- +# Payload-Indizes +# --------------------------------------------------------------------------- + +def _ensure_index(client: QdrantClient, collection: str, field: str, schema: 
rest.PayloadSchemaType) -> None: + """Idempotentes Anlegen eines Payload-Indexes für ein spezifisches Feld.""" + try: + client.create_payload_index(collection_name=collection, field_name=field, field_schema=schema, wait=True) + except Exception as e: + # Fehler ignorieren, falls Index bereits existiert + logger.debug(f"Index check for {field} in {collection}: {e}") + + +def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None: + """ + Stellt sicher, dass alle benötigten Payload-Indizes für die Suche existieren. + - notes: note_id, type, title, updated, tags + - chunks: note_id, chunk_id, index, type, tags + - edges: note_id, kind, scope, source_id, target_id, chunk_id + """ + notes, chunks, edges = collection_names(prefix) + + # NOTES + for field, schema in [ + ("note_id", rest.PayloadSchemaType.KEYWORD), + ("type", rest.PayloadSchemaType.KEYWORD), + ("title", rest.PayloadSchemaType.TEXT), + ("updated", rest.PayloadSchemaType.INTEGER), + ("tags", rest.PayloadSchemaType.KEYWORD), + ]: + _ensure_index(client, notes, field, schema) + + # CHUNKS + for field, schema in [ + ("note_id", rest.PayloadSchemaType.KEYWORD), + ("chunk_id", rest.PayloadSchemaType.KEYWORD), + ("index", rest.PayloadSchemaType.INTEGER), + ("type", rest.PayloadSchemaType.KEYWORD), + ("tags", rest.PayloadSchemaType.KEYWORD), + ]: + _ensure_index(client, chunks, field, schema) + + # EDGES + for field, schema in [ + ("note_id", rest.PayloadSchemaType.KEYWORD), + ("kind", rest.PayloadSchemaType.KEYWORD), + ("scope", rest.PayloadSchemaType.KEYWORD), + ("source_id", rest.PayloadSchemaType.KEYWORD), + ("target_id", rest.PayloadSchemaType.KEYWORD), + ("chunk_id", rest.PayloadSchemaType.KEYWORD), + ]: + _ensure_index(client, edges, field, schema) + + +__all__ = [ + "QdrantConfig", + "get_client", + "ensure_collections", + "ensure_payload_indexes", + "collection_names", +] \ No newline at end of file diff --git a/app/core/database/qdrant_points.py b/app/core/database/qdrant_points.py new file 
mode 100644 index 0000000..fd90403 --- /dev/null +++ b/app/core/database/qdrant_points.py @@ -0,0 +1,296 @@ +""" +FILE: app/core/database/qdrant_points.py +DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs. +VERSION: 1.5.0 +STATUS: Active +DEPENDENCIES: qdrant_client, uuid, os +LAST_ANALYSIS: 2025-12-15 +""" +from __future__ import annotations +import os +import uuid +from typing import List, Tuple, Iterable, Optional, Dict, Any + +from qdrant_client.http import models as rest +from qdrant_client import QdrantClient + +# --------------------- ID helpers --------------------- + +def _to_uuid(stable_key: str) -> str: + return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key)) + +def _names(prefix: str) -> Tuple[str, str, str]: + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + +# --------------------- Points builders --------------------- + +def points_for_note(prefix: str, note_payload: dict, note_vec: List[float] | None, dim: int) -> Tuple[str, List[rest.PointStruct]]: + notes_col, _, _ = _names(prefix) + vector = note_vec if note_vec is not None else [0.0] * int(dim) + raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id" + point_id = _to_uuid(raw_note_id) + pt = rest.PointStruct(id=point_id, vector=vector, payload=note_payload) + return notes_col, [pt] + +def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: List[List[float]]) -> Tuple[str, List[rest.PointStruct]]: + _, chunks_col, _ = _names(prefix) + points: List[rest.PointStruct] = [] + for i, (pl, vec) in enumerate(zip(chunk_payloads, vectors), start=1): + chunk_id = pl.get("chunk_id") or pl.get("id") + if not chunk_id: + note_id = pl.get("note_id") or pl.get("parent_note_id") or "missing-note" + chunk_id = f"{note_id}#{i}" + pl["chunk_id"] = chunk_id + point_id = _to_uuid(chunk_id) + points.append(rest.PointStruct(id=point_id, vector=vec, payload=pl)) + 
return chunks_col, points + +def _normalize_edge_payload(pl: dict) -> dict: + kind = pl.get("kind") or pl.get("edge_type") or "edge" + source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src" + target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt" + seq = pl.get("seq") or pl.get("order") or pl.get("index") + + pl.setdefault("kind", kind) + pl.setdefault("source_id", source_id) + pl.setdefault("target_id", target_id) + if seq is not None and "seq" not in pl: + pl["seq"] = seq + return pl + +def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]: + _, _, edges_col = _names(prefix) + points: List[rest.PointStruct] = [] + for raw in edge_payloads: + pl = _normalize_edge_payload(raw) + edge_id = pl.get("edge_id") + if not edge_id: + kind = pl.get("kind", "edge") + s = pl.get("source_id", "unknown-src") + t = pl.get("target_id", "unknown-tgt") + seq = pl.get("seq") or "" + edge_id = f"{kind}:{s}->{t}#{seq}" + pl["edge_id"] = edge_id + point_id = _to_uuid(edge_id) + points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl)) + return edges_col, points + +# --------------------- Vector schema & overrides --------------------- + +def _preferred_name(candidates: List[str]) -> str: + for k in ("text", "default", "embedding", "content"): + if k in candidates: + return k + return sorted(candidates)[0] + +def _env_override_for_collection(collection: str) -> Optional[str]: + """ + Returns: + - "__single__" to force single-vector + - concrete name (str) to force named-vector with that name + - None to auto-detect + """ + base = os.getenv("MINDNET_VECTOR_NAME") + if collection.endswith("_notes"): + base = os.getenv("NOTES_VECTOR_NAME", base) + elif collection.endswith("_chunks"): + base = os.getenv("CHUNKS_VECTOR_NAME", base) + elif collection.endswith("_edges"): + base = os.getenv("EDGES_VECTOR_NAME", base) + + if not base: + return None + val = base.strip() + if val.lower() in ("__single__", 
"single"): + return "__single__" + return val # concrete name + +def _get_vector_schema(client: QdrantClient, collection_name: str) -> dict: + """ + Return {"kind": "single", "size": int} or {"kind": "named", "names": [...], "primary": str}. + """ + try: + info = client.get_collection(collection_name=collection_name) + vecs = getattr(info, "vectors", None) + # Single-vector config + if hasattr(vecs, "size") and isinstance(vecs.size, int): + return {"kind": "single", "size": vecs.size} + # Named-vectors config (dict-like in .config) + cfg = getattr(vecs, "config", None) + if isinstance(cfg, dict) and cfg: + names = list(cfg.keys()) + if names: + return {"kind": "named", "names": names, "primary": _preferred_name(names)} + except Exception: + pass + return {"kind": "single", "size": None} + +def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruct]: + out: List[rest.PointStruct] = [] + for pt in points: + vec = getattr(pt, "vector", None) + if isinstance(vec, dict): + if name in vec: + out.append(pt) + else: + # take any existing entry; if empty dict fallback to [0.0] + fallback_vec = None + try: + fallback_vec = list(next(iter(vec.values()))) + except Exception: + fallback_vec = [0.0] + out.append(rest.PointStruct(id=pt.id, vector={name: fallback_vec}, payload=pt.payload)) + elif vec is not None: + out.append(rest.PointStruct(id=pt.id, vector={name: vec}, payload=pt.payload)) + else: + out.append(pt) + return out + +# --------------------- Qdrant ops --------------------- + +def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None: + if not points: + return + + # 1) ENV overrides come first + override = _env_override_for_collection(collection) + if override == "__single__": + client.upsert(collection_name=collection, points=points, wait=True) + return + elif isinstance(override, str): + client.upsert(collection_name=collection, points=_as_named(points, override), wait=True) + return + + # 2) Auto-detect 
schema + schema = _get_vector_schema(client, collection) + if schema.get("kind") == "named": + name = schema.get("primary") or _preferred_name(schema.get("names") or []) + client.upsert(collection_name=collection, points=_as_named(points, name), wait=True) + return + + # 3) Fallback single-vector + client.upsert(collection_name=collection, points=points, wait=True) + +# --- Optional search helpers --- + +def _filter_any(field: str, values: Iterable[str]) -> rest.Filter: + return rest.Filter(should=[rest.FieldCondition(key=field, match=rest.MatchValue(value=v)) for v in values]) + +def _merge_filters(*filters: Optional[rest.Filter]) -> Optional[rest.Filter]: + fs = [f for f in filters if f is not None] + if not fs: + return None + if len(fs) == 1: + return fs[0] + must = [] + for f in fs: + if getattr(f, "must", None): + must.extend(f.must) + if getattr(f, "should", None): + must.append(rest.Filter(should=f.should)) + return rest.Filter(must=must) + +def _filter_from_dict(filters: Optional[Dict[str, Any]]) -> Optional[rest.Filter]: + if not filters: + return None + parts = [] + for k, v in filters.items(): + if isinstance(v, (list, tuple, set)): + parts.append(_filter_any(k, [str(x) for x in v])) + else: + parts.append(rest.Filter(must=[rest.FieldCondition(key=k, match=rest.MatchValue(value=v))])) + return _merge_filters(*parts) + +def search_chunks_by_vector(client: QdrantClient, prefix: str, vector: List[float], top: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Tuple[str, float, dict]]: + _, chunks_col, _ = _names(prefix) + flt = _filter_from_dict(filters) + res = client.search(collection_name=chunks_col, query_vector=vector, limit=top, with_payload=True, with_vectors=False, query_filter=flt) + out: List[Tuple[str, float, dict]] = [] + for r in res: + out.append((str(r.id), float(r.score), dict(r.payload or {}))) + return out + + +# --- Edge retrieval helper --- + +def get_edges_for_sources( + client: QdrantClient, + prefix: str, + source_ids: 
Iterable[str], + edge_types: Optional[Iterable[str]] = None, + limit: int = 2048, +) -> List[Dict[str, Any]]: + """Retrieve edge payloads from the _edges collection. + + Args: + client: QdrantClient instance. + prefix: Mindnet collection prefix (e.g. "mindnet"). + source_ids: Iterable of source_id values (typically chunk_ids or note_ids). + edge_types: Optional iterable of edge kinds (e.g. ["references", "depends_on"]). If None, + all kinds are returned. + limit: Maximum number of edge payloads to return. + + Returns: + A list of edge payload dicts, e.g.: + { + "note_id": "...", + "chunk_id": "...", + "kind": "references" | "depends_on" | ..., + "scope": "chunk", + "source_id": "...", + "target_id": "...", + "rule_id": "...", + "confidence": 0.7, + ... + } + """ + source_ids = list(source_ids) + if not source_ids or limit <= 0: + return [] + + # Resolve collection name + _, _, edges_col = _names(prefix) + + # Build filter: source_id IN source_ids + src_filter = _filter_any("source_id", [str(s) for s in source_ids]) + + # Optional: kind IN edge_types + kind_filter = None + if edge_types: + kind_filter = _filter_any("kind", [str(k) for k in edge_types]) + + flt = _merge_filters(src_filter, kind_filter) + + out: List[Dict[str, Any]] = [] + next_page = None + remaining = int(limit) + + # Use paginated scroll API; we don't need vectors, only payloads. + while remaining > 0: + batch_limit = min(256, remaining) + res, next_page = client.scroll( + collection_name=edges_col, + scroll_filter=flt, + limit=batch_limit, + with_payload=True, + with_vectors=False, + offset=next_page, + ) + + # Paginate until the requested limit is exhausted or the server returns no further results.
+ + if not res: + break + + for r in res: + out.append(dict(r.payload or {})) + remaining -= 1 + if remaining <= 0: + break + + if next_page is None or remaining <= 0: + break + + return out \ No newline at end of file diff --git a/app/core/ingestion/__init__.py b/app/core/ingestion/__init__.py index 6b1f0db..5f2b804 100644 --- a/app/core/ingestion/__init__.py +++ b/app/core/ingestion/__init__.py @@ -1,9 +1,26 @@ """ FILE: app/core/ingestion/__init__.py DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService. -VERSION: 2.13.0 + AUDIT v2.13.10: Abschluss der Modularisierung (WP-14). + Bricht Zirkelbezüge durch Nutzung der neutralen registry.py auf. +VERSION: 2.13.10 """ +# Der IngestionService ist der primäre Orchestrator für den Datenimport from .ingestion_processor import IngestionService -from .ingestion_utils import extract_json_from_response, load_type_registry -__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file +# Hilfswerkzeuge für JSON-Verarbeitung und Konfigurations-Management +# load_type_registry wird hier re-exportiert, um die Abwärtskompatibilität zu wahren, +# obwohl die Implementierung nun in app.core.registry liegt. +from .ingestion_utils import ( + extract_json_from_response, + load_type_registry, + resolve_note_type +) + +# Öffentliche API des Pakets +__all__ = [ + "IngestionService", + "extract_json_from_response", + "load_type_registry", + "resolve_note_type" +] \ No newline at end of file diff --git a/app/core/ingestion/ingestion_chunk_payload.py b/app/core/ingestion/ingestion_chunk_payload.py index e235cbf..1c1ac51 100644 --- a/app/core/ingestion/ingestion_chunk_payload.py +++ b/app/core/ingestion/ingestion_chunk_payload.py @@ -1,33 +1,43 @@ """ FILE: app/core/ingestion/ingestion_chunk_payload.py DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. - Fix v2.4.2: Audit-Check (Cleanup pop, Config-Resolution Hierarchie). 
-VERSION: 2.4.2 + Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults. +VERSION: 2.4.3 STATUS: Active """ from __future__ import annotations from typing import Any, Dict, List, Optional +# ENTSCHEIDENDER FIX: Import der neutralen Registry-Logik zur Vermeidung von Circular Imports +from app.core.registry import load_type_registry + # --------------------------------------------------------------------------- # Resolution Helpers (Audited) # --------------------------------------------------------------------------- def _as_list(x): + """Sichert die Listen-Integrität für Metadaten wie Tags.""" if x is None: return [] return x if isinstance(x, list) else [x] def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any: - """Hierarchische Suche: Type > Default.""" + """ + Hierarchische Suche in der Registry: Type-Spezifisch > Globaler Default. + WP-14: Erlaubt dynamische Konfiguration via types.yaml. + """ types = reg.get("types", {}) if isinstance(types, dict): t_cfg = types.get(note_type, {}) if isinstance(t_cfg, dict): - val = t_cfg.get(key) or t_cfg.get(key.replace("ing", "")) # chunking_ vs chunk_ + # Fallback für Key-Varianten (z.B. 
chunking_profile vs chunk_profile) + val = t_cfg.get(key) or t_cfg.get(key.replace("ing", "")) if val is not None: return val + defs = reg.get("defaults", {}) or reg.get("global", {}) if isinstance(defs, dict): val = defs.get(key) or defs.get(key.replace("ing", "")) if val is not None: return val + return default # --------------------------------------------------------------------------- @@ -35,23 +45,34 @@ def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any: # --------------------------------------------------------------------------- def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]: - """Erstellt die Payloads für die Chunks inklusive Audit-Resolution.""" - if isinstance(note, dict) and "frontmatter" in note: fm = note["frontmatter"] - else: fm = note or {} + """ + Erstellt die Payloads für die Chunks inklusive Audit-Resolution. + Nutzt nun die zentrale Registry für alle Fallbacks. + """ + if isinstance(note, dict) and "frontmatter" in note: + fm = note["frontmatter"] + else: + fm = note or {} - reg = kwargs.get("types_cfg") or {} + # WP-14 Fix: Nutzt übergebene Registry oder lädt sie global + reg = kwargs.get("types_cfg") or load_type_registry() + note_type = fm.get("type") or "concept" title = fm.get("title") or fm.get("id") or "Untitled" tags = _as_list(fm.get("tags") or []) - # Audit: Resolution Hierarchie + # Audit: Resolution Hierarchie (Frontmatter > Registry) cp = fm.get("chunking_profile") or fm.get("chunk_profile") - if not cp: cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard") + if not cp: + cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard") rw = fm.get("retriever_weight") - if rw is None: rw = _resolve_val(note_type, reg, "retriever_weight", 1.0) - try: rw = float(rw) - except: rw = 1.0 + if rw is None: + rw = _resolve_val(note_type, reg, "retriever_weight", 1.0) + try: + rw = float(rw) + except: + rw = 
1.0 out: List[Dict[str, Any]] = [] for idx, ch in enumerate(chunks_from_chunker): @@ -84,9 +105,10 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke "chunk_profile": cp } - # Audit: Cleanup Pop (Alias Felder entfernen) + # Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern) for alias in ("chunk_num", "Chunk_Number"): pl.pop(alias, None) out.append(pl) + return out \ No newline at end of file diff --git a/app/core/ingestion/ingestion_db.py b/app/core/ingestion/ingestion_db.py index 9acf096..64cd57f 100644 --- a/app/core/ingestion/ingestion_db.py +++ b/app/core/ingestion/ingestion_db.py @@ -1,31 +1,39 @@ """ FILE: app/core/ingestion/ingestion_db.py DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung. + WP-14: Umstellung auf zentrale database-Infrastruktur. """ from typing import Optional, Tuple from qdrant_client import QdrantClient from qdrant_client.http import models as rest +# Import der modularisierten Namen-Logik zur Sicherstellung der Konsistenz +from app.core.database import collection_names + def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]: """Holt die Metadaten einer Note aus Qdrant via Scroll.""" + notes_col, _, _ = collection_names(prefix) try: f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - pts, _ = client.scroll(collection_name=f"{prefix}_notes", scroll_filter=f, limit=1, with_payload=True) + pts, _ = client.scroll(collection_name=notes_col, scroll_filter=f, limit=1, with_payload=True) return pts[0].payload if pts else None except: return None def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]: """Prüft Qdrant aktiv auf vorhandene Chunks und Edges.""" + _, chunks_col, edges_col = collection_names(prefix) try: f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - c_pts, _ = 
client.scroll(collection_name=f"{prefix}_chunks", scroll_filter=f, limit=1) - e_pts, _ = client.scroll(collection_name=f"{prefix}_edges", scroll_filter=f, limit=1) + c_pts, _ = client.scroll(collection_name=chunks_col, scroll_filter=f, limit=1) + e_pts, _ = client.scroll(collection_name=edges_col, scroll_filter=f, limit=1) return (not bool(c_pts)), (not bool(e_pts)) except: return True, True def purge_artifacts(client: QdrantClient, prefix: str, note_id: str): """Löscht verwaiste Chunks/Edges vor einem Re-Import.""" + _, chunks_col, edges_col = collection_names(prefix) f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - for suffix in ["chunks", "edges"]: - try: client.delete(collection_name=f"{prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f)) + # Iteration über die nun zentral verwalteten Collection-Namen + for col in [chunks_col, edges_col]: + try: client.delete(collection_name=col, points_selector=rest.FilterSelector(filter=f)) except: pass \ No newline at end of file diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py index 28c5301..d41410b 100644 --- a/app/core/ingestion/ingestion_note_payload.py +++ b/app/core/ingestion/ingestion_note_payload.py @@ -3,8 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. FEATURES: - Multi-Hash (body/full) für flexible Change Detection. - - Fix v2.4.3: Vollständiger Audit-Check (Env-Vars, JSON-Validation, Edge-Defaults). -VERSION: 2.4.3 + - Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults. 
+VERSION: 2.4.4 STATUS: Active """ from __future__ import annotations @@ -14,6 +14,9 @@ import json import pathlib import hashlib +# Import der zentralen Registry-Logik +from app.core.registry import load_type_registry + # --------------------------------------------------------------------------- # Helper # --------------------------------------------------------------------------- @@ -42,12 +45,13 @@ def _compute_hash(content: str) -> str: return hashlib.sha256(content.encode("utf-8")).hexdigest() def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: - """Generiert den Hash-Input-String.""" + """Generiert den Hash-Input-String basierend auf Body oder Metadaten.""" body = str(n.get("body") or "") if mode == "body": return body if mode == "full": fm = n.get("frontmatter") or {} meta_parts = [] + # Sortierte Liste für deterministische Hashes for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") @@ -55,13 +59,13 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: return body def _cfg_for_type(note_type: str, reg: dict) -> dict: - """Extrahiert Typ-spezifische Config.""" + """Extrahiert Typ-spezifische Config aus der Registry.""" if not isinstance(reg, dict): return {} types = reg.get("types") if isinstance(reg.get("types"), dict) else reg return types.get(note_type, {}) if isinstance(types, dict) else {} def _cfg_defaults(reg: dict) -> dict: - """Extrahiert globale Default-Werte.""" + """Extrahiert globale Default-Werte aus der Registry.""" if not isinstance(reg, dict): return {} for key in ("defaults", "default", "global"): v = reg.get(key) @@ -73,9 +77,14 @@ def _cfg_defaults(reg: dict) -> dict: # --------------------------------------------------------------------------- def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: - """Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.""" 
+ """ + Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung. + WP-14: Nutzt nun die zentrale Registry für alle Fallbacks. + """ n = _as_dict(note) - reg = kwargs.get("types_cfg") or {} + + # Nutzt übergebene Registry oder lädt sie global + reg = kwargs.get("types_cfg") or load_type_registry() hash_source = kwargs.get("hash_source", "parsed") hash_normalize = kwargs.get("hash_normalize", "canonical") @@ -84,21 +93,26 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: cfg_type = _cfg_for_type(note_type, reg) cfg_def = _cfg_defaults(reg) + ingest_cfg = reg.get("ingestion_settings", {}) # --- retriever_weight Audit --- + # Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)) retriever_weight = fm.get("retriever_weight") if retriever_weight is None: retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", default_rw)) - try: retriever_weight = float(retriever_weight) - except: retriever_weight = default_rw + try: + retriever_weight = float(retriever_weight) + except: + retriever_weight = default_rw # --- chunk_profile Audit --- + # Nutzt nun primär die ingestion_settings aus der Registry chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") if chunk_profile is None: - chunk_profile = cfg_type.get("chunking_profile") + chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile") if chunk_profile is None: - chunk_profile = cfg_def.get("chunking_profile", "sliding_standard") + chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard")) # --- edge_defaults --- edge_defaults = fm.get("edge_defaults") @@ -124,17 +138,20 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: } # --- MULTI-HASH --- + # Generiert Hashes für Change Detection for mode in ["body", "full"]: content = _get_hash_source_content(n, mode) 
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content) - # Metadaten + # Metadaten Anreicherung tags = fm.get("tags") or fm.get("keywords") or n.get("tags") if tags: payload["tags"] = _ensure_list(tags) if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) + for k in ("created", "modified", "date"): v = fm.get(k) or n.get(k) if v: payload[k] = str(v) + if n.get("body"): payload["fulltext"] = str(n["body"]) # Final JSON Validation Audit diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index 009f1fb..92a2a02 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -1,11 +1,11 @@ """ FILE: app/core/ingestion/ingestion_processor.py DESCRIPTION: Der zentrale IngestionService (Orchestrator). - WP-14: Vollständig modularisiert. + WP-14: Modularisierung der Datenbank-Ebene (app.core.database). WP-15b: Two-Pass Workflow mit globalem Kontext-Cache. WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert. - AUDIT v2.13.7: Synchronisierung des Context-Scanners mit der Registry (WP-14). -VERSION: 2.13.7 + AUDIT v2.13.10: Umstellung auf app.core.database Infrastruktur. 
+VERSION: 2.13.10 STATUS: Active """ import logging @@ -19,8 +19,10 @@ from app.core.parser import ( validate_required_frontmatter, NoteContext ) from app.core.chunking import assemble_chunks -from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes -from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch + +# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene +from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes +from app.core.database.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch # Services from app.services.embeddings_client import EmbeddingsClient @@ -44,12 +46,13 @@ logger = logging.getLogger(__name__) class IngestionService: def __init__(self, collection_prefix: str = None): - """Initialisiert den Service und stellt die DB-Verbindung bereit.""" + """Initialisiert den Service und nutzt die neue database-Infrastruktur.""" from app.config import get_settings self.settings = get_settings() self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX self.cfg = QdrantConfig.from_env() + # Synchronisierung der Konfiguration mit dem Instanz-Präfix self.cfg.prefix = self.prefix self.client = get_client(self.cfg) self.dim = self.settings.VECTOR_SIZE @@ -61,6 +64,7 @@ class IngestionService: self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache try: + # Aufruf der modularisierten Schema-Logik ensure_collections(self.client, self.prefix, self.dim) ensure_payload_indexes(self.client, self.prefix) except Exception as e: @@ -75,8 +79,7 @@ class IngestionService: logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...") for path in file_paths: try: - # ANPASSUNG: Übergabe der Registry für dynamische Scan-Parameter (WP-14) - # Ermöglicht die Nutzung von summary_settings aus types.yaml + # Übergabe der Registry für dynamische Scan-Tiefe ctx 
= pre_scan_markdown(path, registry=self.registry) if ctx: # Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname) @@ -110,7 +113,7 @@ class IngestionService: except Exception as e: return {**result, "error": f"Validation failed: {str(e)}"} - # Dynamischer Lifecycle-Filter aus der Registry + # Dynamischer Lifecycle-Filter aus der Registry (WP-14) ingest_cfg = self.registry.get("ingestion_settings", {}) ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"]) @@ -180,7 +183,7 @@ class IngestionService: context={"file": file_path, "note_id": note_id, "line": e.get("line", "system")} ) - # 4. DB Upsert + # 4. DB Upsert via modularisierter Points-Logik if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id) diff --git a/app/core/ingestion/ingestion_validation.py b/app/core/ingestion/ingestion_validation.py index 038eebf..f7eea5c 100644 --- a/app/core/ingestion/ingestion_validation.py +++ b/app/core/ingestion/ingestion_validation.py @@ -1,11 +1,15 @@ """ FILE: app/core/ingestion/ingestion_validation.py DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache. + AUDIT v2.12.3: Integration der zentralen Text-Bereinigung (WP-14). """ import logging from typing import Dict, Any from app.core.parser import NoteContext +# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports +from app.core.registry import clean_llm_text + logger = logging.getLogger(__name__) async def validate_edge_candidate( @@ -15,7 +19,10 @@ async def validate_edge_candidate( llm_service: Any, provider: str ) -> bool: - """WP-15b: Validiert einen Kandidaten semantisch gegen das Ziel im Cache.""" + """ + WP-15b: Validiert einen Kandidaten semantisch gegen das Ziel im Cache. + Nutzt clean_llm_text zur Entfernung von Steuerzeichen vor der Auswertung. 
+ """ target_id = edge.get("to") target_ctx = batch_cache.get(target_id) @@ -40,7 +47,13 @@ async def validate_edge_candidate( edge_kind=edge.get("kind", "related_to") ) - response = await llm_service.generate_raw_response(prompt, priority="background") + # Die Antwort vom Service anfordern + raw_response = await llm_service.generate_raw_response(prompt, priority="background") + + # WP-14 Fix: Zusätzliche Bereinigung zur Sicherstellung der Interpretierbarkeit + response = clean_llm_text(raw_response) + + # Semantische Prüfung des Ergebnisses is_valid = "YES" in response.upper() if is_valid: @@ -50,4 +63,5 @@ async def validate_edge_candidate( return is_valid except Exception as e: logger.warning(f"⚠️ Validation error for {target_id}: {e}") + # Im Zweifel (Timeout/Fehler) erlauben wir die Kante, um Datenverlust zu vermeiden return True \ No newline at end of file diff --git a/app/core/qdrant.py b/app/core/qdrant.py index 950a75d..80f1c85 100644 --- a/app/core/qdrant.py +++ b/app/core/qdrant.py @@ -1,161 +1,22 @@ """ FILE: app/core/qdrant.py -DESCRIPTION: Qdrant-Client Factory und Schema-Management. Erstellt Collections und Payload-Indizes. -VERSION: 2.2.0 -STATUS: Active -DEPENDENCIES: qdrant_client, dataclasses, os -LAST_ANALYSIS: 2025-12-15 +DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14). + Leitet alle Aufrufe an das neue database-Paket weiter. 
+STATUS: Proxy (Legacy-Support) """ -from __future__ import annotations - -import os -from dataclasses import dataclass -from typing import Optional, Tuple, Dict, List - -from qdrant_client import QdrantClient -from qdrant_client.http import models as rest - - -# --------------------------------------------------------------------------- -# Konfiguration -# --------------------------------------------------------------------------- - -@dataclass -class QdrantConfig: - host: Optional[str] = None - port: Optional[int] = None - url: Optional[str] = None - api_key: Optional[str] = None - prefix: str = "mindnet" - dim: int = 384 - distance: str = "Cosine" # Cosine | Dot | Euclid - on_disk_payload: bool = True - - @classmethod - def from_env(cls) -> "QdrantConfig": - # Entweder URL ODER Host/Port, API-Key optional - url = os.getenv("QDRANT_URL") or None - host = os.getenv("QDRANT_HOST") or None - port = os.getenv("QDRANT_PORT") - port = int(port) if port else None - api_key = os.getenv("QDRANT_API_KEY") or None - prefix = os.getenv("COLLECTION_PREFIX") or "mindnet" - dim = int(os.getenv("VECTOR_DIM") or 384) - distance = os.getenv("DISTANCE", "Cosine") - on_disk_payload = (os.getenv("ON_DISK_PAYLOAD", "true").lower() == "true") - return cls( - host=host, port=port, url=url, api_key=api_key, - prefix=prefix, dim=dim, distance=distance, on_disk_payload=on_disk_payload - ) - - -def get_client(cfg: QdrantConfig) -> QdrantClient: - # QdrantClient akzeptiert entweder url=... 
oder host/port - if cfg.url: - return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=60.0) - return QdrantClient(host=cfg.host or "127.0.0.1", port=cfg.port or 6333, api_key=cfg.api_key, timeout=60.0) - - -# --------------------------------------------------------------------------- -# Collections -# --------------------------------------------------------------------------- - -def collection_names(prefix: str) -> Tuple[str, str, str]: - return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" - - -def _vector_params(dim: int, distance: str) -> rest.VectorParams: - # Distance: "Cosine" | "Dot" | "Euclid" - dist = getattr(rest.Distance, distance.capitalize(), rest.Distance.COSINE) - return rest.VectorParams(size=dim, distance=dist) - - -def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None: - """Legt mindnet_notes, mindnet_chunks, mindnet_edges an (falls nicht vorhanden).""" - notes, chunks, edges = collection_names(prefix) - - # notes - if not client.collection_exists(notes): - client.create_collection( - collection_name=notes, - vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")), - on_disk_payload=True, - ) - # chunks - if not client.collection_exists(chunks): - client.create_collection( - collection_name=chunks, - vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")), - on_disk_payload=True, - ) - # edges (Dummy-Vektor, Filter via Payload) - if not client.collection_exists(edges): - client.create_collection( - collection_name=edges, - vectors_config=_vector_params(1, "Dot"), - on_disk_payload=True, - ) - - -# --------------------------------------------------------------------------- -# Payload-Indizes -# --------------------------------------------------------------------------- - -def _ensure_index(client: QdrantClient, collection: str, field: str, schema: rest.PayloadSchemaType) -> None: - """Idempotentes Anlegen eines Payload-Indexes für ein Feld.""" - try: - 
client.create_payload_index(collection_name=collection, field_name=field, field_schema=schema, wait=True) - except Exception as e: - # Fehler ignorieren, falls Index bereits existiert oder Server "already indexed" meldet. - # Für Debugging ggf. Logging ergänzen. - _ = e - - -def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None: - """ - Stellt sicher, dass alle benötigten Payload-Indizes existieren. - - notes: note_id(KEYWORD), type(KEYWORD), title(TEXT), updated(INTEGER), tags(KEYWORD) - - chunks: note_id(KEYWORD), chunk_id(KEYWORD), index(INTEGER), type(KEYWORD), tags(KEYWORD) - - edges: note_id(KEYWORD), kind(KEYWORD), scope(KEYWORD), source_id(KEYWORD), target_id(KEYWORD), chunk_id(KEYWORD) - """ - notes, chunks, edges = collection_names(prefix) - - # NOTES - for field, schema in [ - ("note_id", rest.PayloadSchemaType.KEYWORD), - ("type", rest.PayloadSchemaType.KEYWORD), - ("title", rest.PayloadSchemaType.TEXT), - ("updated", rest.PayloadSchemaType.INTEGER), - ("tags", rest.PayloadSchemaType.KEYWORD), - ]: - _ensure_index(client, notes, field, schema) - - # CHUNKS - for field, schema in [ - ("note_id", rest.PayloadSchemaType.KEYWORD), - ("chunk_id", rest.PayloadSchemaType.KEYWORD), - ("index", rest.PayloadSchemaType.INTEGER), - ("type", rest.PayloadSchemaType.KEYWORD), - ("tags", rest.PayloadSchemaType.KEYWORD), - ]: - _ensure_index(client, chunks, field, schema) - - # EDGES - for field, schema in [ - ("note_id", rest.PayloadSchemaType.KEYWORD), - ("kind", rest.PayloadSchemaType.KEYWORD), - ("scope", rest.PayloadSchemaType.KEYWORD), - ("source_id", rest.PayloadSchemaType.KEYWORD), - ("target_id", rest.PayloadSchemaType.KEYWORD), - ("chunk_id", rest.PayloadSchemaType.KEYWORD), - ]: - _ensure_index(client, edges, field, schema) - +from .database.qdrant import ( + QdrantConfig, + get_client, + ensure_collections, + ensure_payload_indexes, + collection_names +) +# Re-Export für 100% Kompatibilität __all__ = [ "QdrantConfig", "get_client", 
"ensure_collections", "ensure_payload_indexes", "collection_names", -] +] \ No newline at end of file diff --git a/app/core/qdrant_points.py b/app/core/qdrant_points.py index 9c4b878..d136232 100644 --- a/app/core/qdrant_points.py +++ b/app/core/qdrant_points.py @@ -1,292 +1,24 @@ """ FILE: app/core/qdrant_points.py -DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs. -VERSION: 1.5.0 -STATUS: Active -DEPENDENCIES: qdrant_client, uuid, os -LAST_ANALYSIS: 2025-12-15 +DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14). + Leitet Point-Operationen an das neue database-Paket weiter. +STATUS: Proxy (Legacy-Support) """ -from __future__ import annotations -import os -import uuid -from typing import List, Tuple, Iterable, Optional, Dict, Any +from .database.qdrant_points import ( + points_for_note, + points_for_chunks, + points_for_edges, + upsert_batch, + get_edges_for_sources, + search_chunks_by_vector +) -from qdrant_client.http import models as rest -from qdrant_client import QdrantClient - -# --------------------- ID helpers --------------------- - -def _to_uuid(stable_key: str) -> str: - return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key)) - -def _names(prefix: str) -> Tuple[str, str, str]: - return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" - -# --------------------- Points builders --------------------- - -def points_for_note(prefix: str, note_payload: dict, note_vec: List[float] | None, dim: int) -> Tuple[str, List[rest.PointStruct]]: - notes_col, _, _ = _names(prefix) - vector = note_vec if note_vec is not None else [0.0] * int(dim) - raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id" - point_id = _to_uuid(raw_note_id) - pt = rest.PointStruct(id=point_id, vector=vector, payload=note_payload) - return notes_col, [pt] - -def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: 
List[List[float]]) -> Tuple[str, List[rest.PointStruct]]: - _, chunks_col, _ = _names(prefix) - points: List[rest.PointStruct] = [] - for i, (pl, vec) in enumerate(zip(chunk_payloads, vectors), start=1): - chunk_id = pl.get("chunk_id") or pl.get("id") - if not chunk_id: - note_id = pl.get("note_id") or pl.get("parent_note_id") or "missing-note" - chunk_id = f"{note_id}#{i}" - pl["chunk_id"] = chunk_id - point_id = _to_uuid(chunk_id) - points.append(rest.PointStruct(id=point_id, vector=vec, payload=pl)) - return chunks_col, points - -def _normalize_edge_payload(pl: dict) -> dict: - kind = pl.get("kind") or pl.get("edge_type") or "edge" - source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src" - target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt" - seq = pl.get("seq") or pl.get("order") or pl.get("index") - - pl.setdefault("kind", kind) - pl.setdefault("source_id", source_id) - pl.setdefault("target_id", target_id) - if seq is not None and "seq" not in pl: - pl["seq"] = seq - return pl - -def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]: - _, _, edges_col = _names(prefix) - points: List[rest.PointStruct] = [] - for raw in edge_payloads: - pl = _normalize_edge_payload(raw) - edge_id = pl.get("edge_id") - if not edge_id: - kind = pl.get("kind", "edge") - s = pl.get("source_id", "unknown-src") - t = pl.get("target_id", "unknown-tgt") - seq = pl.get("seq") or "" - edge_id = f"{kind}:{s}->{t}#{seq}" - pl["edge_id"] = edge_id - point_id = _to_uuid(edge_id) - points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl)) - return edges_col, points - -# --------------------- Vector schema & overrides --------------------- - -def _preferred_name(candidates: List[str]) -> str: - for k in ("text", "default", "embedding", "content"): - if k in candidates: - return k - return sorted(candidates)[0] - -def _env_override_for_collection(collection: str) -> Optional[str]: - """ - Returns: - - 
"__single__" to force single-vector - - concrete name (str) to force named-vector with that name - - None to auto-detect - """ - base = os.getenv("MINDNET_VECTOR_NAME") - if collection.endswith("_notes"): - base = os.getenv("NOTES_VECTOR_NAME", base) - elif collection.endswith("_chunks"): - base = os.getenv("CHUNKS_VECTOR_NAME", base) - elif collection.endswith("_edges"): - base = os.getenv("EDGES_VECTOR_NAME", base) - - if not base: - return None - val = base.strip() - if val.lower() in ("__single__", "single"): - return "__single__" - return val # concrete name - -def _get_vector_schema(client: QdrantClient, collection_name: str) -> dict: - """ - Return {"kind": "single", "size": int} or {"kind": "named", "names": [...], "primary": str}. - """ - try: - info = client.get_collection(collection_name=collection_name) - vecs = getattr(info, "vectors", None) - # Single-vector config - if hasattr(vecs, "size") and isinstance(vecs.size, int): - return {"kind": "single", "size": vecs.size} - # Named-vectors config (dict-like in .config) - cfg = getattr(vecs, "config", None) - if isinstance(cfg, dict) and cfg: - names = list(cfg.keys()) - if names: - return {"kind": "named", "names": names, "primary": _preferred_name(names)} - except Exception: - pass - return {"kind": "single", "size": None} - -def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruct]: - out: List[rest.PointStruct] = [] - for pt in points: - vec = getattr(pt, "vector", None) - if isinstance(vec, dict): - if name in vec: - out.append(pt) - else: - # take any existing entry; if empty dict fallback to [0.0] - fallback_vec = None - try: - fallback_vec = list(next(iter(vec.values()))) - except Exception: - fallback_vec = [0.0] - out.append(rest.PointStruct(id=pt.id, vector={name: fallback_vec}, payload=pt.payload)) - elif vec is not None: - out.append(rest.PointStruct(id=pt.id, vector={name: vec}, payload=pt.payload)) - else: - out.append(pt) - return out - -# --------------------- 
Qdrant ops --------------------- - -def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None: - if not points: - return - - # 1) ENV overrides come first - override = _env_override_for_collection(collection) - if override == "__single__": - client.upsert(collection_name=collection, points=points, wait=True) - return - elif isinstance(override, str): - client.upsert(collection_name=collection, points=_as_named(points, override), wait=True) - return - - # 2) Auto-detect schema - schema = _get_vector_schema(client, collection) - if schema.get("kind") == "named": - name = schema.get("primary") or _preferred_name(schema.get("names") or []) - client.upsert(collection_name=collection, points=_as_named(points, name), wait=True) - return - - # 3) Fallback single-vector - client.upsert(collection_name=collection, points=points, wait=True) - -# --- Optional search helpers --- - -def _filter_any(field: str, values: Iterable[str]) -> rest.Filter: - return rest.Filter(should=[rest.FieldCondition(key=field, match=rest.MatchValue(value=v)) for v in values]) - -def _merge_filters(*filters: Optional[rest.Filter]) -> Optional[rest.Filter]: - fs = [f for f in filters if f is not None] - if not fs: - return None - if len(fs) == 1: - return fs[0] - must = [] - for f in fs: - if getattr(f, "must", None): - must.extend(f.must) - if getattr(f, "should", None): - must.append(rest.Filter(should=f.should)) - return rest.Filter(must=must) - -def _filter_from_dict(filters: Optional[Dict[str, Any]]) -> Optional[rest.Filter]: - if not filters: - return None - parts = [] - for k, v in filters.items(): - if isinstance(v, (list, tuple, set)): - parts.append(_filter_any(k, [str(x) for x in v])) - else: - parts.append(rest.Filter(must=[rest.FieldCondition(key=k, match=rest.MatchValue(value=v))])) - return _merge_filters(*parts) - -def search_chunks_by_vector(client: QdrantClient, prefix: str, vector: List[float], top: int = 10, filters: Optional[Dict[str, Any]] = 
None) -> List[Tuple[str, float, dict]]: - _, chunks_col, _ = _names(prefix) - flt = _filter_from_dict(filters) - res = client.search(collection_name=chunks_col, query_vector=vector, limit=top, with_payload=True, with_vectors=False, query_filter=flt) - out: List[Tuple[str, float, dict]] = [] - for r in res: - out.append((str(r.id), float(r.score), dict(r.payload or {}))) - return out - - -# --- Edge retrieval helper --- - -def get_edges_for_sources( - client: QdrantClient, - prefix: str, - source_ids: Iterable[str], - edge_types: Optional[Iterable[str]] = None, - limit: int = 2048, -) -> List[Dict[str, Any]]: - """Retrieve edge payloads from the _edges collection. - - Args: - client: QdrantClient instance. - prefix: Mindnet collection prefix (e.g. "mindnet"). - source_ids: Iterable of source_id values (typically chunk_ids or note_ids). - edge_types: Optional iterable of edge kinds (e.g. ["references", "depends_on"]). If None, - all kinds are returned. - limit: Maximum number of edge payloads to return. - - Returns: - A list of edge payload dicts, e.g.: - { - "note_id": "...", - "chunk_id": "...", - "kind": "references" | "depends_on" | ..., - "scope": "chunk", - "source_id": "...", - "target_id": "...", - "rule_id": "...", - "confidence": 0.7, - ... - } - """ - source_ids = list(source_ids) - if not source_ids or limit <= 0: - return [] - - # Resolve collection name - _, _, edges_col = _names(prefix) - - # Build filter: source_id IN source_ids - src_filter = _filter_any("source_id", [str(s) for s in source_ids]) - - # Optional: kind IN edge_types - kind_filter = None - if edge_types: - kind_filter = _filter_any("kind", [str(k) for k in edge_types]) - - flt = _merge_filters(src_filter, kind_filter) - - out: List[Dict[str, Any]] = [] - next_page = None - remaining = int(limit) - - # Use paginated scroll API; we don't need vectors, only payloads. 
- while remaining > 0: - batch_limit = min(256, remaining) - res, next_page = client.scroll( - collection_name=edges_col, - scroll_filter=flt, - limit=batch_limit, - with_payload=True, - with_vectors=False, - offset=next_page, - ) - - if not res: - break - - for r in res: - out.append(dict(r.payload or {})) - remaining -= 1 - if remaining <= 0: - break - - if next_page is None or remaining <= 0: - break - - return out +# Re-Export für 100% Kompatibilität +__all__ = [ + "points_for_note", + "points_for_chunks", + "points_for_edges", + "upsert_batch", + "get_edges_for_sources", + "search_chunks_by_vector" +] \ No newline at end of file From 84909119586eaef4c526c4dce241af7d008a033f Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 20:26:00 +0100 Subject: [PATCH 21/23] modularisierung --- app/core/graph/graph_db_adapter.py | 13 +- app/core/retrieval/__init__.py | 25 ++ app/core/retrieval/retriever.py | 312 +++++++++++++++++++++++ app/core/retrieval/retriever_scoring.py | 121 +++++++++ app/core/retriever.py | 314 +----------------------- app/core/retriever_scoring.py | 130 ++-------- 6 files changed, 491 insertions(+), 424 deletions(-) create mode 100644 app/core/retrieval/__init__.py create mode 100644 app/core/retrieval/retriever.py create mode 100644 app/core/retrieval/retriever_scoring.py diff --git a/app/core/graph/graph_db_adapter.py b/app/core/graph/graph_db_adapter.py index e3fff2f..6ebbee4 100644 --- a/app/core/graph/graph_db_adapter.py +++ b/app/core/graph/graph_db_adapter.py @@ -1,11 +1,14 @@ """ FILE: app/core/graph/graph_db_adapter.py DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen. + AUDIT v1.1.0: Nutzt nun die zentrale database-Infrastruktur für Namen. 
""" from typing import List, Dict, Optional from qdrant_client import QdrantClient from qdrant_client.http import models as rest -from app.core.qdrant import collection_names + +# ENTSCHEIDENDER FIX: Nutzt die neue Infrastruktur für konsistente Collection-Namen +from app.core.database import collection_names def fetch_edges_from_qdrant( client: QdrantClient, @@ -21,6 +24,7 @@ def fetch_edges_from_qdrant( if not seeds or limit <= 0: return [] + # Konsistente Namensauflösung via database-Paket _, _, edges_col = collection_names(prefix) seed_conditions = [] @@ -40,11 +44,14 @@ def fetch_edges_from_qdrant( type_filter = rest.Filter(should=type_conds) must = [] - if seeds_filter: must.append(seeds_filter) - if type_filter: must.append(type_filter) + if seeds_filter: + must.append(seeds_filter) + if type_filter: + must.append(type_filter) flt = rest.Filter(must=must) if must else None + # Abfrage via Qdrant Scroll API pts, _ = client.scroll( collection_name=edges_col, scroll_filter=flt, diff --git a/app/core/retrieval/__init__.py b/app/core/retrieval/__init__.py new file mode 100644 index 0000000..3b66fb4 --- /dev/null +++ b/app/core/retrieval/__init__.py @@ -0,0 +1,25 @@ +""" +PACKAGE: app.core.retrieval +DESCRIPTION: Zentrale Schnittstelle für Retrieval-Operationen (Vektor- & Graph-Suche). + Bündelt Suche und mathematische Scoring-Engine. 
+""" +from .retriever import ( + Retriever, + hybrid_retrieve, + semantic_retrieve +) + +from .retriever_scoring import ( + get_weights, + compute_wp22_score, + get_status_multiplier +) + +__all__ = [ + "Retriever", + "hybrid_retrieve", + "semantic_retrieve", + "get_weights", + "compute_wp22_score", + "get_status_multiplier" +] \ No newline at end of file diff --git a/app/core/retrieval/retriever.py b/app/core/retrieval/retriever.py new file mode 100644 index 0000000..a6c3357 --- /dev/null +++ b/app/core/retrieval/retriever.py @@ -0,0 +1,312 @@ +""" +FILE: app/core/retrieval/retriever.py +DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion. + Nutzt retriever_scoring.py für die WP-22 Logik. + MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14. +VERSION: 0.6.16 +STATUS: Active +DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter +""" +from __future__ import annotations + +import os +import time +import logging +from typing import Any, Dict, List, Tuple, Iterable, Optional + +from app.config import get_settings +from app.models.dto import ( + QueryRequest, QueryResponse, QueryHit, + Explanation, ScoreBreakdown, Reason, EdgeDTO +) + +# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene +import app.core.database.qdrant as qdr +import app.core.database.qdrant_points as qp + +import app.services.embeddings_client as ec +import app.core.graph_adapter as ga + +# Mathematische Engine importieren (Bleibt vorerst in app.core) +from app.core.retriever_scoring import get_weights, compute_wp22_score + +logger = logging.getLogger(__name__) + +# ============================================================================== +# 1. 
CORE HELPERS & CONFIG LOADERS +# ============================================================================== + +def _get_client_and_prefix() -> Tuple[Any, str]: + """Initialisiert Qdrant Client und lädt Collection-Prefix via database-Paket.""" + cfg = qdr.QdrantConfig.from_env() + return qdr.get_client(cfg), cfg.prefix + + +def _get_query_vector(req: QueryRequest) -> List[float]: + """ + Vektorisiert die Anfrage. + FIX: Enthält try-except Block für unterschiedliche Signaturen von ec.embed_text. + """ + if req.query_vector: + return list(req.query_vector) + if not req.query: + raise ValueError("Kein Text oder Vektor für die Suche angegeben.") + + settings = get_settings() + + try: + # Versuch mit modernem Interface (WP-03 kompatibel) + return ec.embed_text(req.query, model_name=settings.MODEL_NAME) + except TypeError: + # Fallback für Signaturen, die 'model_name' nicht als Keyword akzeptieren + logger.debug("ec.embed_text does not accept 'model_name' keyword. Falling back.") + return ec.embed_text(req.query) + + +def _semantic_hits( + client: Any, + prefix: str, + vector: List[float], + top_k: int, + filters: Optional[Dict] = None +) -> List[Tuple[str, float, Dict[str, Any]]]: + """Führt die Vektorsuche via database-Points-Modul durch.""" + raw_hits = qp.search_chunks_by_vector(client, prefix, vector, top=top_k, filters=filters) + # Strikte Typkonvertierung für Stabilität + return [(str(hit[0]), float(hit[1]), dict(hit[2] or {})) for hit in raw_hits] + +# ============================================================================== +# 2. 
EXPLANATION LAYER (DEBUG & VERIFIABILITY) +# ============================================================================== + +def _build_explanation( + semantic_score: float, + payload: Dict[str, Any], + scoring_debug: Dict[str, Any], + subgraph: Optional[ga.Subgraph], + target_note_id: Optional[str], + applied_boosts: Optional[Dict[str, float]] = None +) -> Explanation: + """ + Transformiert mathematische Scores und Graph-Signale in eine menschenlesbare Erklärung. + Behebt Pydantic ValidationErrors durch explizite String-Sicherung. + """ + _, edge_w_cfg, _ = get_weights() + base_val = scoring_debug["base_val"] + + # 1. Detaillierter mathematischer Breakdown + breakdown = ScoreBreakdown( + semantic_contribution=base_val, + edge_contribution=base_val * scoring_debug["edge_impact_final"], + centrality_contribution=base_val * scoring_debug["cent_impact_final"], + raw_semantic=semantic_score, + raw_edge_bonus=scoring_debug["edge_bonus"], + raw_centrality=scoring_debug["cent_bonus"], + node_weight=float(payload.get("retriever_weight", 1.0)), + status_multiplier=scoring_debug["status_multiplier"], + graph_boost_factor=scoring_debug["graph_boost_factor"] + ) + + reasons: List[Reason] = [] + edges_dto: List[EdgeDTO] = [] + + # 2. Gründe für Semantik hinzufügen + if semantic_score > 0.85: + reasons.append(Reason(kind="semantic", message="Sehr hohe textuelle Übereinstimmung.", score_impact=base_val)) + elif semantic_score > 0.70: + reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val)) + + # 3. Gründe für Typ und Lifecycle + type_weight = float(payload.get("retriever_weight", 1.0)) + if type_weight != 1.0: + msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert" + reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0))) + + # 4. 
Kanten-Verarbeitung (Graph-Intelligence) + if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0: + raw_edges = [] + if hasattr(subgraph, "get_incoming_edges"): + raw_edges.extend(subgraph.get_incoming_edges(target_note_id) or []) + if hasattr(subgraph, "get_outgoing_edges"): + raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or []) + + for edge in raw_edges: + # FIX: Zwingende String-Konvertierung für Pydantic-Stabilität + src = str(edge.get("source") or "note_root") + tgt = str(edge.get("target") or target_note_id or "unknown_target") + kind = str(edge.get("kind", "related_to")) + prov = str(edge.get("provenance", "rule")) + conf = float(edge.get("confidence", 1.0)) + + direction = "in" if tgt == target_note_id else "out" + + edge_obj = EdgeDTO( + id=f"{src}->{tgt}:{kind}", + kind=kind, + source=src, + target=tgt, + weight=conf, + direction=direction, + provenance=prov, + confidence=conf + ) + edges_dto.append(edge_obj) + + # Die 3 wichtigsten Kanten als Begründung formulieren + top_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True) + for e in top_edges[:3]: + peer = e.source if e.direction == "in" else e.target + prov_txt = "Bestätigte" if e.provenance == "explicit" else "KI-basierte" + boost_txt = f" [Boost x{applied_boosts.get(e.kind)}]" if applied_boosts and e.kind in applied_boosts else "" + + reasons.append(Reason( + kind="edge", + message=f"{prov_txt} Kante '{e.kind}'{boost_txt} von/zu '{peer}'.", + score_impact=edge_w_cfg * e.confidence + )) + + if scoring_debug["cent_bonus"] > 0.01: + reasons.append(Reason(kind="centrality", message="Die Notiz ist ein zentraler Informations-Hub.", score_impact=breakdown.centrality_contribution)) + + return Explanation( + breakdown=breakdown, + reasons=reasons, + related_edges=edges_dto if edges_dto else None, + applied_boosts=applied_boosts + ) + +# ============================================================================== +# 3. 
CORE RETRIEVAL PIPELINE +# ============================================================================== + +def _build_hits_from_semantic( + hits: Iterable[Tuple[str, float, Dict[str, Any]]], + top_k: int, + used_mode: str, + subgraph: ga.Subgraph | None = None, + explain: bool = False, + dynamic_edge_boosts: Dict[str, float] = None +) -> QueryResponse: + """Wandelt semantische Roh-Treffer in bewertete QueryHits um.""" + t0 = time.time() + enriched = [] + + for pid, semantic_score, payload in hits: + edge_bonus, cent_bonus = 0.0, 0.0 + target_id = payload.get("note_id") + + if subgraph and target_id: + try: + edge_bonus = float(subgraph.edge_bonus(target_id)) + cent_bonus = float(subgraph.centrality_bonus(target_id)) + except Exception: + pass + + # Mathematisches Scoring via WP-22 Engine + debug_data = compute_wp22_score( + semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts + ) + enriched.append((pid, semantic_score, payload, debug_data)) + + # Sortierung nach finalem mathematischen Score + enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True) + limited_hits = enriched_sorted[: max(1, top_k)] + + results: List[QueryHit] = [] + for pid, s_score, pl, dbg in limited_hits: + explanation_obj = None + if explain: + explanation_obj = _build_explanation( + semantic_score=float(s_score), + payload=pl, + scoring_debug=dbg, + subgraph=subgraph, + target_note_id=pl.get("note_id"), + applied_boosts=dynamic_edge_boosts + ) + + # Payload Text-Feld normalisieren + text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]") + + results.append(QueryHit( + node_id=str(pid), + note_id=str(pl.get("note_id", "unknown")), + semantic_score=float(s_score), + edge_bonus=dbg["edge_bonus"], + centrality_bonus=dbg["cent_bonus"], + total_score=dbg["total"], + source={ + "path": pl.get("path"), + "section": pl.get("section") or pl.get("section_title"), + "text": text_content + }, + payload=pl, + 
explanation=explanation_obj + )) + + return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000)) + + +def hybrid_retrieve(req: QueryRequest) -> QueryResponse: + """ + Die Haupt-Einstiegsfunktion für die hybride Suche. + Kombiniert Vektorsuche mit Graph-Expansion und WP-22 Gewichtung. + """ + client, prefix = _get_client_and_prefix() + vector = list(req.query_vector) if req.query_vector else _get_query_vector(req) + top_k = req.top_k or 10 + + # 1. Semantische Seed-Suche + hits = _semantic_hits(client, prefix, vector, top_k=top_k, filters=req.filters) + + # 2. Graph Expansion Konfiguration + expand_cfg = req.expand if isinstance(req.expand, dict) else {} + depth = int(expand_cfg.get("depth", 1)) + boost_edges = getattr(req, "boost_edges", {}) or {} + + subgraph: ga.Subgraph | None = None + if depth > 0 and hits: + # Start-IDs für den Graph-Traversal sammeln + seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")}) + + if seed_ids: + try: + # Subgraph aus RAM/DB laden + subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types")) + + # --- WP-22: Kanten-Gewichtung im RAM-Graphen vor Bonus-Berechnung --- + if subgraph and hasattr(subgraph, "graph"): + for _, _, data in subgraph.graph.edges(data=True): + # A. Provenance Weighting (WP-22 Bonus für Herkunft) + prov = data.get("provenance", "rule") + # Belohnung: Explizite Links (1.0) > Smart (0.9) > Rule (0.7) + prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7) + + # B. Intent Boost Multiplikator (Vom Router dynamisch injiziert) + kind = data.get("kind") + intent_multiplier = boost_edges.get(kind, 1.0) + + # Finales Gewicht setzen (Basis * Provenance * Intent) + data["weight"] = data.get("weight", 1.0) * prov_w * intent_multiplier + + except Exception as e: + logger.error(f"Graph Expansion failed: {e}") + subgraph = None + + # 3. 
Scoring & Explanation Generierung + return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges) + + +def semantic_retrieve(req: QueryRequest) -> QueryResponse: + """Standard Vektorsuche ohne Graph-Einfluss (WP-02 Fallback).""" + client, prefix = _get_client_and_prefix() + vector = _get_query_vector(req) + hits = _semantic_hits(client, prefix, vector, req.top_k or 10, req.filters) + return _build_hits_from_semantic(hits, req.top_k or 10, "semantic", explain=req.explain) + + +class Retriever: + """Schnittstelle für die asynchrone Suche.""" + async def search(self, request: QueryRequest) -> QueryResponse: + """Führt eine hybride Suche aus.""" + return hybrid_retrieve(request) \ No newline at end of file diff --git a/app/core/retrieval/retriever_scoring.py b/app/core/retrieval/retriever_scoring.py new file mode 100644 index 0000000..9a5aa97 --- /dev/null +++ b/app/core/retrieval/retriever_scoring.py @@ -0,0 +1,121 @@ +""" +FILE: app/core/retrieval/retriever_scoring.py +DESCRIPTION: Mathematische Kern-Logik für das WP-22 Scoring. + Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle. + MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14. +VERSION: 1.0.2 +STATUS: Active +DEPENDENCIES: app.config, typing +""" +import os +import logging +from functools import lru_cache +from typing import Any, Dict, Tuple, Optional + +try: + import yaml +except ImportError: + yaml = None + +logger = logging.getLogger(__name__) + +@lru_cache +def get_weights() -> Tuple[float, float, float]: + """ + Liefert die Basis-Gewichtung (semantic, edge, centrality) aus der Konfiguration. + Priorität: + 1. config/retriever.yaml (Scoring-Sektion) + 2. Umgebungsvariablen (RETRIEVER_W_*) + 3. 
System-Defaults (1.0, 0.0, 0.0) + """ + from app.config import get_settings + settings = get_settings() + + # Defaults aus Settings laden + sem = float(getattr(settings, "RETRIEVER_W_SEM", 1.0)) + edge = float(getattr(settings, "RETRIEVER_W_EDGE", 0.0)) + cent = float(getattr(settings, "RETRIEVER_W_CENT", 0.0)) + + # Optionaler Override via YAML + config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml") + if yaml and os.path.exists(config_path): + try: + with open(config_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + scoring = data.get("scoring", {}) + sem = float(scoring.get("semantic_weight", sem)) + edge = float(scoring.get("edge_weight", edge)) + cent = float(scoring.get("centrality_weight", cent)) + except Exception as e: + logger.warning(f"Retriever Configuration could not be fully loaded from {config_path}: {e}") + + return sem, edge, cent + +def get_status_multiplier(payload: Dict[str, Any]) -> float: + """ + WP-22 A: Content Lifecycle Multiplier. + Steuert das Ranking basierend auf dem Reifegrad der Information. + + - stable: 1.2 (Belohnung für verifiziertes Wissen) + - active: 1.0 (Standard-Gewichtung) + - draft: 0.5 (Bestrafung für unfertige Fragmente) + """ + status = str(payload.get("status", "active")).lower().strip() + if status == "stable": + return 1.2 + if status == "draft": + return 0.5 + return 1.0 + +def compute_wp22_score( + semantic_score: float, + payload: Dict[str, Any], + edge_bonus_raw: float = 0.0, + cent_bonus_raw: float = 0.0, + dynamic_edge_boosts: Optional[Dict[str, float]] = None +) -> Dict[str, Any]: + """ + Die zentrale mathematische Scoring-Formel der Mindnet Intelligence. + Implementiert das WP-22 Hybrid-Scoring (Semantic * Lifecycle * Graph). + + FORMEL: + Score = (Similarity * StatusMult) * (1 + (TypeWeight - 1) + ((EdgeW * EB + CentW * CB) * IntentBoost)) + + Returns: + Dict mit dem finalen 'total' Score und allen mathematischen Zwischenwerten für den Explanation Layer. 
+ """ + sem_w, edge_w_cfg, cent_w_cfg = get_weights() + status_mult = get_status_multiplier(payload) + + # Retriever Weight (Type Boost aus types.yaml, z.B. 1.1 für Decisions) + node_weight = float(payload.get("retriever_weight", 1.0)) + + # 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status) + base_val = float(semantic_score) * status_mult + + # 2. Graph Boost Factor (Teil C: Intent-spezifische Verstärkung) + # Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt. + graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0 + + # 3. Einzelne Graph-Komponenten berechnen + edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor + cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor + + # 4. Finales Zusammenführen (Merging) + # (node_weight - 1.0) sorgt dafür, dass ein Gewicht von 1.0 keinen Einfluss hat (neutral). + total = base_val * (1.0 + (node_weight - 1.0) + edge_impact_final + cent_impact_final) + + # Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor) + final_score = max(0.0001, float(total)) + + return { + "total": final_score, + "edge_bonus": float(edge_bonus_raw), + "cent_bonus": float(cent_bonus_raw), + "status_multiplier": status_mult, + "graph_boost_factor": graph_boost_factor, + "type_impact": node_weight - 1.0, + "base_val": base_val, + "edge_impact_final": edge_impact_final, + "cent_impact_final": cent_impact_final + } \ No newline at end of file diff --git a/app/core/retriever.py b/app/core/retriever.py index 878de8d..055d764 100644 --- a/app/core/retriever.py +++ b/app/core/retriever.py @@ -1,310 +1,14 @@ """ FILE: app/core/retriever.py -DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion. - Nutzt retriever_scoring.py für die WP-22 Logik. - FIX: TypeError in embed_text (model_name) behoben. - FIX: Pydantic ValidationError (Target/Source) behoben. 
-VERSION: 0.6.15 (WP-22 Full & Stable) -STATUS: Active -DEPENDENCIES: app.config, app.models.dto, app.core.qdrant*, app.core.graph_adapter, app.core.retriever_scoring +DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14). + Leitet Retrieval-Anfragen an das neue retrieval-Paket weiter. +STATUS: Proxy (Legacy-Support) """ -from __future__ import annotations - -import os -import time -import logging -from typing import Any, Dict, List, Tuple, Iterable, Optional - -from app.config import get_settings -from app.models.dto import ( - QueryRequest, QueryResponse, QueryHit, - Explanation, ScoreBreakdown, Reason, EdgeDTO +from .retrieval.retriever import ( + Retriever, + hybrid_retrieve, + semantic_retrieve ) -import app.core.qdrant as qdr -import app.core.qdrant_points as qp -import app.services.embeddings_client as ec -import app.core.graph_adapter as ga -# Mathematische Engine importieren -from app.core.retriever_scoring import get_weights, compute_wp22_score - -logger = logging.getLogger(__name__) - -# ============================================================================== -# 1. CORE HELPERS & CONFIG LOADERS -# ============================================================================== - -def _get_client_and_prefix() -> Tuple[Any, str]: - """Initialisiert Qdrant Client und lädt Collection-Prefix.""" - cfg = qdr.QdrantConfig.from_env() - return qdr.get_client(cfg), cfg.prefix - - -def _get_query_vector(req: QueryRequest) -> List[float]: - """ - Vektorisiert die Anfrage. - FIX: Enthält try-except Block für unterschiedliche Signaturen von ec.embed_text. 
- """ - if req.query_vector: - return list(req.query_vector) - if not req.query: - raise ValueError("Kein Text oder Vektor für die Suche angegeben.") - - settings = get_settings() - - try: - # Versuch mit modernem Interface (WP-03 kompatibel) - return ec.embed_text(req.query, model_name=settings.MODEL_NAME) - except TypeError: - # Fallback für Signaturen, die 'model_name' nicht als Keyword akzeptieren - logger.debug("ec.embed_text does not accept 'model_name' keyword. Falling back.") - return ec.embed_text(req.query) - - -def _semantic_hits( - client: Any, - prefix: str, - vector: List[float], - top_k: int, - filters: Optional[Dict] = None -) -> List[Tuple[str, float, Dict[str, Any]]]: - """Führt die Vektorsuche durch und konvertiert Qdrant-Points in ein einheitliches Format.""" - raw_hits = qp.search_chunks_by_vector(client, prefix, vector, top=top_k, filters=filters) - # Strikte Typkonvertierung für Stabilität - return [(str(hit[0]), float(hit[1]), dict(hit[2] or {})) for hit in raw_hits] - -# ============================================================================== -# 2. EXPLANATION LAYER (DEBUG & VERIFIABILITY) -# ============================================================================== - -def _build_explanation( - semantic_score: float, - payload: Dict[str, Any], - scoring_debug: Dict[str, Any], - subgraph: Optional[ga.Subgraph], - target_note_id: Optional[str], - applied_boosts: Optional[Dict[str, float]] = None -) -> Explanation: - """ - Transformiert mathematische Scores und Graph-Signale in eine menschenlesbare Erklärung. - Behebt Pydantic ValidationErrors durch explizite String-Sicherung. - """ - _, edge_w_cfg, _ = get_weights() - base_val = scoring_debug["base_val"] - - # 1. 
Detaillierter mathematischer Breakdown - breakdown = ScoreBreakdown( - semantic_contribution=base_val, - edge_contribution=base_val * scoring_debug["edge_impact_final"], - centrality_contribution=base_val * scoring_debug["cent_impact_final"], - raw_semantic=semantic_score, - raw_edge_bonus=scoring_debug["edge_bonus"], - raw_centrality=scoring_debug["cent_bonus"], - node_weight=float(payload.get("retriever_weight", 1.0)), - status_multiplier=scoring_debug["status_multiplier"], - graph_boost_factor=scoring_debug["graph_boost_factor"] - ) - - reasons: List[Reason] = [] - edges_dto: List[EdgeDTO] = [] - - # 2. Gründe für Semantik hinzufügen - if semantic_score > 0.85: - reasons.append(Reason(kind="semantic", message="Sehr hohe textuelle Übereinstimmung.", score_impact=base_val)) - elif semantic_score > 0.70: - reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val)) - - # 3. Gründe für Typ und Lifecycle - type_weight = float(payload.get("retriever_weight", 1.0)) - if type_weight != 1.0: - msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert" - reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0))) - - # 4. 
Kanten-Verarbeitung (Graph-Intelligence) - if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0: - raw_edges = [] - if hasattr(subgraph, "get_incoming_edges"): - raw_edges.extend(subgraph.get_incoming_edges(target_note_id) or []) - if hasattr(subgraph, "get_outgoing_edges"): - raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or []) - - for edge in raw_edges: - # FIX: Zwingende String-Konvertierung für Pydantic-Stabilität - src = str(edge.get("source") or "note_root") - tgt = str(edge.get("target") or target_note_id or "unknown_target") - kind = str(edge.get("kind", "related_to")) - prov = str(edge.get("provenance", "rule")) - conf = float(edge.get("confidence", 1.0)) - - direction = "in" if tgt == target_note_id else "out" - - edge_obj = EdgeDTO( - id=f"{src}->{tgt}:{kind}", - kind=kind, - source=src, - target=tgt, - weight=conf, - direction=direction, - provenance=prov, - confidence=conf - ) - edges_dto.append(edge_obj) - - # Die 3 wichtigsten Kanten als Begründung formulieren - top_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True) - for e in top_edges[:3]: - peer = e.source if e.direction == "in" else e.target - prov_txt = "Bestätigte" if e.provenance == "explicit" else "KI-basierte" - boost_txt = f" [Boost x{applied_boosts.get(e.kind)}]" if applied_boosts and e.kind in applied_boosts else "" - - reasons.append(Reason( - kind="edge", - message=f"{prov_txt} Kante '{e.kind}'{boost_txt} von/zu '{peer}'.", - score_impact=edge_w_cfg * e.confidence - )) - - if scoring_debug["cent_bonus"] > 0.01: - reasons.append(Reason(kind="centrality", message="Die Notiz ist ein zentraler Informations-Hub.", score_impact=breakdown.centrality_contribution)) - - return Explanation( - breakdown=breakdown, - reasons=reasons, - related_edges=edges_dto if edges_dto else None, - applied_boosts=applied_boosts - ) - -# ============================================================================== -# 3. 
CORE RETRIEVAL PIPELINE -# ============================================================================== - -def _build_hits_from_semantic( - hits: Iterable[Tuple[str, float, Dict[str, Any]]], - top_k: int, - used_mode: str, - subgraph: ga.Subgraph | None = None, - explain: bool = False, - dynamic_edge_boosts: Dict[str, float] = None -) -> QueryResponse: - """Wandelt semantische Roh-Treffer in hochgeladene, bewertete QueryHits um.""" - t0 = time.time() - enriched = [] - - for pid, semantic_score, payload in hits: - edge_bonus, cent_bonus = 0.0, 0.0 - target_id = payload.get("note_id") - - if subgraph and target_id: - try: - edge_bonus = float(subgraph.edge_bonus(target_id)) - cent_bonus = float(subgraph.centrality_bonus(target_id)) - except Exception: - pass - - # Mathematisches Scoring via WP-22 Engine - debug_data = compute_wp22_score( - semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts - ) - enriched.append((pid, semantic_score, payload, debug_data)) - - # Sortierung nach finalem mathematischen Score - enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True) - limited_hits = enriched_sorted[: max(1, top_k)] - - results: List[QueryHit] = [] - for pid, s_score, pl, dbg in limited_hits: - explanation_obj = None - if explain: - explanation_obj = _build_explanation( - semantic_score=float(s_score), - payload=pl, - scoring_debug=dbg, - subgraph=subgraph, - target_note_id=pl.get("note_id"), - applied_boosts=dynamic_edge_boosts - ) - - # Payload Text-Feld normalisieren - text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]") - - results.append(QueryHit( - node_id=str(pid), - note_id=str(pl.get("note_id", "unknown")), - semantic_score=float(s_score), - edge_bonus=dbg["edge_bonus"], - centrality_bonus=dbg["cent_bonus"], - total_score=dbg["total"], - source={ - "path": pl.get("path"), - "section": pl.get("section") or pl.get("section_title"), - "text": text_content - }, - payload=pl, - 
explanation=explanation_obj - )) - - return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000)) - - -def hybrid_retrieve(req: QueryRequest) -> QueryResponse: - """ - Die Haupt-Einstiegsfunktion für die hybride Suche. - Kombiniert Vektorsuche mit Graph-Expansion, Provenance-Weighting und Intent-Boosting. - """ - client, prefix = _get_client_and_prefix() - vector = list(req.query_vector) if req.query_vector else _get_query_vector(req) - top_k = req.top_k or 10 - - # 1. Semantische Seed-Suche - hits = _semantic_hits(client, prefix, vector, top_k=top_k, filters=req.filters) - - # 2. Graph Expansion Konfiguration - expand_cfg = req.expand if isinstance(req.expand, dict) else {} - depth = int(expand_cfg.get("depth", 1)) - boost_edges = getattr(req, "boost_edges", {}) or {} - - subgraph: ga.Subgraph | None = None - if depth > 0 and hits: - # Start-IDs für den Graph-Traversal sammeln - seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")}) - - if seed_ids: - try: - # Subgraph aus RAM/DB laden - subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types")) - - # --- WP-22: Kanten-Gewichtung im RAM-Graphen vor Bonus-Berechnung --- - if subgraph and hasattr(subgraph, "graph"): - for _, _, data in subgraph.graph.edges(data=True): - # A. Provenance Weighting (WP-22 Bonus für Herkunft) - prov = data.get("provenance", "rule") - # Belohnung: Explizite Links (1.0) > Smart (0.9) > Rule (0.7) - prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7) - - # B. Intent Boost Multiplikator (Vom Router dynamisch injiziert) - kind = data.get("kind") - intent_multiplier = boost_edges.get(kind, 1.0) - - # Finales Gewicht setzen (Basis * Provenance * Intent) - data["weight"] = data.get("weight", 1.0) * prov_w * intent_multiplier - - except Exception as e: - logger.error(f"Graph Expansion failed: {e}") - subgraph = None - - # 3. 
Scoring & Explanation Generierung - return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges) - - -def semantic_retrieve(req: QueryRequest) -> QueryResponse: - """Standard Vektorsuche ohne Graph-Einfluss (WP-02 Fallback).""" - client, prefix = _get_client_and_prefix() - vector = _get_query_vector(req) - hits = _semantic_hits(client, prefix, vector, req.top_k or 10, req.filters) - return _build_hits_from_semantic(hits, req.top_k or 10, "semantic", explain=req.explain) - - -class Retriever: - """Schnittstelle für die asynchrone Suche.""" - async def search(self, request: QueryRequest) -> QueryResponse: - """Führt eine hybride Suche aus.""" - return hybrid_retrieve(request) \ No newline at end of file +# Re-Export für 100% Kompatibilität +__all__ = ["Retriever", "hybrid_retrieve", "semantic_retrieve"] \ No newline at end of file diff --git a/app/core/retriever_scoring.py b/app/core/retriever_scoring.py index eb207ac..0aec2a7 100644 --- a/app/core/retriever_scoring.py +++ b/app/core/retriever_scoring.py @@ -1,120 +1,18 @@ """ FILE: app/core/retriever_scoring.py -DESCRIPTION: Mathematische Kern-Logik für das WP-22 Scoring. - Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle. -VERSION: 1.0.1 (WP-22 Full Math Engine) -STATUS: Active -DEPENDENCIES: app.config, typing +DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14). + Leitet Scoring-Berechnungen an das neue retrieval-Paket weiter. 
+STATUS: Proxy (Legacy-Support) """ -import os -import logging -from functools import lru_cache -from typing import Any, Dict, Tuple, Optional +from .retrieval.retriever_scoring import ( + get_weights, + compute_wp22_score, + get_status_multiplier +) -try: - import yaml -except ImportError: - yaml = None - -logger = logging.getLogger(__name__) - -@lru_cache -def get_weights() -> Tuple[float, float, float]: - """ - Liefert die Basis-Gewichtung (semantic, edge, centrality) aus der Konfiguration. - Priorität: - 1. config/retriever.yaml (Scoring-Sektion) - 2. Umgebungsvariablen (RETRIEVER_W_*) - 3. System-Defaults (1.0, 0.0, 0.0) - """ - from app.config import get_settings - settings = get_settings() - - # Defaults aus Settings laden - sem = float(getattr(settings, "RETRIEVER_W_SEM", 1.0)) - edge = float(getattr(settings, "RETRIEVER_W_EDGE", 0.0)) - cent = float(getattr(settings, "RETRIEVER_W_CENT", 0.0)) - - # Optionaler Override via YAML - config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml") - if yaml and os.path.exists(config_path): - try: - with open(config_path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - scoring = data.get("scoring", {}) - sem = float(scoring.get("semantic_weight", sem)) - edge = float(scoring.get("edge_weight", edge)) - cent = float(scoring.get("centrality_weight", cent)) - except Exception as e: - logger.warning(f"Retriever Configuration could not be fully loaded from {config_path}: {e}") - - return sem, edge, cent - -def get_status_multiplier(payload: Dict[str, Any]) -> float: - """ - WP-22 A: Content Lifecycle Multiplier. - Steuert das Ranking basierend auf dem Reifegrad der Information. 
- - - stable: 1.2 (Belohnung für verifiziertes Wissen) - - active: 1.0 (Standard-Gewichtung) - - draft: 0.5 (Bestrafung für unfertige Fragmente) - """ - status = str(payload.get("status", "active")).lower().strip() - if status == "stable": - return 1.2 - if status == "draft": - return 0.5 - return 1.0 - -def compute_wp22_score( - semantic_score: float, - payload: Dict[str, Any], - edge_bonus_raw: float = 0.0, - cent_bonus_raw: float = 0.0, - dynamic_edge_boosts: Optional[Dict[str, float]] = None -) -> Dict[str, Any]: - """ - Die zentrale mathematische Scoring-Formel der Mindnet Intelligence. - Implementiert das WP-22 Hybrid-Scoring (Semantic * Lifecycle * Graph). - - FORMEL: - Score = (Similarity * StatusMult) * (1 + (TypeWeight - 1) + ((EdgeW * EB + CentW * CB) * IntentBoost)) - - Returns: - Dict mit dem finalen 'total' Score und allen mathematischen Zwischenwerten für den Explanation Layer. - """ - sem_w, edge_w_cfg, cent_w_cfg = get_weights() - status_mult = get_status_multiplier(payload) - - # Retriever Weight (Type Boost aus types.yaml, z.B. 1.1 für Decisions) - node_weight = float(payload.get("retriever_weight", 1.0)) - - # 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status) - base_val = float(semantic_score) * status_mult - - # 2. Graph Boost Factor (Teil C: Intent-spezifische Verstärkung) - # Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt. - graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0 - - # 3. Einzelne Graph-Komponenten berechnen - edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor - cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor - - # 4. Finales Zusammenführen (Merging) - # (node_weight - 1.0) sorgt dafür, dass ein Gewicht von 1.0 keinen Einfluss hat (neutral). 
- total = base_val * (1.0 + (node_weight - 1.0) + edge_impact_final + cent_impact_final) - - # Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor) - final_score = max(0.0001, float(total)) - - return { - "total": final_score, - "edge_bonus": float(edge_bonus_raw), - "cent_bonus": float(cent_bonus_raw), - "status_multiplier": status_mult, - "graph_boost_factor": graph_boost_factor, - "type_impact": node_weight - 1.0, - "base_val": base_val, - "edge_impact_final": edge_impact_final, - "cent_impact_final": cent_impact_final - } \ No newline at end of file +# Re-Export für 100% Kompatibilität +__all__ = [ + "get_weights", + "compute_wp22_score", + "get_status_multiplier" +] \ No newline at end of file From 7fa9ce81bdd3c1a704cf7b9844d29e9574d75da2 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 20:30:24 +0100 Subject: [PATCH 22/23] letzte anpassungen --- app/core/graph/graph_subgraph.py | 39 +++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/app/core/graph/graph_subgraph.py b/app/core/graph/graph_subgraph.py index 593b09e..b253a54 100644 --- a/app/core/graph/graph_subgraph.py +++ b/app/core/graph/graph_subgraph.py @@ -1,16 +1,25 @@ """ FILE: app/core/graph/graph_subgraph.py DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse. + Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung. + MODULARISIERUNG: Teil des graph-Pakets (WP-14). +VERSION: 1.1.0 +STATUS: Active """ import math from collections import defaultdict from typing import Dict, List, Optional, DefaultDict, Any, Set from qdrant_client import QdrantClient + +# Lokale Paket-Imports from .graph_weights import EDGE_BASE_WEIGHTS, calculate_edge_weight from .graph_db_adapter import fetch_edges_from_qdrant class Subgraph: - """Leichtgewichtiger Subgraph mit Adjazenzlisten & Kennzahlen.""" + """ + Leichtgewichtiger Subgraph mit Adjazenzlisten & Kennzahlen. 
+ Wird für die Berechnung von Graph-Boni im Retriever genutzt. + """ def __init__(self) -> None: self.adj: DefaultDict[str, List[Dict]] = defaultdict(list) @@ -19,7 +28,10 @@ class Subgraph: self.out_degree: DefaultDict[str, int] = defaultdict(int) def add_edge(self, e: Dict) -> None: - """Fügt eine Kante hinzu und aktualisiert Indizes.""" + """ + Fügt eine Kante hinzu und aktualisiert Indizes. + Unterstützt Kontext-Notes für verbesserte Graph-Konnektivität. + """ src = e.get("source") tgt = e.get("target") kind = e.get("kind") @@ -29,15 +41,15 @@ class Subgraph: if not src or not tgt: return - # 1. Forward + # 1. Forward-Kante self.adj[src].append({"target": tgt, "kind": kind, "weight": weight}) self.out_degree[src] += 1 self.in_degree[tgt] += 1 - # 2. Reverse (WP-04b Explanation) + # 2. Reverse-Kante (für WP-04b Explanation Layer) self.reverse_adj[tgt].append({"source": src, "kind": kind, "weight": weight}) - # 3. Kontext-Note Handling + # 3. Kontext-Note Handling (erhöht die Zentralität der Parent-Note) if owner and owner != src: self.adj[owner].append({"target": tgt, "kind": kind, "weight": weight}) self.out_degree[owner] += 1 @@ -54,16 +66,21 @@ class Subgraph: return self.aggregate_edge_bonus(node_id) def centrality_bonus(self, node_id: str) -> float: - """Log-gedämpfte Zentralität (In-Degree).""" + """ + Log-gedämpfte Zentralität basierend auf dem In-Degree. + Begrenzt auf einen maximalen Boost von 0.15. 
+ """ indeg = self.in_degree.get(node_id, 0) if indeg <= 0: return 0.0 return min(math.log1p(indeg) / 10.0, 0.15) def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]: + """Gibt alle ausgehenden Kanten einer Node zurück.""" return self.adj.get(node_id, []) def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]: + """Gibt alle eingehenden Kanten einer Node zurück.""" return self.reverse_adj.get(node_id, []) @@ -74,7 +91,10 @@ def expand( depth: int = 1, edge_types: Optional[List[str]] = None, ) -> Subgraph: - """Expandiert ab Seeds entlang von Edges bis zu einer bestimmten Tiefe.""" + """ + Expandiert ab Seeds entlang von Edges bis zu einer bestimmten Tiefe. + Nutzt fetch_edges_from_qdrant für den Datenbankzugriff. + """ sg = Subgraph() frontier = set(seeds) visited = set() @@ -83,6 +103,7 @@ def expand( if not frontier: break + # Batch-Abfrage der Kanten für die aktuelle Ebene payloads = fetch_edges_from_qdrant(client, prefix, list(frontier), edge_types) next_frontier: Set[str] = set() @@ -91,12 +112,14 @@ def expand( if not src or not tgt: continue sg.add_edge({ - "source": src, "target": tgt, + "source": src, + "target": tgt, "kind": pl.get("kind", "edge"), "weight": calculate_edge_weight(pl), "note_id": pl.get("note_id"), }) + # BFS Logik: Neue Ziele in die nächste Frontier aufnehmen if tgt not in visited: next_frontier.add(str(tgt)) From fa909e2e7d0a0c1a72555bb5fc92768b8245b321 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 22:13:11 +0100 Subject: [PATCH 23/23] Dokumentation WP14&WP15b --- docs/00_General/00_glossary.md | 20 ++- .../03_tech_configuration.md | 117 ++++++++-------- .../03_tech_data_model.md | 19 +-- .../03_tech_ingestion_pipeline.md | 127 ++++++++---------- .../03_tech_retrieval_scoring.md | 85 +++++++----- docs/05_Development/05_developer_guide.md | 68 +++++----- docs/06_Roadmap/06_active_roadmap.md | 56 ++++---- docs/99_Archive/99_legacy_workpackages.md | 21 ++- 8 files changed, 267 insertions(+), 246 
deletions(-) diff --git a/docs/00_General/00_glossary.md b/docs/00_General/00_glossary.md index 334278e..e14ead9 100644 --- a/docs/00_General/00_glossary.md +++ b/docs/00_General/00_glossary.md @@ -2,13 +2,13 @@ doc_type: glossary audience: all status: active -version: 2.8.0 -context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-76 Quoten-Steuerung und Mistral-safe Parsing." +version: 2.8.1 +context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-14 Modularisierung, WP-15b Two-Pass Ingestion und Mistral-safe Parsing." --- # Mindnet Glossar -**Quellen:** `01_edge_vocabulary.md`, `llm_service.py`, `ingestion.py`, `edge_registry.py` +**Quellen:** `01_edge_vocabulary.md`, `llm_service.py`, `ingestion.py`, `edge_registry.py`, `registry.py`, `qdrant.py` ## Kern-Entitäten @@ -21,11 +21,13 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C ## Komponenten * **Edge Registry:** Der zentrale Dienst (SSOT), der Kanten-Typen validiert und Aliase in kanonische Typen auflöst. Nutzt `01_edge_vocabulary.md` als Basis. -* **LLM Service:** Der Hybrid-Client (v3.3.6), der Anfragen zwischen OpenRouter, Google Gemini und lokalem Ollama routet. Verwaltet Cloud-Timeouts und Quoten-Management. -* **Retriever:** Besteht in v2.7+ aus der Orchestrierung (`retriever.py`) und der mathematischen Scoring-Engine (`retriever_scoring.py`). +* **LLM Service:** Der Hybrid-Client (v3.3.6), der Anfragen zwischen OpenRouter, Google Gemini und lokalem Ollama routet. Verwaltet Cloud-Timeouts und Quoten-Management. Nutzt zur Text-Bereinigung nun die neutrale `registry.py`, um Circular Imports zu vermeiden. +* **Retriever:** Besteht in v2.7+ aus der Orchestrierung (`retriever.py`) und der mathematischen Scoring-Engine (`retriever_scoring.py`). Seit WP-14 im Paket `app.core.retrieval` gekapselt. 
* **Decision Engine:** Teil des Routers, der Intents erkennt und entsprechende **Boost-Faktoren** für das Retrieval injiziert. * **Traffic Control:** Verwaltet Prioritäten und drosselt Hintergrund-Tasks (z.B. Smart Edges) mittels Semaphoren und Timeouts (45s) zur Vermeidung von System-Hangs. * **Unknown Edges Log:** Die Datei `unknown_edges.jsonl`, in der das System Kanten-Typen protokolliert, die nicht im Dictionary gefunden wurden. +* **Database Package (WP-14):** Zentralisiertes Infrastruktur-Paket (`app.core.database`), das den Qdrant-Client (`qdrant.py`) und das Point-Mapping (`qdrant_points.py`) verwaltet. +* **LocalBatchCache (WP-15b):** Ein globaler In-Memory-Index, der während des Pass 1 Scans aufgebaut wird und Metadaten (IDs, Titel, Summaries) aller Notizen für die Kantenvalidierung bereithält. ## Konzepte & Features @@ -40,5 +42,9 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C * `explicit`: Vom Mensch gesetzt (Prio 1). * `semantic_ai`: Von der KI im Turbo-Mode extrahiert und validiert (Prio 2). * `structure`: Durch System-Regeln/Matrix erzeugt (Prio 3). -* **Smart Edge Allocation:** KI-Verfahren zur Relevanzprüfung von Links für spezifische Textabschnitte. -* **Matrix Logic:** Bestimmung des Kanten-Typs basierend auf Quell- und Ziel-Entität (z.B. Erfahrung -> Wert = `based_on`). \ No newline at end of file +* **Smart Edge Allocation (WP-15b):** KI-Verfahren zur Relevanzprüfung von Links für spezifische Textabschnitte. Validiert Kandidaten semantisch gegen das Ziel im LocalBatchCache. +* **Matrix Logic:** Bestimmung des Kanten-Typs basierend auf Quell- und Ziel-Entität (z.B. Erfahrung -> Wert = `based_on`). +* **Two-Pass Workflow (WP-15b):** Optimiertes Ingestion-Verfahren: + * **Pass 1 (Pre-Scan):** Schnelles Scannen aller Dateien zur Befüllung des LocalBatchCache. + * **Pass 2 (Semantic Processing):** Tiefenverarbeitung (Chunking, Embedding, Validierung) nur für geänderte Dateien. 
+* **Circular Import Registry (WP-14):** Entkopplung von Kern-Logik (wie Textbereinigung) in eine neutrale `registry.py`, um Abhängigkeitsschleifen zwischen Diensten und Ingestion-Utilities zu verhindern. \ No newline at end of file diff --git a/docs/03_Technical_References/03_tech_configuration.md b/docs/03_Technical_References/03_tech_configuration.md index 150182a..77d4576 100644 --- a/docs/03_Technical_References/03_tech_configuration.md +++ b/docs/03_Technical_References/03_tech_configuration.md @@ -1,19 +1,19 @@ --- doc_type: technical_reference audience: developer, admin -scope: configuration, env, registry, scoring, resilience +scope: configuration, env, registry, scoring, resilience, modularization status: active -version: 2.8.0 -context: "Umfassende Referenztabellen für Umgebungsvariablen (inkl. Hybrid-Cloud & WP-76), YAML-Konfigurationen und die Edge Registry Struktur." +version: 2.9.1 +context: "Umfassende Referenztabellen für Umgebungsvariablen (inkl. Hybrid-Cloud & WP-76), YAML-Konfigurationen und die Edge Registry Struktur unter Berücksichtigung von WP-14." --- # Konfigurations-Referenz -Dieses Dokument beschreibt alle Steuerungsdateien von Mindnet. In der Version 2.8 wurde die Konfiguration professionalisiert, um die Edge Registry, dynamische Scoring-Parameter (Lifecycle & Intent) sowie die neue Hybrid-Cloud-Resilienz zu unterstützen. +Dieses Dokument beschreibt alle Steuerungsdateien von Mindnet. In der Version 2.9.1 wurde die Konfiguration professionalisiert, um die Edge Registry, dynamische Scoring-Parameter (Lifecycle & Intent), die neue Hybrid-Cloud-Resilienz sowie die modulare Datenbank-Infrastruktur (WP-14) zu unterstützen. ## 1. Environment Variablen (`.env`) -Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts. +Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts. Seit der Modularisierung in WP-14 unterstützen sie zudem die explizite Benennung von Vektoren für verschiedene Collections. 
| Variable | Default | Beschreibung | | :--- | :--- | :--- | @@ -21,6 +21,10 @@ Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts. | `QDRANT_API_KEY` | *(leer)* | Optionaler Key für Absicherung. | | `COLLECTION_PREFIX` | `mindnet` | Namensraum für Collections (erzeugt `{prefix}_notes` etc). | | `VECTOR_DIM` | `768` | **Muss 768 sein** (für Nomic Embeddings). | +| `MINDNET_VECTOR_NAME` | `default` | **Neu (WP-14):** Basis-Vektorname für Named Vectors Support. | +| `NOTES_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Notes-Collection (Override). | +| `CHUNKS_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Chunks-Collection (Override). | +| `EDGES_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Edges-Collection (Override). | | `MINDNET_VOCAB_PATH` | *(Pfad)* | **Neu (WP-22):** Absoluter Pfad zur `01_edge_vocabulary.md`. Definiert den Ort des Dictionarys. | | `MINDNET_VAULT_ROOT` | `./vault` | Basis-Pfad für Datei-Operationen. | | `MINDNET_TYPES_FILE` | `config/types.yaml` | Pfad zur Typ-Registry. | @@ -38,23 +42,25 @@ Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts. | `MINDNET_LLM_MODEL` | `phi3:mini` | Name des lokalen Chat-Modells (Ollama). | | `MINDNET_EMBEDDING_MODEL` | `nomic-embed-text` | Name des Embedding-Modells (Ollama). | | `MINDNET_OLLAMA_URL` | `http://127.0.0.1:11434`| URL zum lokalen LLM-Server. | -| `MAX_OLLAMA_CHARS` | `10000`| Maximale Länge des Kontext-Strings, der an das lokale Modell gesendet wird. Verhindert Batch-Decoding-Fehler bei sehr großen Notiz-Historien. | +| `MAX_OLLAMA_CHARS` | `10000`| Maximale Länge des Kontext-Strings, der an das lokale Modell gesendet wird. | | `MINDNET_LLM_TIMEOUT` | `300.0` | Timeout in Sekunden für LLM-Anfragen. | | `MINDNET_API_TIMEOUT` | `300.0` | Globales API-Timeout für das Frontend. | | `MINDNET_LL_BACKGROUND_LIMIT`| `2` | **Traffic Control:** Max. 
parallele Hintergrund-Tasks (Semaphore). | | `MINDNET_CHANGE_DETECTION_MODE` | `full` | `full` (Text + Meta) oder `body` (nur Text). | +| `MINDNET_DEFAULT_RETRIEVER_WEIGHT` | `1.0` | **Neu (WP-22):** Systemweiter Standard für das Retriever-Gewicht einer Notiz. | --- ## 2. Typ-Registry (`types.yaml`) -Steuert das Import-Verhalten, Chunking und die Kanten-Logik pro Typ. +Steuert das Import-Verhalten, Chunking und die Kanten-Logik pro Typ. Die Auflösung erfolgt zentral über die modularisierte Registry in `app.core.registry`. ### 2.1 Konfigurations-Hierarchie (Override-Logik) Seit Version 2.7.0 gilt für `chunking_profile` und `retriever_weight` folgende Priorität: 1. **Frontmatter (Höchste Prio):** Ein Wert direkt in der Markdown-Datei überschreibt alles. 2. **Type Config:** Der Standardwert für den `type` aus `types.yaml`. -3. **Global Default:** Fallback aus `defaults` in `types.yaml`. +3. **Ingestion Settings (Neu WP-14):** Globale Konfiguration wie `default_chunk_profile` innerhalb des `ingestion_settings` Blocks. +4. **Global Default:** Fallback aus `defaults` in `types.yaml`. ## 2.2 Typ-Referenz & Stream-Logik (Vollständige Liste: 28 Typen) @@ -113,7 +119,7 @@ Dieser Stream speichert deine Erlebnisse, Fakten und externes Wissen als Belege. ## 3. Retriever Config (`retriever.yaml`) -Steuert die Gewichtung der Scoring-Formel und die neuen Lifecycle-Modifier. +Steuert die Gewichtung der Scoring-Formel und die neuen Lifecycle-Modifier. Seit WP-14 ist die mathematische Engine im Paket `app.core.retrieval` gekapselt. ```yaml version: 1.2 @@ -140,43 +146,36 @@ lifecycle_weights: system: 0.0 # Hard Skip via Ingestion # Die nachfolgenden Werte überschreiben die Defaults aus app/core/retriever_config. -# Wenn neue Kantentypen, z.B. 
durch Referenzierung innerhalb einer md-Datei im vault anders gewichtet werden sollen, dann muss hier die Konfiguration erfolgen edge_types: # --- KATEGORIE 1: LOGIK-BOOSTS (Relevanz-Treiber) --- - # Diese Kanten haben die Kraft, das semantische Ranking aktiv umzugestalten. - blocks: 1.6 # Kritisch: Risiken/Blocker müssen sofort sichtbar sein. - solves: 1.5 # Zielführend: Lösungen sind primäre Suchziele. - depends_on: 1.4 # Logisch: Harte fachliche Abhängigkeit. - resulted_in: 1.4 # Kausal: Ergebnisse und unmittelbare Konsequenzen. - followed_by: 1.3 # Sequenziell (User): Bewusst gesteuerte Wissenspfade. - caused_by: 1.2 # Kausal: Ursachen-Bezug (Basis für Intent-Boost). - preceded_by: 1.1 # Sequenziell (User): Rückwärts-Bezug in Logik-Ketten. + blocks: 1.6 + solves: 1.5 + depends_on: 1.4 + resulted_in: 1.4 + followed_by: 1.3 + caused_by: 1.2 + preceded_by: 1.1 # --- KATEGORIE 2: QUALITATIVER KONTEXT (Stabilitäts-Stützen) --- - # Diese Kanten liefern wichtigen Kontext, ohne das Ergebnis zu verfälschen. - guides: 1.1 # Qualitativ: Prinzipien oder Werte leiten das Thema. - part_of: 1.1 # Strukturell: Zieht übergeordnete Kontexte (Parents) mit hoch. - based_on: 0.8 # Fundament: Bezug auf Basis-Werte (kalibriert auf Safe-Retrieval). - derived_from: 0.6 # Historisch: Dokumentiert die Herkunft von Wissen. - uses: 0.6 # Instrumentell: Genutzte Werkzeuge, Methoden oder Ressourcen. + guides: 1.1 + part_of: 1.1 + based_on: 0.8 + derived_from: 0.6 + uses: 0.6 # --- KATEGORIE 3: THEMATISCHE NÄHE (Ähnlichkeits-Signal) --- - # Diese Werte verhindern den "Drift" in fachfremde Bereiche. - similar_to: 0.4 # Analytisch: Thematische Nähe (oft KI-generiert). + similar_to: 0.4 # --- KATEGORIE 4: SYSTEM-NUDGES (Technische Struktur) --- - # Reine Orientierungshilfen für das System; fast kein Einfluss auf das Ranking. - belongs_to: 0.2 # System: Verknüpft Chunks mit der Note (Metadaten-Träger). - next: 0.1 # System: Technische Lesereihenfolge der Absätze. 
- prev: 0.1 # System: Technische Lesereihenfolge der Absätze. + belongs_to: 0.2 + next: 0.1 + prev: 0.1 # --- KATEGORIE 5: WEICHE ASSOZIATIONEN (Rausch-Unterdrückung) --- - # Verhindert, dass lose Verknüpfungen das Ergebnis "verwässern". - references: 0.1 # Assoziativ: Einfacher Querverweis oder Erwähnung. - related_to: 0.05 # Minimal: Schwächste thematische Verbindung. + references: 0.1 + related_to: 0.05 ``` - --- ## 4. Edge Typen & Registry Referenz @@ -185,7 +184,7 @@ Die `EdgeRegistry` ist die **Single Source of Truth** für das Vokabular. ### 4.1 Dateistruktur & Speicherort Die Registry erwartet eine Markdown-Datei an folgendem Ort: -* **Standard-Pfad:** `/01_User_Manual/01_edge_vocabulary.md`. +* **Standard-Pfad:** `/_system/dictionary/edge_vocabulary.md`. * **Custom-Pfad:** Kann via `.env` Variable `MINDNET_VOCAB_PATH` überschrieben werden. ### 4.2 Aufbau des Dictionaries (Markdown-Schema) @@ -199,37 +198,30 @@ Die Datei muss eine Markdown-Tabelle enthalten, die vom Regex-Parser gelesen wir | **`caused_by`** | `ausgelöst_durch`, `wegen` | Kausalität: A löst B aus. | ``` -**Regeln für die Spalten:** -1. **Canonical:** Muss fett gedruckt sein (`**type**` oder `**`type`**`). Dies ist der Wert, der in der DB landet. -2. **Aliasse:** Kommagetrennte Liste von Synonymen. Diese werden beim Import automatisch zum Canonical aufgelöst. -3. **Beschreibung:** Rein informativ für den Nutzer. - ### 4.3 Verfügbare Kanten-Typen (System-Standard) -| System-Typ (Canonical) | Erlaubte Aliasse (User) | Beschreibung | -| :--------------------- | :--------------------------------------------------- | :-------------------------------------- | -| **`caused_by`** | `ausgelöst_durch`, `wegen`, `ursache_ist` | Kausalität: A löst B aus. | -| **`derived_from`** | `abgeleitet_von`, `quelle`, `inspiriert_durch` | Herkunft: A stammt von B. | -| **`based_on`** | `basiert_auf`, `fundament`, `grundlage` | Fundament: B baut auf A auf. 
| -| **`solves`** | `löst`, `beantwortet`, `fix_für` | Lösung: A ist Lösung für Problem B. | -| **`part_of`** | `teil_von`, `gehört_zu`, `cluster` | Hierarchie: Kind -> Eltern. | -| **`depends_on`** | `hängt_ab_von`, `braucht`, `requires`, `enforced_by` | Abhängigkeit: A braucht B. | -| **`blocks`** | `blockiert`, `verhindert`, `risiko_für` | Blocker: A verhindert B. | -| **`uses`** | `nutzt`, `verwendet`, `tool` | Werkzeug: A nutzt B. | -| **`guides`** | `steuert`, `leitet`, `orientierung` | Soft-Dependency: A gibt Richtung für B. | -| **`followed_by`** | `danach`, `folgt`, `nachfolger`, `followed_by` | Prozess: A -> B. | -| **`preceeded_by`** | `davor`, `vorgänger`, `preceded_by` | Prozess: B <- A. | -| **`related_to`** | `siehe_auch`, `kontext`, `thematisch` | Lose Assoziation. | -| **`similar_to`** | `ähnlich_wie`, `vergleichbar` | Synonym / Ähnlichkeit. | -| **`references`** | *(Kein Alias)* | Standard-Verweis (Fallback). | -| **`resulted_in`** | `ergebnis`, `resultat`, `erzeugt` | Herkunft: A erzeugt Ergebnis B | +| System-Typ (Canonical) | Erlaubte Aliasse (User) | Beschreibung | +| :--- | :--- | :--- | +| **`caused_by`** | `ausgelöst_durch`, `wegen`, `ursache_ist` | Kausalität: A löst B aus. | +| **`derived_from`** | `abgeleitet_von`, `quelle`, `inspiriert_durch` | Herkunft: A stammt von B. | +| **`based_on`** | `basiert_auf`, `fundament`, `grundlage` | Fundament: B baut auf A auf. | +| **`solves`** | `löst`, `beantwortet`, `fix_für` | Lösung: A ist Lösung für Problem B. | +| **`part_of`** | `teil_von`, `gehört_zu`, `cluster` | Hierarchie: Kind -> Eltern. | +| **`depends_on`** | `hängt_ab_von`, `braucht`, `requires`, `enforced_by` | Abhängigkeit: A braucht B. | +| **`blocks`** | `blockiert`, `verhindert`, `risiko_für` | Blocker: A verhindert B. | +| **`uses`** | `nutzt`, `verwendet`, `tool` | Werkzeug: A nutzt B. | +| **`guides`** | `steuert`, `leitet`, `orientierung` | Soft-Dependency: A gibt Richtung für B. 
| +| **`followed_by`** | `danach`, `folgt`, `nachfolger`, `followed_by` | Prozess: A -> B. | +| **`preceded_by`** | `davor`, `vorgänger`, `preceeded_by` | Prozess: B <- A. | +| **`related_to`** | `siehe_auch`, `kontext`, `thematisch` | Lose Assoziation. | +| **`similar_to`** | `ähnlich_wie`, `vergleichbar` | Synonym / Ähnlichkeit. | +| **`references`** | *(Kein Alias)* | Standard-Verweis (Fallback). | +| **`resulted_in`** | `ergebnis`, `resultat`, `erzeugt` | Herkunft: A erzeugt Ergebnis B | -**ACHTUNG!** Die Kantentypen -**belongs_to**, **next** und **prev** dürfen nicht vom Nutzer gesetzt werden +**ACHTUNG!** Die Kantentypen **belongs_to**, **next** und **prev** dürfen nicht vom Nutzer gesetzt werden. --- - ## 5. Decision Engine (`decision_engine.yaml`) Die Decision Engine fungiert als zentraler Orchestrator für die Intent-Erkennung und das dynamische Retrieval-Routing. Sie bestimmt, wie das System auf eine Nutzeranfrage reagiert, welche Informationstypen bevorzugt werden und wie der Wissensgraph für die spezifische Situation verformt wird. @@ -323,7 +315,4 @@ strategies: BITTE WÄGE FAKTEN GEGEN FOLGENDE WERTE, PRINZIPIEN UND ZIELE AB: # 3. Empathie / "Ich"-Modus - -``` - -*Richtwert für Kanten-Boosts: 0.1 (Abwertung) bis 3.0+ (Dominanz gegenüber Text-Match).* \ No newline at end of file +``` \ No newline at end of file diff --git a/docs/03_Technical_References/03_tech_data_model.md b/docs/03_Technical_References/03_tech_data_model.md index 00e63c2..6492522 100644 --- a/docs/03_Technical_References/03_tech_data_model.md +++ b/docs/03_Technical_References/03_tech_data_model.md @@ -3,15 +3,15 @@ doc_type: technical_reference audience: developer, architect scope: database, qdrant, schema status: active -version: 2.7.0 -context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen." +version: 2.8.0 +context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen. 
Berücksichtigt WP-14 Modularisierung und WP-15b Multi-Hashes." --- # Technisches Datenmodell (Qdrant Schema) ## 1. Collections & Namenskonvention -Mindnet speichert Daten in drei getrennten Qdrant-Collections. Der Prefix ist via ENV `COLLECTION_PREFIX` konfigurierbar (Default: `mindnet`). +Mindnet speichert Daten in drei getrennten Qdrant-Collections. Der Prefix ist via ENV `COLLECTION_PREFIX` konfigurierbar (Default: `mindnet`). Die Auflösung erfolgt zentral über `app.core.database.collection_names`. Das System nutzt folgende drei Collections: * `{prefix}_notes`: Metadaten der Dateien. @@ -28,9 +28,10 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung). ```json { - "note_id": "string (keyword)", // UUIDv5 (deterministisch) oder Slug + "note_id": "string (keyword)", // UUIDv5 (deterministisch via NAMESPACE_URL) "title": "string (text)", // Titel aus Frontmatter "type": "string (keyword)", // Logischer Typ (z.B. 'project', 'concept') + "status": "string (keyword)", // Lifecycle: 'stable', 'active', 'draft', 'system' (WP-22) "retriever_weight": "float", // Effektive Wichtigkeit (Frontmatter > Type > Default) "chunk_profile": "string", // Effektives Profil (Frontmatter > Type > Default) "edge_defaults": ["string"], // Liste der aktiven Default-Kanten @@ -40,7 +41,7 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung). "updated": "integer", // Timestamp (File Modification Time) "fulltext": "string (no-index)", // Gesamter Text (nur für Recovery/Export) - // NEU in v2.7: Multi-Hash für flexible Change Detection + // Multi-Hash für flexible Change Detection (WP-15b) "hashes": { "body:parsed:canonical": "string", // Hash nur über den Text-Body "full:parsed:canonical": "string" // Hash über Text + Metadaten (Tags, Title, Config) @@ -52,6 +53,7 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung). 
Es müssen Payload-Indizes für folgende Felder existieren: * `note_id` * `type` +* `status` * `tags` --- @@ -61,7 +63,7 @@ Es müssen Payload-Indizes für folgende Felder existieren: Die atomare Sucheinheit. Enthält den Vektor. **Vektor-Konfiguration:** -* Modell: `nomic-embed-text` +* Modell: `nomic-embed-text` (via Ollama oder Cloud) * Dimension: **768** * Metrik: Cosine Similarity @@ -69,7 +71,7 @@ Die atomare Sucheinheit. Enthält den Vektor. ```json { - "chunk_id": "string (keyword)", // Format: {note_id}#c{index} (z.B. 'abc-123#c01') + "chunk_id": "string (keyword)", // Format: UUIDv5 aus {note_id}#c{index} "note_id": "string (keyword)", // Foreign Key zur Note "type": "string (keyword)", // Kopie aus Note (Denormalisiert für Filterung) "text": "string (text)", // Reintext für Anzeige (ohne Overlap) @@ -120,4 +122,5 @@ Es müssen Payload-Indizes für folgende Felder existieren: * `target_id` * `kind` * `scope` -* `note_id` \ No newline at end of file +* `note_id` +``` \ No newline at end of file diff --git a/docs/03_Technical_References/03_tech_ingestion_pipeline.md b/docs/03_Technical_References/03_tech_ingestion_pipeline.md index 901a05d..146baa3 100644 --- a/docs/03_Technical_References/03_tech_ingestion_pipeline.md +++ b/docs/03_Technical_References/03_tech_ingestion_pipeline.md @@ -1,71 +1,77 @@ --- doc_type: technical_reference audience: developer, devops -scope: backend, ingestion, smart_edges, edge_registry +scope: backend, ingestion, smart_edges, edge_registry, modularization status: active -version: 2.8.1 -context: "Detaillierte technische Beschreibung der Import-Pipeline, Mistral-safe Parsing und Deep Fallback Resilienz." +version: 2.9.0 +context: "Detaillierte technische Beschreibung der Import-Pipeline, Two-Pass-Workflow (WP-15b) und modularer Datenbank-Architektur (WP-14). Integriert Mistral-safe Parsing und Deep Fallback." 
--- # Ingestion Pipeline & Smart Processing -**Quellen:** `pipeline_playbook.md`, `ingestion.py`, `edge_registry.py`, `01_edge_vocabulary.md`, `llm_service.py` +**Quellen:** `pipeline_playbook.md`, `ingestion_processor.py`, `ingestion_db.py`, `ingestion_validation.py`, `registry.py`, `edge_registry.py` + +Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py` (CLI) oder `routers/ingest.py` (API). Seit v2.9 nutzt dieser Prozess ein hocheffizientes **Two-Pass-Verfahren**, um globale Kontext-Informationen für die semantische Validierung bereitzustellen, ohne die Idempotenz oder die Change-Detection zu verletzen. + -Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py` (CLI) oder `routers/ingest.py` (API). Seit v2.8 integriert dieser Prozess eine **intelligente Quoten-Steuerung** (WP-20) und ein **robustes JSON-Parsing** für Cloud-Modelle (Mistral/Gemini). ## 1. Der Import-Prozess (16-Schritte-Workflow) -Der Prozess ist **asynchron** und **idempotent**. +Der Prozess ist **asynchron**, **idempotent** und wird nun in zwei logische Durchläufe (Passes) unterteilt, um die semantische Genauigkeit zu maximieren. +### Phase 1: Pre-Scan & Context (Pass 1) 1. **Trigger & Async Dispatch:** * **API (`/save`):** Nimmt Request entgegen, validiert und startet Background-Task ("Fire & Forget"). Antwortet sofort mit `202/Queued`. * **CLI:** Iteriert über Dateien und nutzt `asyncio.Semaphore` zur Drosselung. -2. **Markdown lesen:** Rekursives Scannen des Vaults. +2. **Markdown lesen:** Rekursives Scannen des Vaults zur Erstellung des Dateiinventars. 3. **Frontmatter Check & Hard Skip (WP-22):** * Extraktion von `status` und `type`. - * **Hard Skip Rule:** Wenn `status` in `['system', 'template', 'archive', 'hidden']` ist, wird die Datei **sofort übersprungen**. Sie wird weder vektorisiert noch in den Graphen aufgenommen. 
+ * **Hard Skip Rule:** Wenn `status` in `['system', 'template', 'archive', 'hidden']` ist, wird die Datei für das Deep-Processing übersprungen, ihre Metadaten werden jedoch für den Kontext-Cache erfasst. * Validierung der Pflichtfelder (`id`, `title`) für alle anderen Dateien. 4. **Edge Registry Initialisierung (WP-22):** * Laden der Singleton-Instanz der `EdgeRegistry`. * Validierung der Vokabular-Datei unter `MINDNET_VOCAB_PATH`. -5. **Config Resolution:** - * Bestimmung von `chunking_profile` und `retriever_weight`. +5. **Config Resolution (WP-14):** + * Bestimmung von `chunking_profile` und `retriever_weight` via zentraler `TypeRegistry`. * **Priorität:** 1. Frontmatter (Override) -> 2. `types.yaml` (Type) -> 3. Global Default. -6. **Note-Payload generieren:** - * Erstellen des JSON-Objekts inklusive `status` (für Scoring). - * **Multi-Hash Calculation:** Berechnet Hashtabellen für `body` (nur Text) und `full` (Text + Metadaten). -7. **Change Detection:** - * Vergleich des Hashes mit Qdrant. - * Strategie wählbar via ENV `MINDNET_CHANGE_DETECTION_MODE` (`full` oder `body`). -8. **Chunking anwenden:** Zerlegung des Textes basierend auf dem ermittelten Profil (siehe Kap. 3). -9. **Smart Edge Allocation (WP-20):** - * Wenn `enable_smart_edge_allocation: true`: Der `SemanticAnalyzer` sendet Chunks an das LLM. - * **Traffic Control:** Request nutzt `priority="background"`. Semaphore drosselt die Last. - * **Resilienz (Quota Handling):** Erkennt HTTP 429 (Rate-Limit) und pausiert kontrolliert (via `LLM_RATE_LIMIT_WAIT`), bevor ein Cloud-Retry erfolgt. - * **Mistral-safe Parsing:** Automatisierte Bereinigung von BOS-Tokens (``) und Framework-Tags (`[OUT]`) sowie Recovery-Logik für Dictionaries (Suche nach `edges`, `links`, `results`, `kanten`). - * **Deep Fallback (v2.11.14):** Erkennt "Silent Refusals" (Data Policy Violations). 
Liefert die Cloud trotz erfolgreicher Verbindung keine verwertbaren Kanten, wird ein lokaler Fallback via Ollama erzwungen, um Kantenverlust zu vermeiden. -10. **Inline-Kanten finden:** Parsing von `[[rel:...]]`. -11. **Alias-Auflösung & Kanonisierung (WP-22):** - * Jede Kante wird via `edge_registry.resolve()` normalisiert. - * Aliase (z.B. `basiert_auf`) werden zu kanonischen Typen (z.B. `based_on`) aufgelöst. +6. **LocalBatchCache & Summary Generation (WP-15b):** + * Erstellung von Kurz-Zusammenfassungen für jede Note. + * Speicherung im `batch_cache` als Referenzrahmen für die spätere Kantenvalidierung. + +### Phase 2: Semantic Processing & Persistence (Pass 2) +7. **Note-Payload & Multi-Hash (WP-15b):** + * Erstellen des JSON-Objekts inklusive `status`. + * **Multi-Hash Calculation:** Berechnet Hashtabellen für `body` (nur Text) und `full` (Text + Metadaten) zur präzisen Änderungskontrolle. +8. **Change Detection:** + * Vergleich des aktuellen Hashes mit den Daten in Qdrant (Collection `{prefix}_notes`). + * Strategie wählbar via ENV `MINDNET_CHANGE_DETECTION_MODE` (`full` oder `body`). Unveränderte Dateien werden hier final übersprungen. +9. **Purge Old Artifacts (WP-14):** + * Bei Änderungen löscht `purge_artifacts()` via `app.core.ingestion.ingestion_db` alle alten Chunks und Edges der Note. + * Die Namensauflösung erfolgt nun über das modularisierte `database`-Paket. +10. **Chunking anwenden:** Zerlegung des Textes basierend auf dem ermittelten Profil (siehe Kap. 3). +11. **Smart Edge Allocation & Semantic Validation (WP-15b):** + * Der `SemanticAnalyzer` schlägt Kanten-Kandidaten vor. + * **Validierung:** Jeder Kandidat wird durch das LLM semantisch gegen das Ziel im **LocalBatchCache** geprüft. + * **Traffic Control:** Nutzung der neutralen `clean_llm_text` Funktion zur Bereinigung von Steuerzeichen (BOS-Token `<s>`, `[OUT]`). + * **Deep Fallback (v2.11.14):** Erkennt "Silent Refusals". 
Liefert die Cloud keine verwertbaren Kanten, wird ein lokaler Fallback via Ollama erzwungen. +12. **Inline-Kanten finden:** Parsing von `[[rel:...]]` und Callouts. +13. **Alias-Auflösung & Kanonisierung (WP-22):** + * Jede Kante wird via `EdgeRegistry` normalisiert (z.B. `basiert_auf` -> `based_on`). * Unbekannte Typen werden in `unknown_edges.jsonl` protokolliert. -12. **Callout-Kanten finden:** Parsing von `> [!edge]`. -13. **Default- & Matrix-Edges erzeugen:** Anwendung der `edge_defaults` aus Registry und Matrix-Logik. -14. **Strukturkanten erzeugen:** `belongs_to`, `next`, `prev`. -15. **Embedding (Async):** Generierung via `nomic-embed-text` (768 Dim). -16. **Diagnose:** Integritäts-Check nach dem Lauf. +14. **Default- & Strukturkanten:** Anwendung der `edge_defaults` und Erzeugung von Systemkanten (`belongs_to`, `next`, `prev`). +15. **Embedding (Async):** Generierung der Vektoren via `nomic-embed-text` (768 Dimensionen). +16. **Database Sync (WP-14):** Batch-Upsert aller Points in die Collections `{prefix}_chunks` und `{prefix}_edges` über die zentrale Infrastruktur. --- ## 2. Betrieb & CLI Befehle ### 2.1 Standard-Betrieb (Inkrementell) -Für regelmäßige Updates (Cronjob). Erkennt Änderungen via Hash. +Erkennt Änderungen via Multi-Hash. ```bash export QDRANT_URL="http://localhost:6333" export COLLECTION_PREFIX="mindnet" -# Steuert, wann eine Datei als "geändert" gilt export MINDNET_CHANGE_DETECTION_MODE="full" # Nutzt das Venv der Produktionsumgebung @@ -78,20 +84,13 @@ export MINDNET_CHANGE_DETECTION_MODE="full" ``` > **[!WARNING] Purge-Before-Upsert** -> Das Flag `--purge-before-upsert` ist kritisch. Es löscht vor dem Schreiben einer Note ihre alten Chunks/Edges. Ohne dieses Flag entstehen **"Geister-Chunks"** (alte Textabschnitte, die im Markdown gelöscht wurden, aber im Index verbleiben). +> Das Flag `--purge-before-upsert` nutzt nun `ingestion_db.purge_artifacts`. 
Es ist kritisch, um "Geister-Chunks" (verwaiste Daten nach Textlöschung) konsistent aus den spezialisierten Collections zu entfernen. ### 2.2 Full Rebuild (Clean Slate) -Notwendig bei Änderungen an `types.yaml` (z.B. neue Chunking-Profile), der Registry oder Modell-Wechsel. +Notwendig bei Änderungen an `types.yaml`, der Registry oder Modell-Wechsel. ```bash -# 0. Modell sicherstellen -ollama pull nomic-embed-text - -# 1. Qdrant Collections löschen (Wipe) -python3 -m scripts.reset_qdrant --mode wipe --prefix "mindnet" --yes - -# 2. Vollständiger Import (Force) -# --force ignoriert alle Hashes und schreibt alles neu +# --force ignoriert alle Hashes und erzwingt den vollständigen Two-Pass Workflow python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force ``` @@ -99,22 +98,20 @@ python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply -- ## 3. Chunking & Payload -Das Chunking ist profilbasiert und in `types.yaml` konfiguriert. +Das Chunking ist profilbasiert und bezieht seine Konfiguration dynamisch aus der `TypeRegistry`. -### 3.1 Profile und Strategien (Vollständige Referenz) +### 3.1 Profile und Strategien | Profil | Strategie | Parameter | Einsatzgebiet | | :--- | :--- | :--- | :--- | -| `sliding_short` | `sliding_window` | Max: 350, Target: 200 | Kurze Logs, Chats, Risiken. | -| `sliding_standard` | `sliding_window` | Max: 650, Target: 450 | Massendaten (Journal, Quellen). | -| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte mit hohem Wert (Projekte). | -| `structured_smart_edges` | `by_heading` | `strict: false` (Soft) | Strukturierte Texte, Merging erlaubt. | -| `structured_smart_edges_strict` | `by_heading` | `strict: true` (Hard) | **Atomare Einheiten**: Entscheidungen, Werte. | -| `structured_smart_edges_strict_L3`| `by_heading` | `strict: true`, `level: 3` | Tief geschachtelte Prinzipien (Tier 2/MP1). 
| +| `sliding_short` | `sliding_window` | Max: 350, Target: 200 | Kurze Logs, Chats. | +| `sliding_standard` | `sliding_window` | Max: 650, Target: 450 | Standard-Wissen. | +| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte (Projekte). | +| `structured_smart_edges` | `by_heading` | `strict: false` | Strukturierte Texte. | ### 3.2 Die `by_heading` Logik (v2.9 Hybrid) -Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt seit v2.9 ein "Safety Net" gegen zu große Chunks. +Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt ein "Safety Net" gegen zu große Chunks. * **Split Level:** Definiert die Tiefe (z.B. `2` = H1 & H2 triggern Split). * **Modus "Strict" (`strict_heading_split: true`):** @@ -126,12 +123,6 @@ Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). * **Füll-Logik:** Überschriften *auf* dem Split-Level lösen nur dann einen neuen Chunk aus, wenn der aktuelle Chunk die `target`-Größe erreicht hat. * *Safety Net:* Auch hier greift das `max` Token Limit. -### 3.3 Payload-Felder (Qdrant) - -* `text`: Der reine Inhalt (Anzeige im UI). -* `window`: Inhalt plus Overlap (für Embedding). -* `chunk_profile`: Das effektiv genutzte Profil (zur Nachverfolgung). - --- ## 4. Edge-Erzeugung & Prioritäten (Provenance) @@ -143,7 +134,7 @@ Kanten werden nach Vertrauenswürdigkeit (`provenance`) priorisiert. Die höhere | **1** | Wikilink | `explicit:wikilink` | **1.00** | Harte menschliche Setzung. | | **2** | Inline | `inline:rel` | **0.95** | Typisierte menschliche Kante. | | **3** | Callout | `callout:edge` | **0.90** | Explizite Meta-Information. | -| **4** | Semantic AI | `semantic_ai` | **0.90** | KI-extrahierte Verbindung (Mistral-safe). | +| **4** | Semantic AI | `semantic_ai` | **0.90** | KI-validiert gegen LocalBatchCache. | | **5** | Type Default | `edge_defaults` | **0.70** | Heuristik aus der Registry. 
| | **6** | Struktur | `structure` | **1.00** | System-interne Verkettung (`belongs_to`). | @@ -151,18 +142,8 @@ Kanten werden nach Vertrauenswürdigkeit (`provenance`) priorisiert. Die höhere ## 5. Quality Gates & Monitoring -In v2.7+ wurden Tools zur Überwachung der Datenqualität integriert: +**1. Registry Review (WP-14):** Prüfung der `data/logs/unknown_edges.jsonl`. Die zentrale Auflösung via `registry.py` verhindert Inkonsistenzen zwischen Import und Retrieval. -**1. Registry Review:** Prüfung der `data/logs/unknown_edges.jsonl`. Administratoren sollten hier gelistete Begriffe als Aliase in die `01_edge_vocabulary.md` aufnehmen. +**2. Mistral-safe Parsing:** Automatisierte Bereinigung von LLM-Antworten in `ingestion_validation.py`. Stellt sicher, dass semantische Entscheidungen ("YES"/"NO") nicht durch technische Header verfälscht werden. -**2. Payload Dryrun (Schema-Check):** -Simuliert Import, prüft JSON-Schema Konformität. -```bash -python3 -m scripts.payload_dryrun --vault ./test_vault -``` - -**3. Full Edge Check (Graph-Integrität):** -Prüft Invarianten (z.B. `next` muss reziprok zu `prev` sein). -```bash -python3 -m scripts.edges_full_check -``` \ No newline at end of file +**3. Purge Integrity:** Validierung, dass vor jedem Upsert alle assoziierten Artefakte in den Collections `{prefix}_chunks` und `{prefix}_edges` gelöscht wurden, um Daten-Duplikate zu vermeiden. 
\ No newline at end of file diff --git a/docs/03_Technical_References/03_tech_retrieval_scoring.md b/docs/03_Technical_References/03_tech_retrieval_scoring.md index f1a4bc7..b2cb15d 100644 --- a/docs/03_Technical_References/03_tech_retrieval_scoring.md +++ b/docs/03_Technical_References/03_tech_retrieval_scoring.md @@ -3,13 +3,13 @@ doc_type: technical_reference audience: developer, data_scientist scope: backend, retrieval, scoring, modularization status: active -version: 2.7.1 -context: "Detaillierte Dokumentation der Scoring-Algorithmen, inklusive WP-22 Lifecycle-Modifier, Intent-Boosting und Modularisierung." +version: 2.9.0 +context: "Detaillierte Dokumentation der Scoring-Algorithmen, inklusive WP-22 Lifecycle-Modifier, Intent-Boosting und WP-14 Modularisierung." --- # Retrieval & Scoring Algorithmen -Der Retriever unterstützt **Semantic Search** und **Hybrid Search**. Seit v2.4 nutzt Mindnet ein gewichtetes Scoring-Modell, das Semantik, Graphentheorie und Metadaten kombiniert. Mit Version 2.7 (WP-22) wurde dieses Modell um **Lifecycle-Faktoren** und **Intent-Boosting** erweitert sowie die Architektur modularisiert. +Der Retriever unterstützt **Semantic Search** und **Hybrid Search**. Seit v2.4 nutzt Mindnet ein gewichtetes Scoring-Modell, das Semantik, Graphentheorie und Metadaten kombiniert. Mit Version 2.7 (WP-22) wurde dieses Modell um **Lifecycle-Faktoren** und **Intent-Boosting** erweitert sowie die Architektur modularisiert (WP-14). ## 1. Scoring Formel (v2.7.0) @@ -37,18 +37,19 @@ $$ * **Zweck:** Belohnt Chunks, die "im Thema" vernetzt sind. **4. Centrality Bonus ($B_{cent}$):** -* **Kontext:** Berechnet im lokalen Subgraphen. +* **Kontext:** Berechnet im lokalen Subgraphen via `graph_subgraph.centrality_bonus`. * **Logik:** Vereinfachte PageRank-Metrik (Degree Centrality). * **Zweck:** Belohnt "Hubs" mit vielen Verbindungen zu anderen Treffern. ### Die WP-22 Erweiterungen (v2.7.0) **5. 
Status Modifier ($M_{status}$):** -* **Herkunft:** Feld `status` aus dem Frontmatter. +* **Herkunft:** Feld `status` aus dem Frontmatter (verarbeitet in `retriever_scoring.get_status_multiplier`). * **Zweck:** Bestraft unfertiges Wissen (Drafts) oder bevorzugt stabiles Wissen. -* **Werte (Auftrag WP-22):** * `stable`: **1.2** (Bonus für Qualität). - * `draft`: **0.5** (Malus für Entwürfe). - * `system`: Exkludiert (siehe Ingestion). +* **Werte (Auftrag WP-22):** * `stable`: **1.2** (Belohnung für verifiziertes Wissen). + * `active`: **1.0** (Standard-Gewichtung). + * `draft`: **0.5** (Malus für unfertige Fragmente). + * `system`: Exkludiert (siehe Ingestion Lifecycle Filter). **6. Intent Boost ($B_{intent}$):** * **Herkunft:** Dynamische Injektion durch die Decision Engine basierend auf der Nutzerfrage. @@ -56,47 +57,61 @@ $$ --- -## 2. Hybrid Retrieval Flow & Modularisierung +## 2. Hybrid Retrieval Flow & Modularisierung (WP-14) -In v2.7 wurde die Engine in einen Orchestrator (`retriever.py`) und eine Scoring-Engine (`retriever_scoring.py`) aufgeteilt. +Seit v2.9 ist die Retrieval-Engine im spezialisierten Paket `app.core.retrieval` gekapselt. Die Zuständigkeiten sind strikt zwischen Orchestrierung und mathematischer Bewertung getrennt. **Phase 1: Vector Search (Seed Generation)** -* Der Orchestrator sucht Top-K (Standard: 20) Kandidaten via Embeddings in Qdrant. +* Der Orchestrator (`retriever.py`) sucht Top-K (Standard: 20) Kandidaten via Embeddings in Qdrant über das modularisierte `app.core.database` Paket. * Diese bilden die "Seeds" für den Graphen. **Phase 2: Graph Expansion** -* Nutze `graph_adapter.expand(seeds, depth=1)`. -* Lade direkte Nachbarn aus der `_edges` Collection. -* Konstruiere einen `NetworkX`-Graphen im Speicher. +* Nutze die Fassade `app.core.graph_adapter.expand(seeds, depth=1)`. +* Diese delegiert an `app.core.graph.graph_subgraph`, um direkte Nachbarn aus der `_edges` Collection zu laden. 
+* Konstruktion eines in-memory Graphen zur Berechnung topologischer Boni. **Phase 3: Re-Ranking (Modular)** -* Der Orchestrator übergibt den Graphen und die Seeds an die `ScoringEngine`. -* Berechne Boni ($B_{edge}$, $B_{cent}$) sowie die neuen Lifecycle- und Intent-Modifier. -* Sortierung absteigend nach `TotalScore` und Limitierung auf Top-Resultate (z.B. 5). +* Der Orchestrator übergibt den Graphen und die Seeds an die `ScoringEngine` (`retriever_scoring.py`). +* Berechnung der finalen Scores unter Berücksichtigung von $B_{edge}$, $B_{cent}$ sowie der Lifecycle- und Intent-Modifier. +* Sortierung absteigend nach `TotalScore` und Limitierung auf die angeforderten Top-Resultate. --- ## 3. Explanation Layer (WP-22 Update) -Bei `explain=True` generiert das System eine detaillierte Begründung. +Bei `explain=True` generiert das System eine detaillierte Begründung inklusive Provenienz-Informationen. **Erweiterte JSON-Struktur:** ```json { "score_breakdown": { - "semantic": 0.85, - "type_boost": 1.0, - "lifecycle_modifier": 0.5, - "edge_bonus": 0.4, - "intent_boost": 0.5, - "centrality": 0.1 + "semantic_contribution": 0.85, + "edge_contribution": 0.4, + "centrality_contribution": 0.1, + "raw_semantic": 0.85, + "raw_edge_bonus": 0.3, + "raw_centrality": 0.1, + "node_weight": 1.0, + "status_multiplier": 1.2, + "graph_boost_factor": 1.5 }, "reasons": [ - "Hohe textuelle Übereinstimmung (>0.85).", - "Status 'draft' reduziert Relevanz (Modifier 0.5).", - "Wird referenziert via 'caused_by' (Intent-Bonus 0.5).", - "Bevorzugt, da Typ 'decision' (Gewicht 1.0)." + { + "kind": "semantic", + "message": "Hohe textuelle Übereinstimmung (>0.85).", + "score_impact": 0.85 + }, + { + "kind": "type", + "message": "Bevorzugt durch Typ-Profil.", + "score_impact": 0.1 + }, + { + "kind": "edge", + "message": "Bestätigte Kante 'caused_by' [Boost x1.5] von 'Note-A'.", + "score_impact": 0.4 + } ] } ``` @@ -105,18 +120,18 @@ Bei `explain=True` generiert das System eine detaillierte Begründung. 
## 4. Konfiguration (`retriever.yaml`) -Steuert die Gewichtung der mathematischen Komponenten. +Steuert die globale Gewichtung der mathematischen Komponenten. ```yaml scoring: - semantic_weight: 1.0 # Basis-Relevanz - edge_weight: 0.7 # Graphen-Einfluss - centrality_weight: 0.5 # Hub-Einfluss + semantic_weight: 1.0 # Basis-Relevanz (W_sem) + edge_weight: 0.7 # Graphen-Einfluss (W_edge) + centrality_weight: 0.5 # Hub-Einfluss (W_cent) -# WP-22 Lifecycle Konfiguration (Abgleich mit Auftrag) +# WP-22 Lifecycle Konfiguration lifecycle_weights: - stable: 1.2 # Bonus für Qualität - draft: 0.5 # Malus für Entwürfe + stable: 1.2 # Modifier für Qualität + draft: 0.5 # Modifier für Entwürfe # Kanten-Gewichtung für den Edge-Bonus (Basis) edge_weights: diff --git a/docs/05_Development/05_developer_guide.md b/docs/05_Development/05_developer_guide.md index 17ed425..831f285 100644 --- a/docs/05_Development/05_developer_guide.md +++ b/docs/05_Development/05_developer_guide.md @@ -1,10 +1,10 @@ --- doc_type: developer_guide audience: developer -scope: workflow, testing, architecture, modules +scope: workflow, testing, architecture, modules, modularization status: active -version: 2.6.1 -context: "Umfassender Guide für Entwickler: Architektur, Modul-Interna (Deep Dive), Setup, Git-Workflow und Erweiterungs-Anleitungen." +version: 2.9.1 +context: "Umfassender Guide für Entwickler: Modularisierte Architektur (WP-14), Two-Pass Ingestion (WP-15b), Modul-Interna, Setup und Git-Workflow." --- # Mindnet Developer Guide & Workflow @@ -23,8 +23,6 @@ Dieser Guide ist die zentrale technische Referenz für Mindnet v2.6. Er vereint - [Kern-Philosophie](#kern-philosophie) - [2. Architektur](#2-architektur) - [2.1 High-Level Übersicht](#21-high-level-übersicht) - - [2.2 Datenfluss-Muster](#22-datenfluss-muster) - - [A. Ingestion (Write)](#a-ingestion-write) - [B. Retrieval (Read)](#b-retrieval-read) - [C. Visualisierung (Graph)](#c-visualisierung-graph) - [3. 
Physische Architektur](#3-physische-architektur) @@ -84,23 +82,28 @@ graph TD API["main.py"] RouterChat["Chat / RAG"] RouterIngest["Ingest / Write"] - CoreRet["Retriever Engine"] - CoreIngest["Ingestion Pipeline"] + + subgraph "Core Packages (WP-14)" + PkgRet["retrieval/ (Search)"] + PkgIng["ingestion/ (Import)"] + PkgGra["graph/ (Logic)"] + PkgDb["database/ (Infrastr.)"] + Registry["registry.py (Neutral)"] + end end subgraph "Infrastructure & Services" - LLM["Ollama (Phi3/Nomic)"] + LLM["Ollama / Cloud (Hybrid)"] DB[("Qdrant Vector DB")] FS["File System (.md)"] end User <--> UI - UI -- "REST (Chat, Save, Feedback)" --> API - UI -. "Direct Read (Graph Viz Performance)" .-> DB - API -- "Embeddings & Completion" --> LLM - API -- "Read/Write" --> DB - API -- "Read/Write (Source of Truth)" --> FS -``` + UI -- "REST Call" --> API + PkgRet -- "Direct Query" --> PkgDb + PkgIng -- "Process & Write" --> PkgDb + PkgDb -- "API" --> DB + API -- "Inference" --> LLM``` ### 2.2 Datenfluss-Muster @@ -108,14 +111,12 @@ graph TD Vom Markdown zur Vektor-Datenbank. ```mermaid graph LR - MD["Markdown File"] --> Parser("Parser") - Parser --> Chunker("Chunker") - Chunker -- "Text Chunks" --> SemAn{"SemanticAnalyzer
(LLM)"} - SemAn -- "Smart Edges" --> Embedder("Embedder") - Embedder --> DB[("Qdrant
Points")] - - style DB fill:#f9f,stroke:#333,stroke-width:2px - style SemAn fill:#ff9,stroke:#333,stroke-width:2px + MD["Markdown File"] --> Pass1["Pass 1: Pre-Scan"] + Pass1 --> Cache[("LocalBatchCache
(Titles/Summaries)")] + MD --> Pass2["Pass 2: Processing"] + Cache -- "Context" --> SmartEdges{"Smart Edge
Validation"} + SmartEdges --> Embedder("Embedder") + Embedder --> DB[("Qdrant Points")] ``` #### B. Retrieval (Read) @@ -123,17 +124,10 @@ Die hybride Suche für Chat & RAG. ```mermaid graph LR Query(["Query"]) --> Embed("Embedding") - Embed --> Hybrid{"Hybrid Search"} - - subgraph Search Components - Vec["Vector Score"] - Graph["Graph/Edge Bonus"] - end - - Vec --> Hybrid - Graph --> Hybrid - - Hybrid --> Rank("Re-Ranking") + Embed --> Seed["Seed Search (Vector)"] + Seed --> Expand{"Graph Expansion"} + Expand --> Scoring["Scoring Engine (WP-22)"] + Scoring --> Rank("Final Ranking") Rank --> Ctx["LLM Context"] ``` @@ -170,6 +164,12 @@ Das System ist modular aufgebaut. Hier ist die detaillierte Analyse aller Kompon mindnet/ ├── app/ │ ├── core/ # Business Logic & Algorithms +│ │ ├── database/ # WP-14: Qdrant Client & Point Mapping +│ │ ├── ingestion/ # WP-14: Pipeline, Multi-Hash, Validation +│ │ ├── retrieval/ # WP-14: Search Orchestrator & Scoring +│ │ ├── graph/ # WP-14: Subgraph-Logik & Weights +│ │ ├── registry.py # SSOT: Circular Import Fix & Text Cleanup +│ │ └── *.py (Proxy) # Legacy Bridges für Abwärtskompatibilität │ ├── routers/ # API Interface (FastAPI) │ ├── services/ # External Integrations (LLM, DB) │ ├── models/ # Pydantic DTOs @@ -285,6 +285,8 @@ Folgende Dateien wurden im Audit v2.6 als veraltet, redundant oder "Zombie-Code" | `app/core/type_registry.py` | **Redundant.** Logik in `ingestion.py` integriert. | 🗑️ Löschen | | `app/core/env_vars.py` | **Veraltet.** Ersetzt durch `config.py`. | 🗑️ Löschen | | `app/services/llm_ollama.py` | **Veraltet.** Ersetzt durch `llm_service.py`. | 🗑️ Löschen | +| `app/core/type_registry.py` | **Redundant.** Logik in `app/core/registry.py` integriert. | 🗑️ Löschen | +| `app/core/ranking.py` | **Redundant.** Logik in `retrieval/retriever_scoring.py` integriert. 
| 🗑️ Löschen | --- diff --git a/docs/06_Roadmap/06_active_roadmap.md b/docs/06_Roadmap/06_active_roadmap.md index 59df0a0..7b9be49 100644 --- a/docs/06_Roadmap/06_active_roadmap.md +++ b/docs/06_Roadmap/06_active_roadmap.md @@ -2,18 +2,14 @@ doc_type: roadmap audience: product_owner, developer status: active -version: 2.8.0 -context: "Aktuelle Planung für kommende Features (ab WP16), Release-Strategie und Historie der abgeschlossenen WPs." +version: 2.9.1 +context: "Aktuelle Planung für kommende Features (ab WP16), Release-Strategie und Historie der abgeschlossenen WPs nach WP-14/15b." --- # Mindnet Active Roadmap -**Aktueller Stand:** v2.8.0 (Post-WP20/WP76) -**Fokus:** Visualisierung, Exploration & Cloud-Resilienz. - -## 1. Programmstatus - -Wir haben mit der Implementierung des Graph Explorers (WP19), der Smart Edge Allocation (WP15) und der hybriden Cloud-Resilienz (WP20) die Basis für ein intelligentes, robustes System gelegt. Der nächste Schritt (WP19a) vertieft die Analyse, während WP16 die "Eingangs-Intelligenz" erhöht. +**Aktueller Stand:** v2.9.1 (Post-WP14 / WP-15b) +**Fokus:** Modularisierung, Two-Pass Ingestion & Graph Intelligence. | Phase | Fokus | Status | | :--- | :--- | :--- | @@ -45,6 +41,8 @@ Eine Übersicht der implementierten Features zum schnellen Auffinden von Funktio | **WP-10** | Web UI | Streamlit-Frontend als Ersatz für das Terminal. | | **WP-10a**| Draft Editor | GUI-Komponente zum Bearbeiten und Speichern generierter Notizen. | | **WP-11** | Backend Intelligence | `nomic-embed-text` (768d) und Matrix-Logik für Kanten-Typisierung. | +| **WP-14** | **Modularisierung & Refactoring** | **Ergebnis:** Aufteilung in domänenspezifische Pakete (`database`, `ingestion`, `retrieval`, `graph`). Implementierung von Proxy-Adaptern für Abwärtskompatibilität und `registry.py` zur Lösung von Zirkelbezügen. | +| **WP-15b**| **Candidate-Based Validation** | **Ergebnis:** Implementierung des **Two-Pass Workflows**. 
Einführung des `LocalBatchCache` und binäre semantische Validierung von Kanten-Kandidaten zur Vermeidung von Halluzinationen. | | **WP-15** | Smart Edge Allocation | LLM-Filter für Kanten in Chunks + Traffic Control (Semaphore) + Strict Chunking. | | **WP-19** | Graph Visualisierung | **Frontend Modularisierung:** Umbau auf `ui_*.py`.
**Graph Engines:** Parallelbetrieb von Cytoscape (COSE) und Agraph.
**Tools:** "Single Source of Truth" Editor, Persistenz via URL. | | **WP-20** | **Cloud Hybrid Mode & Resilienz** | **Ergebnis:** Integration von OpenRouter (Mistral 7B) & Gemini 2.5 Lite. Implementierung von WP-76 (Rate-Limit Wait) & Mistral-safe JSON Parsing. | @@ -59,6 +57,9 @@ Eine Übersicht der implementierten Features zum schnellen Auffinden von Funktio * **Quoten-Management:** Die Nutzung von Free-Tier Modellen (Mistral/OpenRouter) erfordert zwingend eine intelligente Rate-Limit-Erkennung (HTTP 429) mit automatisierten Wartezyklen, um Batch-Prozesse stabil zu halten. * **Parser-Robustheit:** Cloud-Modelle betten JSON oft in technische Steuerzeichen (``, `[OUT]`) ein. Ein robuster Extraktor mit Recovery-Logik ist essentiell zur Vermeidung von Datenverlust. +### 2.3 WP-14 & WP-15b Lessons Learned +* **Performance:** Der Pre-Scan (Pass 1) ist minimal invasiv, ermöglicht aber in Pass 2 eine drastische Reduktion der LLM-Kosten, da nur noch binär validiert werden muss, anstatt komplexe Extraktionen durchzuführen. +* **Wartbarkeit:** Durch die Paket-Struktur können DB-Adapter (z.B. für Qdrant) nun unabhängig von der Business-Logik (Scoring) aktualisiert werden. --- ## 3. Offene Workpackages (Planung) Diese Features stehen als nächstes an oder befinden sich in der Umsetzung. - Aufwand: Mittel - Komplexität: Niedrig/Mittel + + +### WP-13 – MCP-Integration & Agenten-Layer +**Status:** 🟡 Geplant +**Ziel:** mindnet als MCP-Server bereitstellen, damit Agenten (Claude Desktop, OpenAI) standardisierte Tools nutzen können. +* **Umfang:** MCP-Server mit Tools (`mindnet_query`, `mindnet_explain`, etc.). + +### WP-14 – Review / Refactoring / Dokumentation +**Status:** 🟡 Laufend (Phase E) +**Ziel:** Technische Schulden abbauen, die durch schnelle Feature-Entwicklung (WP15/WP19) entstanden sind. +* **Refactoring `chunker.py`:** Die Datei ist monolithisch geworden (Parsing, Strategien, LLM-Orchestrierung). 
+ * *Lösung:* Aufteilung in ein Package `app/core/chunking/` mit Modulen (`strategies.py`, `orchestration.py`, `utils.py`). +* **Dokumentation:** Kontinuierliche Synchronisation von Code und Docs (v2.8 Stand). + ### WP-15b – Candidate-Based Edge Validation & Inheritance **Phase:** B/E (Refactoring & Semantic) **Status:** 🚀 Startklar (Ersatz für WP-15 Logik) @@ -113,19 +129,6 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h * **Chunker-Update:** Implementierung einer `propagate_edges`-Logik für "by_heading" und "sliding_window" Strategien. * **Ingestion-Update:** Umstellung von `_perform_smart_edge_allocation` auf einen binären Validierungs-Prompt (VALID/INVALID). -### WP-19a – Graph Intelligence & Discovery (Sprint-Fokus) -**Status:** 🚀 Startklar -**Ziel:** Vom "Anschauen" zum "Verstehen". Deep-Dive Werkzeuge für den Graphen. -* **Discovery Screen:** Neuer Tab für semantische Suche ("Finde Notizen über Vaterschaft") und Wildcard-Filter. -* **Filter-Logik:** "Zeige nur Wege, die zu `type:decision` führen". -* **Chunk Inspection:** Umschaltbare Granularität (Notiz vs. Chunk) zur Validierung des Smart Chunkers. - -### WP-14 – Review / Refactoring / Dokumentation -**Status:** 🟡 Laufend (Phase E) -**Ziel:** Technische Schulden abbauen, die durch schnelle Feature-Entwicklung (WP15/WP19) entstanden sind. -* **Refactoring `chunker.py`:** Die Datei ist monolithisch geworden (Parsing, Strategien, LLM-Orchestrierung). - * *Lösung:* Aufteilung in ein Package `app/core/chunking/` mit Modulen (`strategies.py`, `orchestration.py`, `utils.py`). -* **Dokumentation:** Kontinuierliche Synchronisation von Code und Docs (v2.8 Stand). ### WP-16 – Auto-Discovery & Intelligent Ingestion **Status:** 🟡 Geplant @@ -153,10 +156,13 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h * **Feature:** Cronjob `check_graph_integrity.py`. * **Funktion:** Findet "Dangling Edges" (Links auf gelöschte Notizen) und repariert/löscht sie. 
-### WP-13 – MCP-Integration & Agenten-Layer -**Status:** 🟡 Geplant -**Ziel:** mindnet als MCP-Server bereitstellen, damit Agenten (Claude Desktop, OpenAI) standardisierte Tools nutzen können. -* **Umfang:** MCP-Server mit Tools (`mindnet_query`, `mindnet_explain`, etc.). +### WP-19a – Graph Intelligence & Discovery (Sprint-Fokus) +**Status:** 🚀 Startklar +**Ziel:** Vom "Anschauen" zum "Verstehen". Deep-Dive Werkzeuge für den Graphen. +* **Discovery Screen:** Neuer Tab für semantische Suche ("Finde Notizen über Vaterschaft") und Wildcard-Filter. +* **Filter-Logik:** "Zeige nur Wege, die zu `type:decision` führen". +* **Chunk Inspection:** Umschaltbare Granularität (Notiz vs. Chunk) zur Validierung des Smart Chunkers. + ### WP-21 – Semantic Graph Routing & Canonical Edges **Status:** 🟡 Geplant diff --git a/docs/99_Archive/99_legacy_workpackages.md b/docs/99_Archive/99_legacy_workpackages.md index 7eed15f..dccb8da 100644 --- a/docs/99_Archive/99_legacy_workpackages.md +++ b/docs/99_Archive/99_legacy_workpackages.md @@ -91,4 +91,23 @@ Dieses Dokument dient als Referenz für die Entstehungsgeschichte von Mindnet v2 * **Modularisierung:** Aufsplittung der `ui.py` in Router, Services und Views (`ui_*.py`). * **Graph Explorer:** Einführung von `st-cytoscape` für stabile, nicht-überlappende Layouts (COSE) als Ergänzung zur Legacy-Engine (Agraph). * **Single Source of Truth:** Der Editor lädt Inhalte nun direkt vom Dateisystem statt aus (potenziell veralteten) Vektor-Payloads. - * **UX:** Einführung von URL-Persistenz für Layout-Settings und CSS-basiertes Highlighting zur Vermeidung von Re-Renders. \ No newline at end of file + * **UX:** Einführung von URL-Persistenz für Layout-Settings und CSS-basiertes Highlighting zur Vermeidung von Re-Renders. + + +## Phase E+: Architektur-Konsolidierung (WP-14) + +### WP-14 – Modularisierung & Paket-Struktur +* **Ziel:** Auflösung technischer Schulden und Beseitigung von Zirkelbezügen (Circular Imports). 
+* **Ergebnis:** + * **Domänen-Pakete:** Aufteilung der monolithischen `app/core/` Struktur in spezialisierte Pakete: `database/`, `ingestion/`, `retrieval/` und `graph/`. + * **Proxy-Pattern:** Einsatz von Fassaden-Modulen (z. B. `graph_adapter.py`) zur Aufrechterhaltung der Abwärtskompatibilität für bestehende API-Endpunkte. + * **Registry-Zentralisierung:** Auslagerung neutraler Hilfsfunktionen (wie `clean_llm_text`) in eine unabhängige `registry.py`, um Abhängigkeitsschleifen zwischen Diensten zu brechen. +* **Tech:** Einführung von `__init__.py` Exporten zur Definition sauberer Paket-Schnittstellen. + +### WP-15b – Two-Pass Ingestion & Candidate Validation +* **Problem:** Die ursprüngliche Smart Edge Extraktion (WP-15) war teuer und neigte zu Halluzinationen, da sie ohne globalen Kontext operierte. +* **Lösung:** Implementierung eines **Two-Pass Workflows**. + * **Pass 1 (Pre-Scan):** Schnelles Einlesen aller Notizen zur Erstellung eines `LocalBatchCache` (Metadaten & Summaries). + * **Pass 2 (Processing):** Gezielte semantische Verarbeitung nur für geänderte Dateien. +* **Feature:** **Binary Validation Gate**. Statt Kanten frei zu erfinden, validiert das LLM nun Kanten-Kandidaten aus einem Pool gegen den Kontext des `LocalBatchCache`. Dies garantiert 100% Konformität mit der Edge Registry. +* **Ergebnis:** Höhere Geschwindigkeit durch Reduktion komplexer LLM-Prompts auf binäre Entscheidungen (VALID/INVALID). \ No newline at end of file