WP13b Refactoring ingestion und Chunker
This commit is contained in:
parent
cf302e8334
commit
94e5ebf577
|
|
@ -1,393 +1,36 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunker.py
|
FILE: app/core/chunker.py
|
||||||
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings).
|
DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
|
||||||
WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung.
|
WP-14: Modularisierung abgeschlossen.
|
||||||
Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung.
|
WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert.
|
||||||
Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net).
|
Verwendet neue 'chunking_' Präfixe für Untermodule.
|
||||||
VERSION: 3.2.0
|
VERSION: 3.3.0
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging
|
|
||||||
"""
|
"""
|
||||||
|
import asyncio
|
||||||
from __future__ import annotations
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import List, Dict, Optional, Tuple, Any, Set
|
|
||||||
import re
|
import re
|
||||||
import math
|
|
||||||
import yaml
|
|
||||||
from pathlib import Path
|
|
||||||
import asyncio
|
|
||||||
import logging
|
import logging
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
# Services
|
# Interne Package-Imports mit neuer Präfix-Konvention
|
||||||
# In WP-15b wird die KI-Validierung in die ingestion.py verlagert.
|
from .chunking.chunking_models import Chunk, RawBlock
|
||||||
# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen.
|
from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||||
|
from .chunking.chunking_parser import parse_blocks, parse_edges_robust
|
||||||
|
from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading
|
||||||
|
from .chunking.chunking_propagation import propagate_section_edges
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Legacy Support für SemanticAnalyzer (Optional für andere Skripte)
|
||||||
try:
|
try:
|
||||||
from app.services.semantic_analyzer import get_semantic_analyzer
|
from app.services.semantic_analyzer import get_semantic_analyzer
|
||||||
except ImportError:
|
except ImportError:
|
||||||
def get_semantic_analyzer(): return None
|
def get_semantic_analyzer(): return None
|
||||||
|
|
||||||
# Core Imports
|
|
||||||
try:
|
|
||||||
from app.core.derive_edges import build_edges_for_note
|
|
||||||
except ImportError:
|
|
||||||
# Fallback für Standalone-Betrieb oder Tests
|
|
||||||
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 1. HELPER & CONFIG
|
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
|
||||||
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
|
|
||||||
# Fallback Default, falls types.yaml fehlt
|
|
||||||
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}
|
|
||||||
_CONFIG_CACHE = None
|
|
||||||
|
|
||||||
def _load_yaml_config() -> Dict[str, Any]:
|
|
||||||
global _CONFIG_CACHE
|
|
||||||
if _CONFIG_CACHE is not None: return _CONFIG_CACHE
|
|
||||||
if not CONFIG_PATH.exists(): return {}
|
|
||||||
try:
|
|
||||||
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
|
|
||||||
data = yaml.safe_load(f)
|
|
||||||
_CONFIG_CACHE = data
|
|
||||||
return data
|
|
||||||
except Exception: return {}
|
|
||||||
|
|
||||||
def get_chunk_config(note_type: str) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml.
|
|
||||||
Sichert die Kompatibilität zu WP-15 Profilen.
|
|
||||||
"""
|
|
||||||
full_config = _load_yaml_config()
|
|
||||||
profiles = full_config.get("chunking_profiles", {})
|
|
||||||
type_def = full_config.get("types", {}).get(note_type.lower(), {})
|
|
||||||
|
|
||||||
# Welches Profil nutzt dieser Typ? (z.B. 'sliding_smart_edges')
|
|
||||||
profile_name = type_def.get("chunking_profile")
|
|
||||||
|
|
||||||
if not profile_name:
|
|
||||||
profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
|
|
||||||
|
|
||||||
config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
|
|
||||||
|
|
||||||
# Tupel-Konvertierung für Overlap (YAML liest oft Listen)
|
|
||||||
if "overlap" in config and isinstance(config["overlap"], list):
|
|
||||||
config["overlap"] = tuple(config["overlap"])
|
|
||||||
|
|
||||||
return config
|
|
||||||
|
|
||||||
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
|
|
||||||
"""Trennt YAML-Frontmatter vom eigentlichen Text."""
|
|
||||||
fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
|
|
||||||
if not fm_match: return {}, md_text
|
|
||||||
try:
|
|
||||||
frontmatter = yaml.safe_load(fm_match.group(1))
|
|
||||||
if not isinstance(frontmatter, dict): frontmatter = {}
|
|
||||||
except yaml.YAMLError:
|
|
||||||
frontmatter = {}
|
|
||||||
text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL)
|
|
||||||
return frontmatter, text_without_fm.strip()
|
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 2. DATA CLASSES & TEXT TOOLS
|
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
|
||||||
_WS = re.compile(r'\s+')
|
|
||||||
|
|
||||||
def estimate_tokens(text: str) -> int:
|
|
||||||
"""Grobe Schätzung der Token-Anzahl (4 Zeichen pro Token)."""
|
|
||||||
return max(1, math.ceil(len(text.strip()) / 4))
|
|
||||||
|
|
||||||
def split_sentences(text: str) -> list[str]:
|
|
||||||
"""Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion."""
|
|
||||||
text = _WS.sub(' ', text.strip())
|
|
||||||
if not text: return []
|
|
||||||
parts = _SENT_SPLIT.split(text)
|
|
||||||
return [p.strip() for p in parts if p.strip()]
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RawBlock:
|
|
||||||
kind: str
|
|
||||||
text: str
|
|
||||||
level: Optional[int]
|
|
||||||
section_path: str
|
|
||||||
section_title: Optional[str]
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Chunk:
|
|
||||||
id: str
|
|
||||||
note_id: str
|
|
||||||
index: int
|
|
||||||
text: str
|
|
||||||
window: str
|
|
||||||
token_count: int
|
|
||||||
section_title: Optional[str]
|
|
||||||
section_path: str
|
|
||||||
neighbors_prev: Optional[str]
|
|
||||||
neighbors_next: Optional[str]
|
|
||||||
# WP-15b: Liste von Kandidaten für die semantische Validierung
|
|
||||||
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
|
||||||
suggested_edges: Optional[List[str]] = None
|
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 3. PARSING & STRATEGIES
|
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|
||||||
"""
|
|
||||||
Zerlegt Text in logische Blöcke (Absätze, Header).
|
|
||||||
Wichtig für die Strategie 'by_heading' und die Edge-Inheritance.
|
|
||||||
"""
|
|
||||||
blocks = []
|
|
||||||
h1_title = "Dokument"
|
|
||||||
section_path = "/"
|
|
||||||
current_h2 = None
|
|
||||||
|
|
||||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
|
||||||
|
|
||||||
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
|
||||||
if h1_match:
|
|
||||||
h1_title = h1_match.group(1).strip()
|
|
||||||
|
|
||||||
lines = text_without_fm.split('\n')
|
|
||||||
buffer = []
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
stripped = line.strip()
|
|
||||||
if stripped.startswith('# '):
|
|
||||||
continue
|
|
||||||
elif stripped.startswith('## '):
|
|
||||||
if buffer:
|
|
||||||
content = "\n".join(buffer).strip()
|
|
||||||
if content:
|
|
||||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
|
||||||
buffer = []
|
|
||||||
current_h2 = stripped[3:].strip()
|
|
||||||
section_path = f"/{current_h2}"
|
|
||||||
blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
|
|
||||||
elif not stripped:
|
|
||||||
if buffer:
|
|
||||||
content = "\n".join(buffer).strip()
|
|
||||||
if content:
|
|
||||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
|
||||||
buffer = []
|
|
||||||
else:
|
|
||||||
buffer.append(line)
|
|
||||||
|
|
||||||
if buffer:
|
|
||||||
content = "\n".join(buffer).strip()
|
|
||||||
if content:
|
|
||||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
|
||||||
|
|
||||||
return blocks, h1_title
|
|
||||||
|
|
||||||
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
|
|
||||||
"""
|
|
||||||
Standard-Strategie aus WP-15.
|
|
||||||
Fasst Blöcke zusammen und schneidet bei 'target' Tokens.
|
|
||||||
"""
|
|
||||||
target = config.get("target", 400)
|
|
||||||
max_tokens = config.get("max", 600)
|
|
||||||
overlap_val = config.get("overlap", (50, 80))
|
|
||||||
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
|
|
||||||
chunks = []
|
|
||||||
buf = []
|
|
||||||
|
|
||||||
def _create_chunk(txt, win, sec, path):
|
|
||||||
idx = len(chunks)
|
|
||||||
chunks.append(Chunk(
|
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
|
||||||
section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
|
|
||||||
candidate_pool=[]
|
|
||||||
))
|
|
||||||
|
|
||||||
def flush_buffer():
|
|
||||||
nonlocal buf
|
|
||||||
if not buf: return
|
|
||||||
|
|
||||||
text_body = "\n\n".join([b.text for b in buf])
|
|
||||||
sec_title = buf[-1].section_title if buf else None
|
|
||||||
sec_path = buf[-1].section_path if buf else "/"
|
|
||||||
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
|
|
||||||
|
|
||||||
if estimate_tokens(text_body) <= max_tokens:
|
|
||||||
_create_chunk(text_body, win_body, sec_title, sec_path)
|
|
||||||
else:
|
|
||||||
sentences = split_sentences(text_body)
|
|
||||||
current_chunk_sents = []
|
|
||||||
current_len = 0
|
|
||||||
|
|
||||||
for sent in sentences:
|
|
||||||
sent_len = estimate_tokens(sent)
|
|
||||||
if current_len + sent_len > target and current_chunk_sents:
|
|
||||||
c_txt = " ".join(current_chunk_sents)
|
|
||||||
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
|
|
||||||
_create_chunk(c_txt, c_win, sec_title, sec_path)
|
|
||||||
|
|
||||||
overlap_sents = []
|
|
||||||
ov_len = 0
|
|
||||||
for s in reversed(current_chunk_sents):
|
|
||||||
if ov_len + estimate_tokens(s) < overlap:
|
|
||||||
overlap_sents.insert(0, s)
|
|
||||||
ov_len += estimate_tokens(s)
|
|
||||||
else: break
|
|
||||||
|
|
||||||
current_chunk_sents = list(overlap_sents)
|
|
||||||
current_chunk_sents.append(sent)
|
|
||||||
current_len = ov_len + sent_len
|
|
||||||
else:
|
|
||||||
current_chunk_sents.append(sent)
|
|
||||||
current_len += sent_len
|
|
||||||
|
|
||||||
if current_chunk_sents:
|
|
||||||
c_txt = " ".join(current_chunk_sents)
|
|
||||||
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
|
|
||||||
_create_chunk(c_txt, c_win, sec_title, sec_path)
|
|
||||||
buf = []
|
|
||||||
|
|
||||||
for b in blocks:
|
|
||||||
if b.kind == "heading": continue
|
|
||||||
current_buf_text = "\n\n".join([x.text for x in buf])
|
|
||||||
if estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target:
|
|
||||||
flush_buffer()
|
|
||||||
buf.append(b)
|
|
||||||
if estimate_tokens(b.text) >= target:
|
|
||||||
flush_buffer()
|
|
||||||
|
|
||||||
flush_buffer()
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
|
||||||
"""
|
|
||||||
Hybrid-Strategie v2.9 (Strict/Soft/Safety-Net).
|
|
||||||
"""
|
|
||||||
strict = config.get("strict_heading_split", False)
|
|
||||||
target = config.get("target", 400)
|
|
||||||
max_tokens = config.get("max", 600)
|
|
||||||
split_level = config.get("split_level", 2)
|
|
||||||
|
|
||||||
chunks = []
|
|
||||||
current_buf = []
|
|
||||||
current_tokens = 0
|
|
||||||
|
|
||||||
def _flush(sec_title, sec_path):
|
|
||||||
nonlocal current_buf, current_tokens
|
|
||||||
if not current_buf: return
|
|
||||||
txt = "\n\n".join(current_buf)
|
|
||||||
win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt
|
|
||||||
idx = len(chunks)
|
|
||||||
chunks.append(Chunk(
|
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
|
||||||
section_title=sec_title, section_path=sec_path,
|
|
||||||
neighbors_prev=None, neighbors_next=None,
|
|
||||||
candidate_pool=[]
|
|
||||||
))
|
|
||||||
current_buf = []
|
|
||||||
current_tokens = 0
|
|
||||||
|
|
||||||
for b in blocks:
|
|
||||||
if b.kind == "heading":
|
|
||||||
# Hierarchie-Check: Split bei Überschriften oberhalb des Split-Levels
|
|
||||||
if b.level < split_level:
|
|
||||||
_flush(b.section_title, b.section_path)
|
|
||||||
elif b.level == split_level:
|
|
||||||
if strict or current_tokens >= target:
|
|
||||||
_flush(b.section_title, b.section_path)
|
|
||||||
continue
|
|
||||||
|
|
||||||
block_tokens = estimate_tokens(b.text)
|
|
||||||
if current_tokens + block_tokens > max_tokens and current_buf:
|
|
||||||
_flush(b.section_title, b.section_path)
|
|
||||||
|
|
||||||
current_buf.append(b.text)
|
|
||||||
current_tokens += block_tokens
|
|
||||||
|
|
||||||
if current_buf:
|
|
||||||
last = blocks[-1] if blocks else None
|
|
||||||
_flush(last.section_title if last else None, last.section_path if last else "/")
|
|
||||||
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 4. ROBUST EDGE PARSING & PROPAGATION
|
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
def _parse_edges_robust(text: str) -> Set[str]:
|
|
||||||
"""
|
|
||||||
Findet Kanten im Text (Wikilinks, Inlines, Callouts).
|
|
||||||
Fix V3: Support für mehrzeilige Callouts.
|
|
||||||
"""
|
|
||||||
found_edges = set()
|
|
||||||
|
|
||||||
# A. Inline [[rel:type|target]]
|
|
||||||
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
|
||||||
for kind, target in inlines:
|
|
||||||
k = kind.strip().lower()
|
|
||||||
t = target.strip()
|
|
||||||
if k and t: found_edges.add(f"{k}:{t}")
|
|
||||||
|
|
||||||
# B. Multiline Callouts Parsing (WP-15 Fix)
|
|
||||||
lines = text.split('\n')
|
|
||||||
current_edge_type = None
|
|
||||||
for line in lines:
|
|
||||||
stripped = line.strip()
|
|
||||||
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
|
||||||
if callout_match:
|
|
||||||
current_edge_type = callout_match.group(1).strip().lower()
|
|
||||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
|
||||||
for l in links:
|
|
||||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if current_edge_type and stripped.startswith('>'):
|
|
||||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
|
||||||
for l in links:
|
|
||||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
|
||||||
elif not stripped.startswith('>'):
|
|
||||||
current_edge_type = None
|
|
||||||
|
|
||||||
return found_edges
|
|
||||||
|
|
||||||
def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]:
|
|
||||||
"""
|
|
||||||
WP-15b: Implementiert Edge-Inheritance.
|
|
||||||
Kanten aus Überschriften werden an untergeordnete Chunks vererbt.
|
|
||||||
"""
|
|
||||||
section_inheritance: Dict[str, Set[str]] = {}
|
|
||||||
|
|
||||||
# 1. Sammeln aus den Heading-Blöcken
|
|
||||||
for b in blocks:
|
|
||||||
if b.kind == "heading":
|
|
||||||
edges = _parse_edges_robust(b.text)
|
|
||||||
if edges:
|
|
||||||
if b.section_path not in section_inheritance:
|
|
||||||
section_inheritance[b.section_path] = set()
|
|
||||||
section_inheritance[b.section_path].update(edges)
|
|
||||||
|
|
||||||
# 2. Injektion in den Candidate-Pool
|
|
||||||
for ch in chunks:
|
|
||||||
inherited = section_inheritance.get(ch.section_path, set())
|
|
||||||
for e_str in inherited:
|
|
||||||
kind, target = e_str.split(':', 1)
|
|
||||||
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"})
|
|
||||||
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
# ==========================================
|
|
||||||
# 5. ORCHESTRATION (WP-15b)
|
|
||||||
# ==========================================
|
|
||||||
|
|
||||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
|
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Hauptfunktion zur Chunk-Generierung.
|
Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten.
|
||||||
Baut den Candidate-Pool für die semantische Validierung auf.
|
Sichert die Kompatibilität zum bestehenden Ingestion-Prozess.
|
||||||
"""
|
"""
|
||||||
if config is None:
|
if config is None:
|
||||||
config = get_chunk_config(note_type)
|
config = get_chunk_config(note_type)
|
||||||
|
|
@ -395,51 +38,47 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
fm, body_text = extract_frontmatter_from_text(md_text)
|
fm, body_text = extract_frontmatter_from_text(md_text)
|
||||||
primary_strategy = config.get("strategy", "sliding_window")
|
primary_strategy = config.get("strategy", "sliding_window")
|
||||||
|
|
||||||
# 1. Parsing & Splitting
|
# 1. Parsing
|
||||||
blocks, doc_title = parse_blocks(md_text)
|
blocks, doc_title = parse_blocks(md_text)
|
||||||
|
|
||||||
|
# 2. Splitting via Thread-Offloading
|
||||||
if primary_strategy == "by_heading":
|
if primary_strategy == "by_heading":
|
||||||
chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
|
chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
|
||||||
else:
|
else:
|
||||||
chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)
|
chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)
|
||||||
|
|
||||||
if not chunks: return []
|
if not chunks: return []
|
||||||
|
|
||||||
# 2. WP-15b: Candidate Pool Vorbereitung
|
# 3. WP-15b: Candidate Pool Vorbereitung
|
||||||
|
|
||||||
# A. Edge Inheritance (Sektions-Propagation)
|
# A. Edge Inheritance (Sektions-Propagation)
|
||||||
chunks = _propagate_section_edges(chunks, blocks)
|
chunks = propagate_section_edges(chunks, blocks)
|
||||||
|
|
||||||
# B. Explicit Edges (Direkt im Chunk-Text enthalten)
|
# B. Explicit Edges (Direkt im Chunk-Text)
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
explicit = _parse_edges_robust(ch.text)
|
explicit = parse_edges_robust(ch.text)
|
||||||
for e_str in explicit:
|
for e_str in explicit:
|
||||||
kind, target = e_str.split(':', 1)
|
kind, target = e_str.split(':', 1)
|
||||||
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})
|
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})
|
||||||
|
|
||||||
# C. Global "Unassigned Pool" Detection (Safety Net)
|
# C. Global Pool Detection (Sektion 'Unzugeordnete Kanten')
|
||||||
# Sucht nach einer Sektion "Unzugeordnete Kanten" im Body
|
|
||||||
unassigned_pool = set()
|
|
||||||
pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
|
pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
|
||||||
if pool_match:
|
if pool_match:
|
||||||
unassigned_pool = _parse_edges_robust(pool_match.group(1))
|
unassigned = parse_edges_robust(pool_match.group(1))
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
for e_str in unassigned_pool:
|
for e_str in unassigned:
|
||||||
kind, target = e_str.split(':', 1)
|
kind, target = e_str.split(':', 1)
|
||||||
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
|
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
|
||||||
|
|
||||||
# D. De-Duplikation des Pools
|
# D. Eindeutigkeit sicherstellen
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
seen = set()
|
seen = set(); unique_pool = []
|
||||||
unique_pool = []
|
|
||||||
for cand in ch.candidate_pool:
|
for cand in ch.candidate_pool:
|
||||||
key = (cand["kind"], cand["to"])
|
key = (cand["kind"], cand["to"])
|
||||||
if key not in seen:
|
if key not in seen:
|
||||||
seen.add(key)
|
seen.add(key); unique_pool.append(cand)
|
||||||
unique_pool.append(cand)
|
|
||||||
ch.candidate_pool = unique_pool
|
ch.candidate_pool = unique_pool
|
||||||
|
|
||||||
# 3. Nachbarschafts-Verkettung (Struktur-Kanten)
|
# 4. Graph-Struktur (Nachbarschaft)
|
||||||
for i, ch in enumerate(chunks):
|
for i, ch in enumerate(chunks):
|
||||||
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
||||||
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
|
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
|
||||||
|
|
|
||||||
0
app/core/chunking/__init__.py
Normal file
0
app/core/chunking/__init__.py
Normal file
31
app/core/chunking/chunking_models.py
Normal file
31
app/core/chunking/chunking_models.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/chunking/chunking_models.py
|
||||||
|
DESCRIPTION: Datenklassen für das Chunking-System.
|
||||||
|
"""
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import List, Dict, Optional, Any
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RawBlock:
|
||||||
|
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing."""
|
||||||
|
kind: str
|
||||||
|
text: str
|
||||||
|
level: Optional[int]
|
||||||
|
section_path: str
|
||||||
|
section_title: Optional[str]
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Chunk:
|
||||||
|
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung."""
|
||||||
|
id: str
|
||||||
|
note_id: str
|
||||||
|
index: int
|
||||||
|
text: str
|
||||||
|
window: str
|
||||||
|
token_count: int
|
||||||
|
section_title: Optional[str]
|
||||||
|
section_path: str
|
||||||
|
neighbors_prev: Optional[str]
|
||||||
|
neighbors_next: Optional[str]
|
||||||
|
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
||||||
|
suggested_edges: Optional[List[str]] = None
|
||||||
74
app/core/chunking/chunking_parser.py
Normal file
74
app/core/chunking/chunking_parser.py
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/chunking/chunking_parser.py
|
||||||
|
DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from typing import List, Tuple, Set
|
||||||
|
from .chunking_models import RawBlock
|
||||||
|
from .chunking_utils import extract_frontmatter_from_text
|
||||||
|
|
||||||
|
_WS = re.compile(r'\s+')
|
||||||
|
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||||
|
|
||||||
|
def split_sentences(text: str) -> list[str]:
|
||||||
|
"""Teilt Text in Sätze auf."""
|
||||||
|
text = _WS.sub(' ', text.strip())
|
||||||
|
if not text: return []
|
||||||
|
return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
|
||||||
|
|
||||||
|
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
|
"""Zerlegt Text in logische Einheiten."""
|
||||||
|
blocks = []
|
||||||
|
h1_title = "Dokument"; section_path = "/"; current_h2 = None
|
||||||
|
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||||
|
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
||||||
|
if h1_match: h1_title = h1_match.group(1).strip()
|
||||||
|
lines = text_without_fm.split('\n')
|
||||||
|
buffer = []
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped.startswith('# '): continue
|
||||||
|
elif stripped.startswith('## '):
|
||||||
|
if buffer:
|
||||||
|
content = "\n".join(buffer).strip()
|
||||||
|
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||||
|
buffer = []
|
||||||
|
current_h2 = stripped[3:].strip()
|
||||||
|
section_path = f"/{current_h2}"
|
||||||
|
blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
|
||||||
|
elif not stripped:
|
||||||
|
if buffer:
|
||||||
|
content = "\n".join(buffer).strip()
|
||||||
|
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||||
|
buffer = []
|
||||||
|
else: buffer.append(line)
|
||||||
|
if buffer:
|
||||||
|
content = "\n".join(buffer).strip()
|
||||||
|
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||||
|
return blocks, h1_title
|
||||||
|
|
||||||
|
def parse_edges_robust(text: str) -> Set[str]:
|
||||||
|
"""Extrahiert Kanten-Kandidaten (Wikilinks, Callouts)."""
|
||||||
|
found_edges = set()
|
||||||
|
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
||||||
|
for kind, target in inlines:
|
||||||
|
k = kind.strip().lower()
|
||||||
|
t = target.strip()
|
||||||
|
if k and t: found_edges.add(f"{k}:{t}")
|
||||||
|
lines = text.split('\n')
|
||||||
|
current_edge_type = None
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
||||||
|
if callout_match:
|
||||||
|
current_edge_type = callout_match.group(1).strip().lower()
|
||||||
|
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||||
|
for l in links:
|
||||||
|
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||||
|
continue
|
||||||
|
if current_edge_type and stripped.startswith('>'):
|
||||||
|
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||||
|
for l in links:
|
||||||
|
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||||
|
elif not stripped.startswith('>'): current_edge_type = None
|
||||||
|
return found_edges
|
||||||
25
app/core/chunking/chunking_propagation.py
Normal file
25
app/core/chunking/chunking_propagation.py
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/chunking/chunking_propagation.py
|
||||||
|
DESCRIPTION: Vererbung von Kanten (Inheritance) über Sektions-Pfade.
|
||||||
|
"""
|
||||||
|
from typing import List, Dict, Set
|
||||||
|
from .chunking_models import Chunk, RawBlock
|
||||||
|
from .chunking_parser import parse_edges_robust
|
||||||
|
|
||||||
|
def propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]:
|
||||||
|
"""WP-15b: Kanten aus Headings werden an Sub-Chunks vererbt."""
|
||||||
|
section_inheritance: Dict[str, Set[str]] = {}
|
||||||
|
for b in blocks:
|
||||||
|
if b.kind == "heading":
|
||||||
|
edges = parse_edges_robust(b.text)
|
||||||
|
if edges:
|
||||||
|
if b.section_path not in section_inheritance:
|
||||||
|
section_inheritance[b.section_path] = set()
|
||||||
|
section_inheritance[b.section_path].update(edges)
|
||||||
|
|
||||||
|
for ch in chunks:
|
||||||
|
inherited = section_inheritance.get(ch.section_path, set())
|
||||||
|
for e_str in inherited:
|
||||||
|
kind, target = e_str.split(':', 1)
|
||||||
|
ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"})
|
||||||
|
return chunks
|
||||||
74
app/core/chunking/chunking_strategies.py
Normal file
74
app/core/chunking/chunking_strategies.py
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/chunking/chunking_strategies.py
|
||||||
|
DESCRIPTION: Implementierung der mathematischen Splitting-Strategien.
|
||||||
|
"""
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from .chunking_models import RawBlock, Chunk
|
||||||
|
from .chunking_utils import estimate_tokens
|
||||||
|
from .chunking_parser import split_sentences
|
||||||
|
|
||||||
|
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||||
|
"""Fasst Blöcke zusammen und schneidet bei 'target' Tokens."""
|
||||||
|
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||||
|
overlap_val = config.get("overlap", (50, 80))
|
||||||
|
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
|
||||||
|
chunks = []; buf = []
|
||||||
|
|
||||||
|
def _add(txt, sec, path):
|
||||||
|
idx = len(chunks); win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
||||||
|
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None))
|
||||||
|
|
||||||
|
def flush():
|
||||||
|
nonlocal buf
|
||||||
|
if not buf: return
|
||||||
|
text_body = "\n\n".join([b.text for b in buf])
|
||||||
|
sec_title = buf[-1].section_title; sec_path = buf[-1].section_path
|
||||||
|
if estimate_tokens(text_body) <= max_tokens: _add(text_body, sec_title, sec_path)
|
||||||
|
else:
|
||||||
|
sents = split_sentences(text_body); cur_sents = []; cur_len = 0
|
||||||
|
for s in sents:
|
||||||
|
slen = estimate_tokens(s)
|
||||||
|
if cur_len + slen > target and cur_sents:
|
||||||
|
_add(" ".join(cur_sents), sec_title, sec_path)
|
||||||
|
ov_s = []; ov_l = 0
|
||||||
|
for os in reversed(cur_sents):
|
||||||
|
if ov_l + estimate_tokens(os) < overlap: ov_s.insert(0, os); ov_l += estimate_tokens(os)
|
||||||
|
else: break
|
||||||
|
cur_sents = list(ov_s); cur_sents.append(s); cur_len = ov_l + slen
|
||||||
|
else: cur_sents.append(s); cur_len += slen
|
||||||
|
if cur_sents: _add(" ".join(cur_sents), sec_title, sec_path)
|
||||||
|
buf = []
|
||||||
|
|
||||||
|
for b in blocks:
|
||||||
|
if b.kind == "heading": continue
|
||||||
|
if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target: flush()
|
||||||
|
buf.append(b)
|
||||||
|
if estimate_tokens(b.text) >= target: flush()
|
||||||
|
flush()
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """Split text into chunks along Markdown headings.

    Headings above ``split_level`` always close the current chunk; headings at
    exactly ``split_level`` close it only in strict mode or once at least
    ``target`` tokens are buffered.  Bodies that would exceed ``max`` tokens
    are flushed early regardless of headings.
    """
    strict = config.get("strict_heading_split", False)
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    split_level = config.get("split_level", 2)

    chunks: List[Chunk] = []
    pending: List[str] = []
    pending_tokens = 0

    def _emit(title, path):
        # Materialize the buffered block texts as one chunk, then reset the buffer.
        nonlocal pending, pending_tokens
        if not pending:
            return
        txt = "\n\n".join(pending)
        win = f"# {doc_title}\n## {title}\n{txt}".strip() if title else txt
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt,
            window=win, token_count=estimate_tokens(txt), section_title=title,
            section_path=path, neighbors_prev=None, neighbors_next=None,
        ))
        pending = []
        pending_tokens = 0

    for block in blocks:
        if block.kind == "heading":
            is_higher = block.level < split_level
            is_split = block.level == split_level
            if is_higher or (is_split and (strict or pending_tokens >= target)):
                _emit(block.section_title, block.section_path)
            continue
        block_tokens = estimate_tokens(block.text)
        # Hard cap: never let a chunk grow past max_tokens when content is buffered.
        if pending and pending_tokens + block_tokens > max_tokens:
            _emit(block.section_title, block.section_path)
        pending.append(block.text)
        pending_tokens += block_tokens

    if pending:
        tail = blocks[-1] if blocks else None
        _emit(tail.section_title if tail else None, tail.section_path if tail else "/")
    return chunks
|
||||||
55
app/core/chunking/chunking_utils.py
Normal file
55
app/core/chunking/chunking_utils.py
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/chunking/chunking_utils.py
|
||||||
|
DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration.
|
||||||
|
"""
|
||||||
|
import math
|
||||||
|
import yaml
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
|
||||||
|
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}
|
||||||
|
|
||||||
|
_CONFIG_CACHE = None
|
||||||
|
|
||||||
|
def load_yaml_config() -> Dict[str, Any]:
    """Load the raw types.yaml content, memoized in ``_CONFIG_CACHE``.

    Returns an empty dict when the file is missing or unreadable; failures are
    not cached, so a later successful read is still possible.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is None:
        if not CONFIG_PATH.exists():
            return {}
        try:
            with open(CONFIG_PATH, "r", encoding="utf-8") as handle:
                _CONFIG_CACHE = yaml.safe_load(handle)
        except Exception:
            return {}
    return _CONFIG_CACHE
|
||||||
|
|
||||||
|
def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """Resolve the chunking strategy profile for a note type.

    Falls back to the global default profile name, then to DEFAULT_PROFILE if
    the named profile does not exist.  List-valued ``overlap`` entries are
    normalized to tuples.
    """
    full_config = load_yaml_config()
    type_entry = full_config.get("types", {}).get(note_type.lower(), {})
    default_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
    profile_name = type_entry.get("chunking_profile") or default_name

    profiles = full_config.get("chunking_profiles", {})
    resolved = dict(profiles.get(profile_name, DEFAULT_PROFILE))
    overlap = resolved.get("overlap")
    if isinstance(overlap, list):
        resolved["overlap"] = tuple(overlap)
    return resolved
|
||||||
|
|
||||||
|
def estimate_tokens(text: str) -> int:
    """Rough token-count estimate: one token per four characters, minimum one."""
    stripped_length = len(text.strip())
    # Integer ceiling division replaces math.ceil(len / 4).
    return max(1, (stripped_length + 3) // 4)
|
||||||
|
|
||||||
|
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Separate YAML frontmatter from a Markdown text.

    Returns ``(frontmatter_dict, body_text)``.  When there is no frontmatter,
    the original text is returned unchanged with an empty dict; when the
    frontmatter is malformed or not a mapping, an empty dict is returned and
    the body is still extracted.
    """
    import re
    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if not fm_match:
        return {}, md_text
    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
        if not isinstance(frontmatter, dict):
            frontmatter = {}
    except Exception:
        frontmatter = {}
    # FIX: slice at the match end instead of re-running the same regex via
    # re.sub — the original scanned the text twice for identical results.
    return frontmatter, md_text[fm_match.end():].strip()
|
||||||
|
|
@ -1,373 +1,15 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion.py
|
FILE: app/core/ingestion.py
|
||||||
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
|
DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher.
|
||||||
WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free).
|
WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen.
|
||||||
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
|
Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit.
|
||||||
WP-15b: Two-Pass Ingestion mit LocalBatchCache & Candidate-Validation.
|
VERSION: 2.13.0
|
||||||
Sichert, dass explizite Kanten direkt übernommen und nur Pool-Kanten validiert werden.
|
|
||||||
FIX: Deep Fallback Logic (v2.11.14) für JSON-Recovery.
|
|
||||||
Robust Lookup Fix: Adressiert Notizen im Cache via ID, Titel und Dateiname.
|
|
||||||
VERSION: 2.12.2
|
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker,
|
|
||||||
app.services.llm_service, app.services.edge_registry
|
|
||||||
"""
|
"""
|
||||||
import os
|
# Export der Hauptklasse für externe Module (z.B. scripts/import_markdown.py)
|
||||||
import json
|
from .ingestion.ingestion_processor import IngestionService
|
||||||
import re
|
|
||||||
import logging
|
|
||||||
import asyncio
|
|
||||||
import time
|
|
||||||
from typing import Dict, List, Optional, Tuple, Any
|
|
||||||
|
|
||||||
# Core Module Imports
|
# Export der Hilfsfunktionen für Abwärtskompatibilität
|
||||||
from app.core.parser import (
|
from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry
|
||||||
read_markdown,
|
|
||||||
pre_scan_markdown,
|
|
||||||
normalize_frontmatter,
|
|
||||||
validate_required_frontmatter,
|
|
||||||
extract_edges_with_context,
|
|
||||||
NoteContext
|
|
||||||
)
|
|
||||||
from app.core.note_payload import make_note_payload
|
|
||||||
from app.core.chunker import assemble_chunks, get_chunk_config
|
|
||||||
from app.core.chunk_payload import make_chunk_payloads
|
|
||||||
|
|
||||||
# Fallback für Edges
|
__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"]
|
||||||
try:
|
|
||||||
from app.core.derive_edges import build_edges_for_note
|
|
||||||
except ImportError:
|
|
||||||
def build_edges_for_note(*args, **kwargs): return []
|
|
||||||
|
|
||||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
|
||||||
from app.core.qdrant_points import (
|
|
||||||
points_for_chunks,
|
|
||||||
points_for_note,
|
|
||||||
points_for_edges,
|
|
||||||
upsert_batch,
|
|
||||||
)
|
|
||||||
|
|
||||||
from app.services.embeddings_client import EmbeddingsClient
|
|
||||||
from app.services.edge_registry import registry as edge_registry
|
|
||||||
from app.services.llm_service import LLMService
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# --- Global Helpers (Full Compatibility v2.11.14) ---
|
|
||||||
def extract_json_from_response(text: str) -> Any:
    """Extract JSON from an LLM response, stripping model control tokens.

    Removes ``<s>``/``</s>`` and ``[OUT]``/``[/OUT]`` markers plus Markdown
    code fences, then tries progressively looser parses: the whole payload,
    the first ``[...]`` span, and finally the first ``{...}`` span.  Returns
    ``[]`` when nothing parseable remains.
    """
    if not text or not isinstance(text, str):
        return []

    # 1. Strip Mistral/Llama control tokens and tags.
    clean = text.replace("<s>", "").replace("</s>", "")
    clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
    clean = clean.strip()

    # 2. Prefer the content of a fenced ```json block when present.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        # 3. Recovery: first '[' to last ']' (list payload).
        start = payload.find('[')
        end = payload.rfind(']') + 1
        if start != -1 and end > start:
            try:
                return json.loads(payload[start:end])
            except json.JSONDecodeError:
                # FIX: narrowed from a bare `except:` so that SystemExit /
                # KeyboardInterrupt are no longer swallowed here.
                pass

        # 4. Second recovery: first '{' to last '}' (object payload).
        start_obj = payload.find('{')
        end_obj = payload.rfind('}') + 1
        if start_obj != -1 and end_obj > start_obj:
            try:
                return json.loads(payload[start_obj:end_obj])
            except json.JSONDecodeError:
                pass

        return []
|
|
||||||
|
|
||||||
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load types.yaml, which drives type-specific ingestion (empty dict on failure)."""
    import yaml
    from app.config import get_settings

    registry_path = custom_path or get_settings().MINDNET_TYPES_FILE
    if not os.path.exists(registry_path):
        return {}
    try:
        with open(registry_path, "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle) or {}
    except Exception:
        return {}
|
|
||||||
|
|
||||||
# --- Service Class ---
|
|
||||||
class IngestionService:
|
|
||||||
def __init__(self, collection_prefix: str = None):
    """Wire up settings, Qdrant client, embeddings and LLM service.

    collection_prefix: overrides the configured COLLECTION_PREFIX when given.
    Collection/index creation failures are logged but not raised, so the
    service stays constructible against an unreachable database.
    """
    from app.config import get_settings
    self.settings = get_settings()

    self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
    self.cfg = QdrantConfig.from_env()
    self.cfg.prefix = self.prefix
    self.client = get_client(self.cfg)
    self.dim = self.settings.VECTOR_SIZE
    self.registry = load_type_registry()
    self.embedder = EmbeddingsClient()
    self.llm = LLMService()

    self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
    # WP-15b LocalBatchCache: per-batch note contexts, keyed by id/title/filename.
    self.batch_cache: Dict[str, NoteContext] = {}

    try:
        ensure_collections(self.client, self.prefix, self.dim)
        ensure_payload_indexes(self.client, self.prefix)
    except Exception as e:
        # Best-effort DB bootstrap; deliberately non-fatal.
        logger.warning(f"DB init warning: {e}")
|
|
||||||
|
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
    """
    WP-15b: Two-pass ingestion workflow.

    Pass 1: pre-scan builds the ephemeral context cache (batch_cache).
    Pass 2: processing performs the actual semantic validation.

    Returns one result dict per input path, in input order.
    """
    logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
    for path in file_paths:
        ctx = pre_scan_markdown(path)
        if ctx:
            # Index under multiple keys for a robust look-up (WP-15b fix):
            # note id, title, and filename can all appear as link targets.
            self.batch_cache[ctx.note_id] = ctx
            self.batch_cache[ctx.title] = ctx
            # Filename without extension as the third key.
            fname = os.path.splitext(os.path.basename(path))[0]
            self.batch_cache[fname] = ctx

    logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...")
    results = []
    for path in file_paths:
        res = await self.process_file(path, vault_root, apply=True)
        results.append(res)
    return results
|
|
||||||
|
|
||||||
async def _validate_candidate(self, chunk_text: str, edge: Dict) -> bool:
    """
    WP-15b: Semantically validate an edge candidate against its target.

    Uses the Pass-1 cache to give the LLM context about the target note.
    Returns True when the edge should be kept; errs on the side of keeping
    the link (missing context or LLM failure both return True).
    """
    # NOTE(review): assumes edge["to"] is always present; a missing "to"
    # would raise on the "#" check below — confirm upstream guarantees it.
    target_id = edge.get("to")
    target_ctx = self.batch_cache.get(target_id)

    # Fallback look-up for links with anchors (strip the anchor part).
    if not target_ctx and "#" in target_id:
        base_id = target_id.split("#")[0]
        target_ctx = self.batch_cache.get(base_id)

    # Safety fallback: if the target note is not in the current batch,
    # let the edge through as 'explicit' (hard-link integrity).
    if not target_ctx:
        logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
        return True

    provider = self.settings.MINDNET_LLM_PROVIDER
    template = self.llm.get_prompt("edge_validation", provider)

    try:
        logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...")
        prompt = template.format(
            chunk_text=chunk_text[:1500],
            target_title=target_ctx.title,
            target_summary=target_ctx.summary,
            edge_kind=edge.get("kind", "related_to")
        )

        response = await self.llm.generate_raw_response(prompt, priority="background")
        # Simple marker check: the validation prompt is expected to answer YES/NO.
        is_valid = "YES" in response.upper()

        if is_valid:
            logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
        else:
            logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")

        return is_valid
    except Exception as e:
        logger.warning(f"⚠️ Semantic validation error for {target_id}: {e}")
        return True  # Fallback: keep the link when in doubt
|
|
||||||
|
|
||||||
def _resolve_note_type(self, requested: Optional[str]) -> str:
|
|
||||||
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
|
|
||||||
types = self.registry.get("types", {})
|
|
||||||
if requested and requested in types: return requested
|
|
||||||
return "concept"
|
|
||||||
|
|
||||||
def _get_chunk_config_by_profile(self, profile_name: str, note_type: str) -> Dict[str, Any]:
|
|
||||||
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
|
||||||
profiles = self.registry.get("chunking_profiles", {})
|
|
||||||
if profile_name in profiles:
|
|
||||||
cfg = profiles[profile_name].copy()
|
|
||||||
if "overlap" in cfg and isinstance(cfg["overlap"], list):
|
|
||||||
cfg["overlap"] = tuple(cfg["overlap"])
|
|
||||||
return cfg
|
|
||||||
return get_chunk_config(note_type)
|
|
||||||
|
|
||||||
async def process_file(
    self, file_path: str, vault_root: str,
    force_replace: bool = False, apply: bool = False, purge_before: bool = False,
    note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical"
) -> Dict[str, Any]:
    """Transform one Markdown file into notes, chunks and edges in the graph.

    force_replace: skip change detection and always rewrite.
    apply: without this flag the method stops at a "dry-run" result.
    purge_before: delete existing chunks/edges of the note before upserting.
    note_scope_refs: forward note-level references into edge derivation.
    Returns a result dict with at least path/status/changed/error keys.
    """
    result = {"path": file_path, "status": "skipped", "changed": False, "error": None}

    # 1. Parse & Lifecycle Gate
    try:
        parsed = read_markdown(file_path)
        if not parsed: return {**result, "error": "Empty file"}
        fm = normalize_frontmatter(parsed.frontmatter)
        validate_required_frontmatter(fm)
    except Exception as e:
        return {**result, "error": f"Validation failed: {str(e)}"}

    # Lifecycle filter (WP-22): non-content statuses are never ingested.
    status = fm.get("status", "draft").lower().strip()
    if status in ["system", "template", "archive", "hidden"]:
        return {**result, "status": "skipped", "reason": f"lifecycle_{status}"}

    # 2. Config Resolution & Payload
    note_type = self._resolve_note_type(fm.get("type"))
    fm["type"] = note_type

    try:
        note_pl = make_note_payload(parsed, vault_root=vault_root, hash_normalize=hash_normalize, hash_source=hash_source, file_path=file_path)
        note_id = note_pl["note_id"]
    except Exception as e:
        return {**result, "error": f"Payload failed: {str(e)}"}

    # 3. Change Detection (v2.11.14 logic): compare the stored hash for the
    # active mode/source/normalization combination against the fresh one.
    old_payload = None if force_replace else self._fetch_note_payload(note_id)
    check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
    old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
    new_hash = note_pl.get("hashes", {}).get(check_key)

    # Also rewrite when chunks or edges are missing, to self-heal partial imports.
    chunks_missing, edges_missing = self._artifacts_missing(note_id)
    should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing

    if not should_write:
        return {**result, "status": "unchanged", "note_id": note_id}

    if not apply:
        return {**result, "status": "dry-run", "changed": True, "note_id": note_id}

    # 4. Processing (chunking, embedding, validated edges)
    try:
        body_text = getattr(parsed, "body", "") or ""
        edge_registry.ensure_latest()

        # Chunker resolution: frontmatter may name a profile explicitly.
        profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
        chunk_cfg = self._get_chunk_config_by_profile(profile, note_type)
        enable_smart_edges = chunk_cfg.get("enable_smart_edge_allocation", False)

        # WP-15b: the chunker now prepares the candidate pool (incl. inheritance).
        chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg)

        # WP-15b: LLM validation ONLY for candidates from the global_pool
        # (unassigned edges); 'explicit' and 'inherited' pass through directly.
        for ch_obj in chunks:
            filtered_pool = []
            for cand in getattr(ch_obj, "candidate_pool", []):
                if cand.get("provenance") == "global_pool" and enable_smart_edges:
                    if await self._validate_candidate(ch_obj.text, cand):
                        filtered_pool.append(cand)
                else:
                    filtered_pool.append(cand)
            ch_obj.candidate_pool = filtered_pool

        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)

        # Generate embeddings; prefer the context window over the raw chunk text.
        vecs = []
        if chunk_pls:
            texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]
            vecs = await self.embedder.embed_documents(texts)

        # Finalize edges via the derive_edges aggregator (WP-15b compatible);
        # uses the provenance ranking (v2.1.0).
        edges = build_edges_for_note(
            note_id,
            chunk_pls,
            note_level_references=note_pl.get("references", []),
            include_note_scope_refs=note_scope_refs
        )

        # Alias resolution & registry enforcement for every edge kind.
        context = {"file": file_path, "note_id": note_id}
        for e in edges:
            e["kind"] = edge_registry.resolve(
                edge_type=e.get("kind", "related_to"),
                provenance=e.get("provenance", "explicit"),
                context={**context, "line": e.get("line", "system")}
            )

    except Exception as e:
        logger.error(f"Processing failed for {file_path}: {e}", exc_info=True)
        return {**result, "error": f"Processing failed: {str(e)}"}

    # 5. DB Upsert
    try:
        if purge_before and old_payload: self._purge_artifacts(note_id)

        n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
        upsert_batch(self.client, n_name, n_pts)

        if chunk_pls and vecs:
            # v2.11.14 points-extraction logic: helper returns (name, points).
            c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1]
            upsert_batch(self.client, f"{self.prefix}_chunks", c_pts)

        if edges:
            # v2.11.14 points-extraction logic.
            e_pts = points_for_edges(self.prefix, edges)[1]
            upsert_batch(self.client, f"{self.prefix}_edges", e_pts)

        return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)}
    except Exception as e:
        return {**result, "error": f"DB Upsert failed: {e}"}
|
|
||||||
|
|
||||||
def _fetch_note_payload(self, note_id: str) -> Optional[dict]:
    """Fetch a note's stored metadata payload from Qdrant.

    Returns None when the note does not exist or Qdrant is unreachable.
    """
    from qdrant_client.http import models as rest
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts, _ = self.client.scroll(collection_name=f"{self.prefix}_notes", scroll_filter=f, limit=1, with_payload=True)
        return pts[0].payload if pts else None
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return None
|
|
||||||
|
|
||||||
def _artifacts_missing(self, note_id: str) -> Tuple[bool, bool]:
    """Actively check Qdrant for existing chunks and edges of a note.

    Returns (chunks_missing, edges_missing); on any lookup error both are
    reported missing so the caller re-ingests (self-healing behavior).
    """
    from qdrant_client.http import models as rest
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        c_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_chunks", scroll_filter=f, limit=1)
        e_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_edges", scroll_filter=f, limit=1)
        return (not bool(c_pts)), (not bool(e_pts))
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return True, True
|
|
||||||
|
|
||||||
def _purge_artifacts(self, note_id: str):
    """Delete orphaned chunks/edges of a note before a re-import (best effort)."""
    from qdrant_client.http import models as rest
    f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    for suffix in ["chunks", "edges"]:
        try:
            self.client.delete(collection_name=f"{self.prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f))
        except Exception:
            # FIX: narrowed from a bare `except: pass`; deletion stays best-effort,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
|
|
||||||
|
|
||||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
    """Write a markdown payload into the vault and force-ingest the new file."""
    destination = os.path.join(vault_root, folder)
    os.makedirs(destination, exist_ok=True)
    target_file = os.path.join(destination, filename)
    with open(target_file, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)
    # Brief pause so the file is settled on disk before it is re-read.
    await asyncio.sleep(0.1)
    return await self.process_file(file_path=target_file, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
|
||||||
0
app/core/ingestion/__init__.py
Normal file
0
app/core/ingestion/__init__.py
Normal file
31
app/core/ingestion/ingestion_db.py
Normal file
31
app/core/ingestion/ingestion_db.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/ingestion/ingestion_db.py
|
||||||
|
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
||||||
|
"""
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
|
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
    """Fetch a note's metadata payload from Qdrant via scroll.

    Returns None when the note does not exist or Qdrant is unreachable.
    """
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts, _ = client.scroll(collection_name=f"{prefix}_notes", scroll_filter=f, limit=1, with_payload=True)
        return pts[0].payload if pts else None
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return None
|
||||||
|
|
||||||
|
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
    """Actively check Qdrant for existing chunks and edges of a note.

    Returns (chunks_missing, edges_missing); on any lookup error both are
    reported missing so the caller re-ingests.
    """
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        c_pts, _ = client.scroll(collection_name=f"{prefix}_chunks", scroll_filter=f, limit=1)
        e_pts, _ = client.scroll(collection_name=f"{prefix}_edges", scroll_filter=f, limit=1)
        return (not bool(c_pts)), (not bool(e_pts))
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return True, True
|
||||||
|
|
||||||
|
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
    """Delete orphaned chunks/edges of a note before a re-import (best effort)."""
    f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    for suffix in ["chunks", "edges"]:
        try:
            client.delete(collection_name=f"{prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f))
        except Exception:
            # FIX: narrowed from a bare `except: pass`; deletion stays best-effort,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
|
||||||
152
app/core/ingestion/ingestion_processor.py
Normal file
152
app/core/ingestion/ingestion_processor.py
Normal file
|
|
@ -0,0 +1,152 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/ingestion/ingestion_processor.py
|
||||||
|
DESCRIPTION: Orchestriert den Ingestion-Prozess (Parsing -> Chunking -> Validierung -> DB).
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
|
||||||
|
from app.core.parser import (
|
||||||
|
read_markdown, pre_scan_markdown, normalize_frontmatter,
|
||||||
|
validate_required_frontmatter, NoteContext
|
||||||
|
)
|
||||||
|
from app.core.note_payload import make_note_payload
|
||||||
|
from app.core.chunker import assemble_chunks
|
||||||
|
from app.core.chunk_payload import make_chunk_payloads
|
||||||
|
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||||
|
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
||||||
|
|
||||||
|
from app.services.embeddings_client import EmbeddingsClient
|
||||||
|
from app.services.edge_registry import registry as edge_registry
|
||||||
|
from app.services.llm_service import LLMService
|
||||||
|
|
||||||
|
# Package-Interne Imports
|
||||||
|
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||||
|
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts
|
||||||
|
from .ingestion_validation import validate_edge_candidate
|
||||||
|
|
||||||
|
# Fallback für Edges
|
||||||
|
try:
|
||||||
|
from app.core.derive_edges import build_edges_for_note
|
||||||
|
except ImportError:
|
||||||
|
def build_edges_for_note(*args, **kwargs): return []
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class IngestionService:
|
||||||
|
def __init__(self, collection_prefix: str = None):
    """Wire up settings, Qdrant client, embeddings and LLM service.

    collection_prefix: overrides the configured COLLECTION_PREFIX when given.
    Collection/index creation failures are logged but not raised, so the
    service stays constructible against an unreachable database.
    """
    from app.config import get_settings
    self.settings = get_settings()
    self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
    self.cfg = QdrantConfig.from_env()
    self.cfg.prefix = self.prefix
    self.client = get_client(self.cfg)
    self.dim = self.settings.VECTOR_SIZE
    self.registry = load_type_registry()
    self.embedder = EmbeddingsClient()
    self.llm = LLMService()
    self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
    # WP-15b LocalBatchCache: per-batch note contexts, keyed by id/title/filename.
    self.batch_cache: Dict[str, NoteContext] = {}

    try:
        ensure_collections(self.client, self.prefix, self.dim)
        ensure_payload_indexes(self.client, self.prefix)
    except Exception as e: logger.warning(f"DB init warning: {e}")
|
||||||
|
|
||||||
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
    """WP-15b: Two-Pass Ingestion Workflow.

    Pass 1 pre-scans all files into the batch context cache; Pass 2 runs the
    full semantic processing per file.  Returns one result dict per path.
    """
    # FIX: import hoisted out of the per-file loop (was re-executed every iteration).
    import os

    logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
    for path in file_paths:
        ctx = pre_scan_markdown(path)
        if ctx:
            # Index under multiple keys (id, title, bare filename) so link
            # targets written in any of those forms resolve in Pass 2.
            self.batch_cache[ctx.note_id] = ctx
            self.batch_cache[ctx.title] = ctx
            fname = os.path.splitext(os.path.basename(path))[0]
            self.batch_cache[fname] = ctx

    logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...")
    return [await self.process_file(p, vault_root, apply=True) for p in file_paths]
|
||||||
|
|
||||||
|
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
    """Transform one Markdown file into notes, chunks and edges in the graph.

    Recognized kwargs (all optional, matching the legacy facade signature):
    apply, force_replace, purge_before, hash_source, hash_normalize,
    note_scope_refs.  Returns a result dict with path/status/changed/error.
    """
    apply = kwargs.get("apply", False)
    force_replace = kwargs.get("force_replace", False)
    purge_before = kwargs.get("purge_before", False)
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")
    # FIX: restore the note_scope_refs option the pre-refactor facade accepted;
    # it was silently dropped during modularization.
    note_scope_refs = kwargs.get("note_scope_refs", False)

    result = {"path": file_path, "status": "skipped", "changed": False, "error": None}

    # 1. Parse & Lifecycle
    try:
        parsed = read_markdown(file_path)
        if not parsed:
            return {**result, "error": "Empty file"}
        fm = normalize_frontmatter(parsed.frontmatter)
        validate_required_frontmatter(fm)
    except Exception as e:
        return {**result, "error": f"Validation failed: {str(e)}"}

    # Lifecycle filter: non-content statuses are never ingested.
    if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]:
        return {**result, "status": "skipped", "reason": "lifecycle_filter"}

    # 2. Payload & Change Detection
    note_type = resolve_note_type(self.registry, fm.get("type"))
    note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, hash_source=hash_source, hash_normalize=hash_normalize)
    note_id = note_pl["note_id"]

    old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
    check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
    old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
    new_hash = note_pl.get("hashes", {}).get(check_key)

    # Also rewrite when chunks or edges are missing (self-healing).
    c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
    if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss):
        return {**result, "status": "unchanged", "note_id": note_id}

    if not apply:
        return {**result, "status": "dry-run", "changed": True, "note_id": note_id}

    # 3. Processing
    try:
        body_text = getattr(parsed, "body", "") or ""
        edge_registry.ensure_latest()
        profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
        chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
        enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)

        chunks = await assemble_chunks(fm["id"], body_text, note_type, config=chunk_cfg)
        # WP-15b: only 'global_pool' candidates need LLM validation;
        # 'explicit' and 'inherited' edges pass through untouched.
        for ch in chunks:
            filtered = []
            for cand in getattr(ch, "candidate_pool", []):
                if cand.get("provenance") == "global_pool" and enable_smart:
                    if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
                        filtered.append(cand)
                else:
                    filtered.append(cand)
            ch.candidate_pool = filtered

        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
        # FIX: fall back to the plain chunk text when no context window exists
        # (the refactor dropped the `c.get("text")` fallback the old facade had,
        # which embedded empty strings for window-less chunks).
        vecs = await self.embedder.embed_documents([c.get("window") or c.get("text") or "" for c in chunk_pls]) if chunk_pls else []

        edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
        for e in edges:
            e["kind"] = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"), context={"file": file_path, "note_id": note_id})

        # 4. DB Upsert
        if purge_before and old_payload:
            purge_artifacts(self.client, self.prefix, note_id)
        n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
        upsert_batch(self.client, n_name, n_pts)
        if chunk_pls and vecs:
            upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1])
        if edges:
            upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, edges)[1])

        return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)}
    except Exception as e:
        logger.error(f"Processing failed: {e}", exc_info=True)
        return {**result, "error": str(e)}
|
||||||
|
|
||||||
|
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
    """Persist a markdown note into the vault and immediately ingest it.

    The file is written below ``vault_root/folder`` and then handed to
    :meth:`process_file` with force/purge semantics, so re-creating an
    existing note fully replaces its stored artifacts.
    """
    import os

    destination = os.path.join(vault_root, folder)
    os.makedirs(destination, exist_ok=True)

    note_path = os.path.join(destination, filename)
    with open(note_path, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)

    # Brief pause before re-reading the freshly written file —
    # presumably lets the filesystem/watchers settle; TODO confirm necessity.
    await asyncio.sleep(0.1)

    return await self.process_file(
        file_path=note_path,
        vault_root=vault_root,
        apply=True,
        force_replace=True,
        purge_before=True,
    )
|
||||||
69
app/core/ingestion/ingestion_utils.py
Normal file
69
app/core/ingestion/ingestion_utils.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/ingestion/ingestion_utils.py
|
||||||
|
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import yaml
|
||||||
|
from typing import Any, Optional, Dict
|
||||||
|
|
||||||
|
def extract_json_from_response(text: str) -> Any:
    """
    Extract JSON data from a raw LLM response and strip control tokens
    (v2.11.14 logic).

    Removes ``<s>``/``</s>`` and ``[OUT]``/``[/OUT]`` markers as well as
    Markdown code fences, then parses the remainder as JSON. If direct
    parsing fails, tries to recover first the outermost JSON list, then
    the outermost JSON object.

    Returns the parsed value, or ``[]`` when nothing parseable is found
    (including non-string or empty input).
    """
    if not text or not isinstance(text, str):
        return []

    clean = text.replace("<s>", "").replace("</s>", "")
    clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
    clean = clean.strip()

    # Prefer the content of a fenced ```json block when present.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        pass  # fall through to the recovery passes

    # Recovery pass 1: outermost JSON list.
    # Narrowed from a bare `except:` so real errors are not swallowed.
    start = payload.find('[')
    end = payload.rfind(']') + 1
    if start != -1 and end > start:
        try:
            return json.loads(payload[start:end])
        except json.JSONDecodeError:
            pass

    # Recovery pass 2: outermost JSON object.
    start_obj = payload.find('{')
    end_obj = payload.rfind('}') + 1
    if start_obj != -1 and end_obj > start_obj:
        try:
            return json.loads(payload[start_obj:end_obj])
        except json.JSONDecodeError:
            pass

    return []
|
||||||
|
|
||||||
|
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load types.yaml which drives the type-specific ingestion behaviour.

    Falls back to the configured ``MINDNET_TYPES_FILE`` when *custom_path*
    is not given. Returns ``{}`` when the file is missing or unreadable —
    deliberately best-effort, a broken registry must not break ingestion.
    """
    path = custom_path
    if not path:
        # Only touch app settings when no explicit path was supplied;
        # keeps the helper usable without a fully configured app context.
        from app.config import get_settings
        path = get_settings().MINDNET_TYPES_FILE

    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}
    except Exception:
        # Best-effort by design: swallow parse/IO errors, return empty registry.
        return {}
|
||||||
|
|
||||||
|
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """Return the effective note type; unknown or missing types fall back to 'concept'."""
    known_types = registry.get("types", {})
    return requested if requested and requested in known_types else "concept"
|
||||||
|
|
||||||
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """Fetch chunker parameters for a specific profile from the registry.

    Returns a copy of the profile dict (callers may mutate it freely).
    A YAML-list ``overlap`` is normalised to the tuple form the chunker
    expects. Unknown profiles fall back to the note-type default config.
    """
    profiles = registry.get("chunking_profiles", {})
    if profile_name in profiles:
        cfg = profiles[profile_name].copy()
        # YAML has no tuple literal; normalise [before, after] -> (before, after).
        if "overlap" in cfg and isinstance(cfg["overlap"], list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Import lazily so profile lookups work without loading the chunker module.
    from app.core.chunker import get_chunk_config
    return get_chunk_config(note_type)
|
||||||
53
app/core/ingestion/ingestion_validation.py
Normal file
53
app/core/ingestion/ingestion_validation.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
"""
|
||||||
|
FILE: app/core/ingestion/ingestion_validation.py
|
||||||
|
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any
|
||||||
|
from app.core.parser import NoteContext
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
async def validate_edge_candidate(
    chunk_text: str,
    edge: Dict,
    batch_cache: Dict[str, NoteContext],
    llm_service: Any,
    provider: str
) -> bool:
    """WP-15b: Semantically validate an edge candidate against the cached target.

    Fail-open by design: when the edge has no target, the target context is
    missing from the cache, or the LLM call errors out, the edge is kept
    rather than silently dropped.
    """
    target_id = edge.get("to")
    if not target_id:
        # Bug fix: a missing "to" previously crashed on `"#" in None` below.
        # Malformed candidate without a target — fail open, keep the link.
        return True

    target_ctx = batch_cache.get(target_id)

    # Robust lookup fix (v2.12.2): support anchor links like "note#section".
    if not target_ctx and "#" in target_id:
        base_id = target_id.split("#")[0]
        target_ctx = batch_cache.get(base_id)

    # Safety fallback (hard-link integrity): unknown targets stay linked.
    if not target_ctx:
        logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
        return True

    template = llm_service.get_prompt("edge_validation", provider)

    try:
        logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...")
        prompt = template.format(
            chunk_text=chunk_text[:1500],
            target_title=target_ctx.title,
            target_summary=target_ctx.summary,
            edge_kind=edge.get("kind", "related_to")
        )

        # Background priority: validation must not starve interactive requests.
        response = await llm_service.generate_raw_response(prompt, priority="background")
        is_valid = "YES" in response.upper()

        if is_valid:
            logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
        else:
            logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
        return is_valid
    except Exception as e:
        # Fail-open: validation errors must not sever existing links.
        logger.warning(f"⚠️ Validation error for {target_id}: {e}")
        return True
|
||||||
Loading…
Reference in New Issue
Block a user