mindnet/app/core/chunker.py

"""
FILE: app/core/chunker.py
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings).
             WP-15b: Implementiert Edge-Inheritance und Candidate-Pool Vorbereitung.
             Zentralisiert die Kanten-Vorbereitung für die spätere binäre Validierung.
             Bietet volle Unterstützung für Hybrid-Chunking (Strict/Soft/Safety-Net).
VERSION: 3.2.0
STATUS: Active
DEPENDENCIES: re, math, yaml, pathlib, asyncio, logging
"""

from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple, Any, Set
import re
import math
import yaml
from pathlib import Path
import asyncio
import logging

# Services
# In WP-15b wird die KI-Validierung in die ingestion.py verlagert.
# Wir behalten den Import für Abwärtskompatibilität, falls Legacy-Skripte ihn benötigen.
try:
    from app.services.semantic_analyzer import get_semantic_analyzer
except ImportError:
    def get_semantic_analyzer(): return None

# Core Imports
try:
    from app.core.derive_edges import build_edges_for_note
except ImportError:
    # Fallback für Standalone-Betrieb oder Tests
    def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []

logger = logging.getLogger(__name__)

# ==========================================
# 1. HELPER & CONFIG
# ==========================================

BASE_DIR = Path(__file__).resolve().parent.parent.parent
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
# Fallback Default, falls types.yaml fehlt
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}
_CONFIG_CACHE = None

def _load_yaml_config() -> Dict[str, Any]:
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None: return _CONFIG_CACHE
    if not CONFIG_PATH.exists(): return {}
    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
            _CONFIG_CACHE = data
            return data
    except Exception: return {}

def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """
    Lädt die Chunking-Strategie basierend auf dem Note-Type aus types.yaml.
    Sichert die Kompatibilität zu WP-15 Profilen.
    """
    full_config = _load_yaml_config()
    profiles = full_config.get("chunking_profiles", {})
    type_def = full_config.get("types", {}).get(note_type.lower(), {})

    # Welches Profil nutzt dieser Typ? (z.B. 'sliding_smart_edges')
    profile_name = type_def.get("chunking_profile")

    if not profile_name:
        profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")

    config = profiles.get(profile_name, DEFAULT_PROFILE).copy()

    # Tupel-Konvertierung für Overlap (YAML liest oft Listen)
    if "overlap" in config and isinstance(config["overlap"], list):
        config["overlap"] = tuple(config["overlap"])

    return config

def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Trennt YAML-Frontmatter vom eigentlichen Text."""
    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if not fm_match: return {}, md_text
    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
        if not isinstance(frontmatter, dict): frontmatter = {}
    except yaml.YAMLError:
        frontmatter = {}
    text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL)
    return frontmatter, text_without_fm.strip()

# ==========================================
# 2. DATA CLASSES & TEXT TOOLS
# ==========================================

_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
_WS = re.compile(r'\s+')

def estimate_tokens(text: str) -> int:
    """Grobe Schätzung der Token-Anzahl (4 Zeichen pro Token)."""
    return max(1, math.ceil(len(text.strip()) / 4))

def split_sentences(text: str) -> list[str]:
    """Teilt Text in Sätze auf unter Berücksichtigung von Interpunktion."""
    text = _WS.sub(' ', text.strip())
    if not text: return []
    parts = _SENT_SPLIT.split(text)
    return [p.strip() for p in parts if p.strip()]

@dataclass
class RawBlock:
    kind: str
    text: str
    level: Optional[int]
    section_path: str
    section_title: Optional[str]

@dataclass
class Chunk:
    id: str
    note_id: str
    index: int
    text: str
    window: str
    token_count: int
    section_title: Optional[str]
    section_path: str
    neighbors_prev: Optional[str]
    neighbors_next: Optional[str]
    # WP-15b: Liste von Kandidaten für die semantische Validierung
    candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
    suggested_edges: Optional[List[str]] = None

# ==========================================
# 3. PARSING & STRATEGIES
# ==========================================

def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """
    Zerlegt Text in logische Blöcke (Absätze, Header).
    Wichtig für die Strategie 'by_heading' und die Edge-Inheritance.
    """
    blocks = []
    h1_title = "Dokument"
    section_path = "/"
    current_h2 = None

    fm, text_without_fm = extract_frontmatter_from_text(md_text)

    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
    if h1_match:
        h1_title = h1_match.group(1).strip()

    lines = text_without_fm.split('\n')
    buffer = []

    for line in lines:
        stripped = line.strip()
        if stripped.startswith('# '):
            continue
        elif stripped.startswith('## '):
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []
            current_h2 = stripped[3:].strip()
            section_path = f"/{current_h2}"
            blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
        elif not stripped:
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []
        else:
            buffer.append(line)

    if buffer:
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))

    return blocks, h1_title

def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
    """
    Standard-Strategie aus WP-15.
    Fasst Blöcke zusammen und schneidet bei 'target' Tokens.
    """
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    overlap_val = config.get("overlap", (50, 80))
    overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
    chunks = []
    buf = []

    def _create_chunk(txt, win, sec, path):
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
            candidate_pool=[]
        ))

    def flush_buffer():
        nonlocal buf
        if not buf: return

        text_body = "\n\n".join([b.text for b in buf])
        sec_title = buf[-1].section_title if buf else None
        sec_path = buf[-1].section_path if buf else "/"
        win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body

        if estimate_tokens(text_body) <= max_tokens:
            _create_chunk(text_body, win_body, sec_title, sec_path)
        else:
            sentences = split_sentences(text_body)
            current_chunk_sents = []
            current_len = 0

            for sent in sentences:
                sent_len = estimate_tokens(sent)
                if current_len + sent_len > target and current_chunk_sents:
                    c_txt = " ".join(current_chunk_sents)
                    c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
                    _create_chunk(c_txt, c_win, sec_title, sec_path)

                    overlap_sents = []
                    ov_len = 0
                    for s in reversed(current_chunk_sents):
                        if ov_len + estimate_tokens(s) < overlap:
                            overlap_sents.insert(0, s)
                            ov_len += estimate_tokens(s)
                        else: break

                    current_chunk_sents = list(overlap_sents)
                    current_chunk_sents.append(sent)
                    current_len = ov_len + sent_len
                else:
                    current_chunk_sents.append(sent)
                    current_len += sent_len

            if current_chunk_sents:
                c_txt = " ".join(current_chunk_sents)
                c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
                _create_chunk(c_txt, c_win, sec_title, sec_path)
        buf = []

    for b in blocks:
        if b.kind == "heading": continue
        current_buf_text = "\n\n".join([x.text for x in buf])
        if estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target:
            flush_buffer()
        buf.append(b)
        if estimate_tokens(b.text) >= target:
            flush_buffer()

    flush_buffer()
    return chunks

def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """
    Hybrid-Strategie v2.9 (Strict/Soft/Safety-Net).
    """
    strict = config.get("strict_heading_split", False)
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    split_level = config.get("split_level", 2)

    chunks = []
    current_buf = []
    current_tokens = 0

    def _flush(sec_title, sec_path):
        nonlocal current_buf, current_tokens
        if not current_buf: return
        txt = "\n\n".join(current_buf)
        win = f"# {doc_title}\n## {sec_title}\n{txt}".strip() if sec_title else txt
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec_title, section_path=sec_path,
            neighbors_prev=None, neighbors_next=None,
            candidate_pool=[]
        ))
        current_buf = []
        current_tokens = 0

    for b in blocks:
        if b.kind == "heading":
            # Hierarchie-Check: Split bei Überschriften oberhalb des Split-Levels
            if b.level < split_level:
                _flush(b.section_title, b.section_path)
            elif b.level == split_level:
                if strict or current_tokens >= target:
                    _flush(b.section_title, b.section_path)
            continue

        block_tokens = estimate_tokens(b.text)
        if current_tokens + block_tokens > max_tokens and current_buf:
            _flush(b.section_title, b.section_path)

        current_buf.append(b.text)
        current_tokens += block_tokens

    if current_buf:
        last = blocks[-1] if blocks else None
        _flush(last.section_title if last else None, last.section_path if last else "/")

    return chunks

# ==========================================
# 4. ROBUST EDGE PARSING & PROPAGATION
# ==========================================

def _parse_edges_robust(text: str) -> Set[str]:
    """
    Findet Kanten im Text (Wikilinks, Inlines, Callouts).
    Fix V3: Support für mehrzeilige Callouts.
    """
    found_edges = set()

    # A. Inline [[rel:type|target]]
    inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
    for kind, target in inlines:
        k = kind.strip().lower()
        t = target.strip()
        if k and t: found_edges.add(f"{k}:{t}")

    # B. Multiline Callouts Parsing (WP-15 Fix)
    lines = text.split('\n')
    current_edge_type = None
    for line in lines:
        stripped = line.strip()
        callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
        if callout_match:
            current_edge_type = callout_match.group(1).strip().lower()
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links:
                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
            continue

        if current_edge_type and stripped.startswith('>'):
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links:
                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
        elif not stripped.startswith('>'):
            current_edge_type = None

    return found_edges

def _propagate_section_edges(chunks: List[Chunk], blocks: List[RawBlock]) -> List[Chunk]:
    """
    WP-15b: Implementiert Edge-Inheritance.
    Kanten aus Überschriften werden an untergeordnete Chunks vererbt.
    """
    section_inheritance: Dict[str, Set[str]] = {}

    # 1. Sammeln aus den Heading-Blöcken
    for b in blocks:
        if b.kind == "heading":
            edges = _parse_edges_robust(b.text)
            if edges:
                if b.section_path not in section_inheritance:
                    section_inheritance[b.section_path] = set()
                section_inheritance[b.section_path].update(edges)

    # 2. Injektion in den Candidate-Pool
    for ch in chunks:
        inherited = section_inheritance.get(ch.section_path, set())
        for e_str in inherited:
            kind, target = e_str.split(':', 1)
            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "inherited"})

    return chunks

# ==========================================
# 5. ORCHESTRATION (WP-15b)
# ==========================================

async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """
    Hauptfunktion zur Chunk-Generierung.
    Baut den Candidate-Pool für die semantische Validierung auf.
    """
    if config is None:
        config = get_chunk_config(note_type)

    fm, body_text = extract_frontmatter_from_text(md_text)
    primary_strategy = config.get("strategy", "sliding_window")

    # 1. Parsing & Splitting
    blocks, doc_title = parse_blocks(md_text)

    if primary_strategy == "by_heading":
        chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)

    if not chunks: return []

    # 2. WP-15b: Candidate Pool Vorbereitung

    # A. Edge Inheritance (Sektions-Propagation)
    chunks = _propagate_section_edges(chunks, blocks)

    # B. Explicit Edges (Direkt im Chunk-Text enthalten)
    for ch in chunks:
        explicit = _parse_edges_robust(ch.text)
        for e_str in explicit:
            kind, target = e_str.split(':', 1)
            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})

    # C. Global "Unassigned Pool" Detection (Safety Net)
    # Sucht nach einer Sektion "Unzugeordnete Kanten" im Body
    unassigned_pool = set()
    pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
    if pool_match:
        unassigned_pool = _parse_edges_robust(pool_match.group(1))
        for ch in chunks:
            for e_str in unassigned_pool:
                kind, target = e_str.split(':', 1)
                ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})

    # D. De-Duplikation des Pools
    for ch in chunks:
        seen = set()
        unique_pool = []
        for cand in ch.candidate_pool:
            key = (cand["kind"], cand["to"])
            if key not in seen:
                seen.add(key)
                unique_pool.append(cand)
        ch.candidate_pool = unique_pool

    # 3. Nachbarschafts-Verkettung (Struktur-Kanten)
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
        ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None

    return chunks