# mindnet/app/core/chunker.py
"""
FILE: app/core/chunker.py
DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
WP-14: Modularisierung abgeschlossen.
WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert.
Verwendet neue 'chunking_' Präfixe für Untermodule.
VERSION: 3.3.0
STATUS: Active
"""
import asyncio
import re
import logging
from typing import List, Dict, Optional
# Interne Package-Imports mit neuer Präfix-Konvention
from .chunking.chunking_models import Chunk, RawBlock
from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
from .chunking.chunking_parser import parse_blocks, parse_edges_robust
from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading
from .chunking.chunking_propagation import propagate_section_edges
logger = logging.getLogger(__name__)
# Legacy Support für SemanticAnalyzer (Optional für andere Skripte)
# Legacy support for SemanticAnalyzer (optional, used by other scripts).
# When the service module is unavailable, expose a no-op factory instead.
try:
    from app.services.semantic_analyzer import get_semantic_analyzer
except ImportError:
    def get_semantic_analyzer():
        # Fallback: analyzer not installed — callers must handle None.
        return None
# Pre-compiled pattern for the global candidate-pool section
# ("Unzugeordnete Kanten" = unassigned edges). Hoisted so the regex is
# compiled once at import time, not per call.
_POOL_SECTION_RE = re.compile(
    r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)',
    re.DOTALL | re.IGNORECASE,
)


def _split_edge(e_str: str):
    """Split an edge string 'kind:target' into a (kind, target) tuple.

    Returns None for malformed entries without a ':' separator instead of
    letting tuple-unpacking of str.split raise ValueError.
    """
    kind, sep, target = e_str.partition(':')
    return (kind, target) if sep else None


async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """Generate chunks for a note by orchestrating the modular components.

    Keeps the interface compatible with the existing ingestion process.

    Args:
        note_id: Identifier of the note the chunks belong to.
        md_text: Full markdown text of the note (including frontmatter).
        note_type: Note type used to look up the chunking config.
        config: Optional explicit chunk config; defaults to the one derived
            from ``note_type``.

    Returns:
        List of ``Chunk`` objects with populated candidate pools and
        prev/next neighbour links; empty list when no chunks were produced.
    """
    if config is None:
        config = get_chunk_config(note_type)
    fm, body_text = extract_frontmatter_from_text(md_text)
    primary_strategy = config.get("strategy", "sliding_window")
    # 1. Parsing
    # NOTE(review): parse_blocks receives the full md_text (incl. frontmatter)
    # while the pool detection below scans body_text only — confirm this
    # asymmetry is intended.
    blocks, doc_title = parse_blocks(md_text)
    # 2. Splitting via thread offloading (strategies are synchronous/CPU-bound)
    if primary_strategy == "by_heading":
        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)
    if not chunks:
        return []
    # 3. WP-15b: candidate-pool preparation
    # A. Edge inheritance (section propagation)
    chunks = propagate_section_edges(chunks, blocks)
    # B. Explicit edges (written directly in the chunk text)
    for ch in chunks:
        for e_str in parse_edges_robust(ch.text):
            parts = _split_edge(e_str)
            if parts is None:
                # Skip instead of crashing on a malformed edge string.
                logger.warning("Skipping malformed explicit edge: %r", e_str)
                continue
            ch.candidate_pool.append({"kind": parts[0], "to": parts[1], "provenance": "explicit"})
    # C. Global pool detection (section 'Unzugeordnete Kanten')
    pool_match = _POOL_SECTION_RE.search(body_text)
    if pool_match:
        # Split once, outside the per-chunk loop (loop-invariant work).
        unassigned = []
        for e_str in parse_edges_robust(pool_match.group(1)):
            parts = _split_edge(e_str)
            if parts is None:
                logger.warning("Skipping malformed pool edge: %r", e_str)
                continue
            unassigned.append(parts)
        for ch in chunks:
            for kind, target in unassigned:
                ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
    # D. De-duplicate each pool on (kind, to), preserving first-seen order
    for ch in chunks:
        seen = set()
        unique_pool = []
        for cand in ch.candidate_pool:
            key = (cand["kind"], cand["to"])
            if key not in seen:
                seen.add(key)
                unique_pool.append(cand)
        ch.candidate_pool = unique_pool
    # 4. Graph structure (linear prev/next neighbourhood links)
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
        ch.neighbors_next = chunks[i + 1].id if i < len(chunks) - 1 else None
    return chunks