Enhance chunking system with WP-24c v4.2.6 and v4.2.7 updates: Introduce is_meta_content flag for callouts in RawBlock, ensuring they are chunked but later removed for clean context. Update parse_blocks and propagate_section_edges to handle callout edges with explicit provenance for chunk attribution. Implement clean-context logic to remove callout syntax post-processing, maintaining chunk integrity. Adjust get_chunk_config to prioritize frontmatter overrides for chunking profiles. Update documentation to reflect these changes.

2026-01-11 11:14:31 +01:00 · 2026-01-11 11:14:31 +01:00 · 55b64c331a
commit 55b64c331a
parent 4d43cc526e
8 changed files with 231 additions and 43 deletions
--- a/app/core/chunking/chunking_models.py
+++ b/app/core/chunking/chunking_models.py
@ -14,6 +14,7 @@ class RawBlock:
    section_path: str
    section_title: Optional[str]
    exclude_from_chunking: bool = False  # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
    is_meta_content: bool = False  # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
@dataclass
 class Chunk:
--- a/app/core/chunking/chunking_parser.py
+++ b/app/core/chunking/chunking_parser.py
@ -4,10 +4,11 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
             Hält alle Überschriftenebenen (H1-H6) im Stream.
             Stellt die Funktion parse_edges_robust zur Verfügung.
             WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
             WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
 """
 import re
 import os
-from typing import List, Tuple, Set
+from typing import List, Tuple, Set, Dict, Any
 from .chunking_models import RawBlock
 from .chunking_utils import extract_frontmatter_from_text
@ -25,6 +26,7 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """
    Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
    WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
    WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
    """
    blocks = []
    h1_title = "Dokument"
@ -67,9 +69,61 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    lines = text_without_fm.split('\n')
    buffer = []
-    for line in lines:
+    # WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
    # Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
    callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
    # WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
    processed_indices = set()
    for i, line in enumerate(lines):
        if i in processed_indices:
            continue
        stripped = line.strip()
        # WP-24c v4.2.5: Callout-Erkennung (VOR Heading-Erkennung)
        # Prüfe, ob diese Zeile ein Callout startet
        callout_match = callout_pattern.match(line)
        if callout_match:
            # Vorherigen Text-Block abschließen
            if buffer:
                content = "\n".join(buffer).strip()
                if content: 
                    blocks.append(RawBlock(
                        "paragraph", content, None, section_path, current_section_title,
                        exclude_from_chunking=in_exclusion_zone
                    ))
                buffer = []
            # Sammle alle Zeilen des Callout-Blocks
            callout_lines = [line]
            leading_gt_count = len(line) - len(line.lstrip('>'))
            processed_indices.add(i)
            # Sammle alle Zeilen, die zum Callout gehören (gleiche oder höhere Einrückung)
            j = i + 1
            while j < len(lines):
                next_line = lines[j]
                if not next_line.strip().startswith('>'):
                    break
                next_leading_gt = len(next_line) - len(next_line.lstrip('>'))
                if next_leading_gt < leading_gt_count:
                    break
                callout_lines.append(next_line)
                processed_indices.add(j)
                j += 1
            # WP-24c v4.2.6: Erstelle Callout-Block mit is_meta_content = True
            # Callouts werden gechunkt (für Chunk-Attribution), aber später entfernt (Clean-Context)
            callout_content = "\n".join(callout_lines)
            blocks.append(RawBlock(
                "callout", callout_content, None, section_path, current_section_title,
                exclude_from_chunking=in_exclusion_zone,  # Nur Edge-Zonen werden ausgeschlossen
                is_meta_content=True  # WP-24c v4.2.6: Markierung für spätere Entfernung
            ))
            continue
        # Heading-Erkennung (H1 bis H6)
        heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
        if heading_match:
@ -148,15 +202,22 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    return blocks, h1_title
-def parse_edges_robust(text: str) -> Set[str]:
+def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
-    """Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts."""
+    """
-    found_edges = set()
+    Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
    WP-24c v4.2.7: Gibt Liste von Dicts zurück mit is_callout Flag für Chunk-Attribution.
    Returns:
        List[Dict] mit keys: "edge" (str: "kind:target"), "is_callout" (bool)
    """
    found_edges: List[Dict[str, any]] = []
    # 1. Wikilinks [[rel:kind|target]]
    inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
    for kind, target in inlines:
        k = kind.strip().lower()
        t = target.strip()
-        if k and t: found_edges.add(f"{k}:{t}")
+        if k and t:
            found_edges.append({"edge": f"{k}:{t}", "is_callout": False})
    # 2. Callout Edges > [!edge] kind
    lines = text.split('\n')
@ -169,13 +230,15 @@ def parse_edges_robust(text: str) -> Set[str]:
            # Links in der gleichen Zeile des Callouts
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links: 
-                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
+                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
            continue
        # Links in Folgezeilen des Callouts
        if current_edge_type and stripped.startswith('>'):
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links: 
-                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
+                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
        elif not stripped.startswith('>'): 
            current_edge_type = None
    return found_edges
--- a/app/core/chunking/chunking_processor.py
+++ b/app/core/chunking/chunking_processor.py
@ -7,6 +7,16 @@ DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
             - Stellt H1-Kontext-Fenster sicher.
             - Baut den Candidate-Pool für die WP-15b Ingestion auf.
             WP-24c v4.2.0: Konfigurierbare Header-Namen für LLM-Validierung.
             WP-24c v4.2.5: Wiederherstellung der Chunking-Präzision
             - Frontmatter-Override für chunking_profile
             - Callout-Exclusion aus Chunks
             - Strict-Mode ohne Carry-Over
             WP-24c v4.2.6: Finale Härtung - "Semantic First, Clean Second"
             - Callouts werden gechunkt (Chunk-Attribution), aber später entfernt (Clean-Context)
             - remove_callouts_from_text erst nach propagate_section_edges und Candidate Pool
             WP-24c v4.2.7: Wiederherstellung der Chunk-Attribution
             - Callout-Kanten erhalten explicit:callout Provenance im candidate_pool
             - graph_derive_edges.py erkennt diese und verhindert Note-Scope Duplikate
 """
 import asyncio
 import re
@ -25,16 +35,19 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
    """
    Hauptfunktion zur Zerlegung einer Note. 
    Verbindet Strategien mit physikalischer Kontext-Anreicherung.
    WP-24c v4.2.5: Frontmatter-Override für chunking_profile wird berücksichtigt.
    """
-    # 1. Konfiguration & Parsing
+    # 1. WP-24c v4.2.5: Frontmatter VOR Konfiguration extrahieren (für Override)
    if config is None: 
        config = get_chunk_config(note_type)
    fm, body_text = extract_frontmatter_from_text(md_text)
    # 2. Konfiguration mit Frontmatter-Override
    if config is None: 
        config = get_chunk_config(note_type, frontmatter=fm)
    blocks, doc_title = parse_blocks(md_text)
-    # WP-24c v4.2.0: Filtere Blöcke aus Edge-Zonen (LLM-Validierung & Note-Scope)
+    # WP-24c v4.2.6: Filtere NUR Edge-Zonen (LLM-Validierung & Note-Scope)
-    # Diese Bereiche sollen nicht als Chunks angelegt werden, sondern nur die Kanten extrahiert werden
+    # Callouts (is_meta_content=True) müssen durch, damit Chunk-Attribution erhalten bleibt
    blocks_for_chunking = [b for b in blocks if not getattr(b, 'exclude_from_chunking', False)]
    # Vorbereitung des H1-Präfix für die Embedding-Fenster (Breadcrumbs)
@ -42,6 +55,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
    # 2. Anwendung der Splitting-Strategie
    # Alle Strategien nutzen nun einheitlich context_prefix für die Window-Bildung.
    # WP-24c v4.2.6: Callouts sind in blocks_for_chunking enthalten (für Chunk-Attribution)
    if config.get("strategy") == "by_heading":
        chunks = await asyncio.to_thread(
            strategy_by_heading, blocks_for_chunking, config, note_id, context_prefix=h1_prefix
@ -55,21 +69,27 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
        return []
    # 3. Physikalische Kontext-Anreicherung (Der Qualitäts-Fix)
    # WP-24c v4.2.6: Arbeite auf Original-Text inkl. Callouts (für korrekte Chunk-Attribution)
    # Schreibt Kanten aus Callouts/Inlines hart in den Text für Qdrant.
    chunks = propagate_section_edges(chunks)
-    # 4. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
+    # 5. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
    # WP-24c v4.2.7: Markiere Callout-Kanten explizit für Chunk-Attribution
    # Zuerst die explizit im Text vorhandenen Kanten sammeln.
    for ch in chunks:
        # Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
        # ch.candidate_pool wird im Modell-Konstruktor als leere Liste initialisiert.
-        for e_str in parse_edges_robust(ch.text):
+        for edge_info in parse_edges_robust(ch.text):
-            parts = e_str.split(':', 1)
+            edge_str = edge_info["edge"]
            is_callout = edge_info.get("is_callout", False)
            parts = edge_str.split(':', 1)
            if len(parts) == 2:
                k, t = parts
-                ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"})
+                # WP-24c v4.2.7: Callout-Kanten erhalten explicit:callout Provenance
                provenance = "explicit:callout" if is_callout else "explicit"
                ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance})
-    # 5. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
+    # 6. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
    # WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebene via .env
    # Sucht nach ALLEN Edge-Pool Blöcken im Original-Markdown (nicht nur am Ende).
    llm_validation_headers = os.getenv(
@ -93,15 +113,16 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
    for pool_match in re.finditer(zone_pattern, body_text, re.DOTALL | re.IGNORECASE | re.MULTILINE):
        global_edges = parse_edges_robust(pool_match.group(1))
-        for e_str in global_edges:
+        for edge_info in global_edges:
-            parts = e_str.split(':', 1)
+            edge_str = edge_info["edge"]
            parts = edge_str.split(':', 1)
            if len(parts) == 2:
                k, t = parts
                # Diese Kanten werden als "global_pool" markiert für die spätere KI-Prüfung.
                for ch in chunks: 
                    ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})
-    # 6. De-Duplikation des Pools & Linking
+    # 7. De-Duplikation des Pools & Linking
    for ch in chunks:
        seen = set()
        unique = []
@ -113,6 +134,54 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
                unique.append(c)
        ch.candidate_pool = unique
    # 8. WP-24c v4.2.6: Clean-Context - Entferne Callout-Syntax aus Chunk-Text
    # WICHTIG: Dies geschieht NACH propagate_section_edges und Candidate Pool Aufbau,
    # damit Chunk-Attribution erhalten bleibt und Kanten korrekt extrahiert werden.
    # Hinweis: Callouts können mehrzeilig sein (auch verschachtelt: >>)
    def remove_callouts_from_text(text: str) -> str:
        """Strip ``[!edge]`` / ``[!abstract]`` callout blocks from *text*.

        A callout block starts at a line matching ``> [!edge]`` or
        ``> [!abstract]`` (case-insensitive, any number of ``>``, optional
        leading whitespace). It extends over every directly following line
        whose blockquote depth is at least the depth of the start line.

        Args:
            text: Chunk text or window that may contain callout blocks.

        Returns:
            The text without callout blocks, with runs of blank lines
            collapsed to a single blank line. Falsy input (``""`` or
            ``None``) is returned unchanged.
        """
        if not text:
            return text

        callout_start_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)

        def quote_depth(line: str) -> int:
            """Return the number of leading '>' characters, ignoring indentation."""
            # The depth must be measured AFTER removing leading whitespace:
            # the start pattern accepts indented callouts, so counting on the
            # raw line would report depth 0 for any indented line.
            unindented = line.lstrip()
            return len(unindented) - len(unindented.lstrip('>'))

        lines = text.split('\n')
        cleaned_lines = []
        i = 0
        while i < len(lines):
            line = lines[i]
            if not callout_start_pattern.match(line):
                # Regular line: keep it.
                cleaned_lines.append(line)
                i += 1
                continue

            # Callout start found: skip it and every continuation line.
            # A start line always has depth >= 1, so a non-quote line
            # (depth 0) or a shallower quote ends the block.
            callout_depth = quote_depth(line)
            i += 1
            while i < len(lines) and quote_depth(lines[i]) >= callout_depth:
                i += 1

        # Collapse three or more consecutive line breaks (optionally separated
        # by whitespace-only lines) into exactly one blank line.
        result = '\n'.join(cleaned_lines)
        result = re.sub(r'\n\s*\n\s*\n+', '\n\n', result)
        return result
    for ch in chunks:
        ch.text = remove_callouts_from_text(ch.text)
        if ch.window:
            ch.window = remove_callouts_from_text(ch.window)
    # Verknüpfung der Nachbarschaften für Graph-Traversierung
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
--- a/app/core/chunking/chunking_propagation.py
+++ b/app/core/chunking/chunking_propagation.py
@ -22,11 +22,13 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
            continue
        # Nutzt den robusten Parser aus dem Package
-        edges = parse_edges_robust(ch.text)
+        # WP-24c v4.2.7: parse_edges_robust gibt jetzt Liste von Dicts zurück
-        if edges:
+        edge_infos = parse_edges_robust(ch.text)
        if edge_infos:
            if ch.section_path not in section_map:
                section_map[ch.section_path] = set()
-            section_map[ch.section_path].update(edges)
+            for edge_info in edge_infos:
                section_map[ch.section_path].add(edge_info["edge"])
    # 2. Injizieren: Kanten in jeden Chunk der Sektion zurückschreiben (Broadcasting)
    for ch in chunks:
@ -37,7 +39,9 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
            # Vorhandene Kanten (Typ:Ziel) in DIESEM Chunk ermitteln, 
            # um Dopplungen (z.B. durch Callouts) zu vermeiden.
-            existing_edges = parse_edges_robust(ch.text)
+            # WP-24c v4.2.7: parse_edges_robust gibt jetzt Liste von Dicts zurück
            existing_edge_infos = parse_edges_robust(ch.text)
            existing_edges = {ei["edge"] for ei in existing_edge_infos}
            injections = []
            # Sortierung für deterministische Ergebnisse
--- a/app/core/chunking/chunking_strategies.py
+++ b/app/core/chunking/chunking_strategies.py
@ -5,6 +5,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
             - Keine redundante Kanten-Injektion.
             - Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
             - Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
             WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
 """
 from typing import List, Dict, Any, Optional
 from .chunking_models import RawBlock, Chunk
@ -83,23 +84,46 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
            current_meta["title"] = item["meta"].section_title
            current_meta["path"] = item["meta"].section_path
-        # FALL A: HARD SPLIT MODUS
+        # FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
        if is_hard_split_mode:
-            # Leere Überschriften (z.B. H1 direkt vor H2) verbleiben am nächsten Chunk
+            # WP-24c v4.2.5: Bei strict_heading_split: true wird nach JEDER Sektion geflasht
-            if item.get("is_empty", False) and queue:
+            # Kein Carry-Over erlaubt, auch nicht für leere Überschriften
-                current_chunk_text = (current_chunk_text + "\n\n" + item_text).strip()
+            if current_chunk_text:
-                continue 
+                # Flashe vorherigen Chunk
            combined = (current_chunk_text + "\n\n" + item_text).strip()
            # Wenn durch Verschmelzung das Limit gesprengt würde, vorher flashen
            if estimate_tokens(combined) > max_tokens and current_chunk_text:
                _emit(current_chunk_text, current_meta["title"], current_meta["path"])
-                current_chunk_text = item_text
+                current_chunk_text = ""
            # Neue Sektion: Initialisiere Meta
            current_meta["title"] = item["meta"].section_title
            current_meta["path"] = item["meta"].section_path
            # WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
            # (nur Überschrift, kein Inhalt)
            if item.get("is_empty", False):
                # Leere Sektion: Nur Überschrift als Chunk
                _emit(item_text, current_meta["title"], current_meta["path"])
            else:
-                current_chunk_text = combined
+                # Normale Sektion: Prüfe auf Token-Limit
                if estimate_tokens(item_text) > max_tokens:
                    # Sektion zu groß: Smart Zerlegung (aber trotzdem in separaten Chunks)
                    sents = split_sentences(item_text)
                    header_prefix = item["meta"].text if item["meta"].kind == "heading" else ""
                    take_sents = []; take_len = 0
                    while sents:
                        s = sents.pop(0); slen = estimate_tokens(s)
                        if take_len + slen > target and take_sents:
                            _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
                            take_sents = [s]; take_len = slen
                        else:
                            take_sents.append(s); take_len += slen
                    if take_sents:
                        _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
                else:
                    # Sektion passt: Direkt als Chunk
                    _emit(item_text, current_meta["title"], current_meta["path"])
            # Im Hard-Split wird nach jeder Sektion geflasht
            _emit(current_chunk_text, current_meta["title"], current_meta["path"])
            current_chunk_text = ""
            continue
--- a/app/core/chunking/chunking_utils.py
+++ b/app/core/chunking/chunking_utils.py
@ -27,12 +27,31 @@ def load_yaml_config() -> Dict[str, Any]:
            return data
    except Exception: return {}
-def get_chunk_config(note_type: str) -> Dict[str, Any]:
+def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-    """Lädt die Chunking-Strategie basierend auf dem Note-Type."""
+    """
    Lädt die Chunking-Strategie basierend auf dem Note-Type.
    WP-24c v4.2.5: Frontmatter-Override für chunking_profile hat höchste Priorität.
    Args:
        note_type: Der Typ der Note (z.B. "decision", "experience")
        frontmatter: Optionales Frontmatter-Dict mit chunking_profile Override
    Returns:
        Dict mit Chunking-Konfiguration
    """
    full_config = load_yaml_config()
    profiles = full_config.get("chunking_profiles", {})
    type_def = full_config.get("types", {}).get(note_type.lower(), {})
-    profile_name = type_def.get("chunking_profile") or full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
+    
    # WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults
    profile_name = None
    if frontmatter and "chunking_profile" in frontmatter:
        profile_name = frontmatter.get("chunking_profile")
    if not profile_name:
        profile_name = type_def.get("chunking_profile")
    if not profile_name:
        profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
    config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
    if "overlap" in config and isinstance(config["overlap"], list): 
        config["overlap"] = tuple(config["overlap"])
--- a/app/core/graph/graph_derive_edges.py
+++ b/app/core/graph/graph_derive_edges.py
@ -209,6 +209,7 @@ def build_edges_for_note(
    """
    Erzeugt und aggregiert alle Kanten für eine Note.
    WP-24c v4.2.0: Unterstützt Note-Scope Extraktions-Zonen.
    WP-24c v4.2.7: Chunk-Attribution für Callouts über candidate_pool mit explicit:callout Provenance.
    Args:
        note_id: ID der Note
@ -313,11 +314,17 @@ def build_edges_for_note(
            edges.append(_edge(k, "chunk", cid, t, note_id, payload))
        # B. Candidate Pool (WP-15b Validierte KI-Kanten)
        # WP-24c v4.2.7: Sammle Callout-Keys für Chunk-Attribution
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
            raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
            t, sec = parse_link_target(raw_t, note_id)
            if t:
                # WP-24c v4.2.7: Wenn Provenance explicit:callout, füge zu all_chunk_callout_keys hinzu
                # Dadurch weiß die globale Extraktion, dass diese Kante bereits auf Chunk-Ebene versorgt ist
                if p == "explicit:callout":
                    all_chunk_callout_keys.add((k, t, sec))
                # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
                payload = {
                    "chunk_id": cid, 
--- a/app/core/graph/graph_utils.py
+++ b/app/core/graph/graph_utils.py
@ -24,6 +24,7 @@ PROVENANCE_PRIORITY = {
    "explicit:wikilink": 1.00,
    "inline:rel": 0.95,
    "callout:edge": 0.90,
    "explicit:callout": 0.90,  # WP-24c v4.2.7: Callout-Kanten aus candidate_pool
    "semantic_ai": 0.90,           # Validierte KI-Kanten
    "structure:belongs_to": 1.00,
    "structure:order": 0.95,       # next/prev