Aktualisierung des Chunking-Parsers: Einführung der Funktion parse_edges_robust zur Extraktion von Kanten-Kandidaten aus Wikilinks und Callouts. Die Funktion split_sentences wurde an den Modulanfang verschoben und verwendet nun vorkompilierte reguläre Ausdrücke. Aktualisierung der Sektions-Chunking-Strategie auf Version 3.9.6 mit optimierter Handhabung leerer Überschriften und Carry-Over-Logik zur besseren Chunk-Erstellung.

2025-12-30 07:44:30 +01:00 · 2025-12-30 07:44:30 +01:00 · 06fc42ed37
commit 06fc42ed37
parent 3c5c567077
2 changed files with 80 additions and 37 deletions
--- a/app/core/chunking/chunking_parser.py
+++ b/app/core/chunking/chunking_parser.py
@ -1,13 +1,23 @@
 """
 FILE: app/core/chunking/chunking_parser.py
 DESCRIPTION: Zerlegt Markdown in atomare Blöcke. Hält H1-Überschriften im Stream
-             und gewährleistet die strukturelle Integrität von Callouts.
+             und extrahiert Kanten-Kandidaten (parse_edges_robust).
 """
 import re
 from typing import List, Tuple, Set
 from .chunking_models import RawBlock
 from .chunking_utils import extract_frontmatter_from_text

+_WS = re.compile(r'\s+')
+_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
+
+def split_sentences(text: str) -> list[str]:
+    """Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
+    text = _WS.sub(' ', text.strip())
+    if not text: return []
+    # Splittet nach Punkt, Ausrufezeichen oder Fragezeichen, wenn Leerraum und danach ein Großbuchstabe, eine Ziffer, „ oder ( folgt
+    return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
+
 def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6."""
    blocks = []
@ -16,17 +26,18 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    
    # H1 für Metadaten extrahieren
    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
-    if h1_match: h1_title = h1_match.group(1).strip()
+    if h1_match: 
+        h1_title = h1_match.group(1).strip()
    
    lines = text_without_fm.split('\n')
    buffer = []
    
    for line in lines:
        stripped = line.strip()
-        heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
        
+        # Heading-Erkennung (H1 bis H6)
+        heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
        if heading_match:
-            # Vorherigen Text-Block abschließen
            if buffer:
                content = "\n".join(buffer).strip()
                if content: 
@ -59,13 +70,37 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
            
    if buffer:
        content = "\n".join(buffer).strip()
-        if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
+        if content: 
+            blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
            
    return blocks, h1_title

-def split_sentences(text: str) -> list[str]:
-    """Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
-    text = re.sub(r'\s+', ' ', text.strip())
-    if not text: return []
-    # Splittet bei Satzzeichen, gefolgt von Leerzeichen und Großbuchstaben
-    return [s.strip() for s in re.split(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])', text) if s.strip()]
+def parse_edges_robust(text: str) -> Set[str]:
+    """Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts."""
+    found_edges = set()
+    # 1. Wikilinks [[rel:kind|target]]
+    inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
+    for kind, target in inlines:
+        k = kind.strip().lower()
+        t = target.strip()
+        if k and t: found_edges.add(f"{k}:{t}")
+    
+    # 2. Callout Edges > [!edge] kind
+    lines = text.split('\n')
+    current_edge_type = None
+    for line in lines:
+        stripped = line.strip()
+        callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
+        if callout_match:
+            current_edge_type = callout_match.group(1).strip().lower()
+            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
+            for l in links: 
+                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
+            continue
+        if current_edge_type and stripped.startswith('>'):
+            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
+            for l in links: 
+                if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
+        elif not stripped.startswith('>'): 
+            current_edge_type = None
+    return found_edges
--- a/app/core/chunking/chunking_strategies.py
+++ b/app/core/chunking/chunking_strategies.py
@ -1,7 +1,11 @@
 """
 FILE: app/core/chunking/chunking_strategies.py
-DESCRIPTION: Strategie für atomares Sektions-Chunking v3.9.5.
-             Implementiert das 'Pack-and-Carry-Over' Verfahren nach Nutzerwunsch.
+DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.6.
+             Implementiert das 'Pack-and-Carry-Over' Verfahren:
+             1. Packt ganze Abschnitte basierend auf Schätzung.
+             2. Kein physischer Overflow-Check während des Packens.
+             3. Smart-Zerlegung von Übergrößen mit Carry-Over in die Queue.
+             - Hard-Split-Logik für strict_heading_split integriert.
 """
 from typing import List, Dict, Any, Optional
 from .chunking_models import RawBlock, Chunk
@ -16,10 +20,7 @@ def _create_win(doc_title: str, sec_title: Optional[str], text: str) -> str:

 def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """
-    Universelle Sektions-Strategie:
-    - Smart-Edge=True: Packt Sektionen basierend auf Schätzung (Regel 1-3).
-    - Smart-Edge=False: Hard Split an Überschriften (außer leere Header).
-    - Strict=True erzwingt Hard Split Verhalten innerhalb der Smart-Logik.
+    Universelle Heading-Strategie mit Carry-Over-Logik.
    """
    smart_edge = config.get("enable_smart_edge_allocation", True)
    strict = config.get("strict_heading_split", False)
@ -46,66 +47,73 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
    for b in blocks:
        if b.kind == "heading" and b.level <= split_level:
            if curr_blocks:
-                sections.append({"text": "\n\n".join([x.text for x in curr_blocks]), 
-                                 "meta": curr_blocks[0], "is_empty": len(curr_blocks) == 1})
+                sections.append({
+                    "text": "\n\n".join([x.text for x in curr_blocks]), 
+                    "meta": curr_blocks[0],
+                    "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
+                })
            curr_blocks = [b]
-        else: curr_blocks.append(b)
+        else:
+            curr_sec_has_content = True
+            curr_blocks.append(b)
    if curr_blocks:
-        sections.append({"text": "\n\n".join([x.text for x in curr_blocks]), 
-                         "meta": curr_blocks[0], "is_empty": len(curr_blocks) == 1})
+        sections.append({
+            "text": "\n\n".join([x.text for x in curr_blocks]), 
+            "meta": curr_blocks[0],
+            "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
+        })

    # --- SCHRITT 2: Verarbeitung der Queue ---
    queue = list(sections)
    current_chunk_text = ""
    current_meta = {"title": None, "path": "/"}
-
-    # Hard-Split-Bedingung: Entweder Smart-Edge aus ODER Profil ist Strict
+    
+    # Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
    is_hard_split_mode = (not smart_edge) or (strict)

    while queue:
        item = queue.pop(0)
        item_text = item["text"]
        
-        # Initialisierung für neuen Chunk
        if not current_chunk_text:
            current_meta["title"] = item["meta"].section_title
            current_meta["path"] = item["meta"].section_path

-        # FALL A: Hard Split Modus (Regel: Trenne bei jeder Sektion <= Level)
+        # FALL A: HARD SPLIT MODUS
        if is_hard_split_mode:
-            # Regel: Leere Überschriften verbleiben am nächsten Chunk
+            # Leere Überschriften (H1 vor H2) werden mit dem nächsten Item verschmolzen
            if item.get("is_empty", False) and queue:
                current_chunk_text = (current_chunk_text + "\n\n" + item_text).strip()
-                continue # Nimm das nächste Item dazu
+                continue 
            
            combined = (current_chunk_text + "\n\n" + item_text).strip()
+            # Wenn durch das Verschmelzen das Limit gesprengt würde, flushen wir vorher
            if estimate_tokens(combined) > max_tokens and current_chunk_text:
-                # Falls es trotz Hard-Split zu groß wird, flashen wir erst den alten Teil
                _emit(current_chunk_text, current_meta["title"], current_meta["path"])
                current_chunk_text = item_text
            else:
                current_chunk_text = combined
            
-            # Im Hard Split flashen wir nach jeder Sektion, die nicht leer ist
+            # Im Hard-Split wird nach jeder nicht-leeren Sektion geflusht
            _emit(current_chunk_text, current_meta["title"], current_meta["path"])
            current_chunk_text = ""
            continue

-        # FALL B: Smart Mode (Regel 1-3)
+        # FALL B: SMART MODE (Regel 1-3)
        combined_text = (current_chunk_text + "\n\n" + item_text).strip() if current_chunk_text else item_text
        combined_est = estimate_tokens(combined_text)

        if combined_est <= max_tokens:
-            # Regel 1 & 2: Passt nach Schätzung -> Aufnehmen
+            # Regel 1 & 2: Passt rein -> Aufnehmen
            current_chunk_text = combined_text
        else:
-            # Regel 3: Passt nicht -> Entweder Puffer flashen oder Item zerlegen
            if current_chunk_text:
+                # Regel 2: Flushen an Sektionsgrenze, Item zurücklegen
                _emit(current_chunk_text, current_meta["title"], current_meta["path"])
                current_chunk_text = ""
-                queue.insert(0, item) # Item für neuen Chunk zurücklegen
+                queue.insert(0, item)
            else:
-                # Einzelne Sektion zu groß -> Smart Zerlegung
+                # Regel 3: Einzelne Sektion zu groß -> Smart Zerlegung
                sents = split_sentences(item_text)
                header_prefix = item["meta"].text if item["meta"].kind == "heading" else ""
                
@ -119,11 +127,11 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
                
                _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
                
-                # Carry-Over: Rest wird vorne in die Queue geschoben
                if sents:
                    remainder = " ".join(sents)
                    if header_prefix and not remainder.startswith(header_prefix):
                        remainder = header_prefix + "\n\n" + remainder
+                    # Carry-Over: Rest wird vorne in die Queue geschoben
                    queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})

    if current_chunk_text:
@ -132,7 +140,7 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
    return chunks

 def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
-    """Standard Sliding Window Strategie."""
+    """Basis-Sliding-Window für flache Texte."""
    target = config.get("target", 400); max_tokens = config.get("max", 600)
    chunks: List[Chunk] = []; buf: List[RawBlock] = []
    for b in blocks: