Implement backward propagation of section_type in chunking parser

- Added a new function `_propagate_section_type_backwards` so that the section_type is assigned to all blocks within a heading section, even if the [!section] callout appears later in the text.
- Updated the `parse_blocks` function to call this new function, improving the accuracy of section-type assignments.
- Modified the chunking strategies to reflect the changes in section-type handling, simplifying the logic around section-type transitions.
- Expanded the unit tests to validate the backward propagation of section_type, covering the new functionality.
2026-01-25 18:21:17 +01:00 · 2026-01-25 18:21:17 +01:00 · e86e9f2313
commit e86e9f2313
parent 553a41df57
3 changed files with 147 additions and 48 deletions
--- a/app/core/chunking/chunking_parser.py
+++ b/app/core/chunking/chunking_parser.py
@ -273,9 +273,66 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
                section_type=current_section_type,
                block_id=current_block_id
            ))
+    
+    # WP-26 v1.3: Post-Processing - Section-Type rückwirkend setzen
+    # Der [!section] Callout kann IRGENDWO im Abschnitt stehen und gilt rückwirkend
+    # für die gesamte Heading-Sektion (vom Heading bis zum nächsten Heading gleicher/höherer Ebene)
+    blocks = _propagate_section_type_backwards(blocks, split_level=2)
            
    return blocks, h1_title

+
+def _propagate_section_type_backwards(blocks: List[RawBlock], split_level: int = 2) -> List[RawBlock]:
+    """
+    WP-26 v1.3: Propagate section_type backwards within heading sections.
+    
+    The [!section] callout may appear anywhere in a section (not only right after the heading).
+    This function takes the first non-empty section_type found in a heading section and
+    assigns it to ALL blocks of that section (heading included). Mutates blocks in place.
+    
+    Args:
+        blocks: list of RawBlock objects
+        split_level: headings with level <= this value start a new section (default: 2)
+    
+    Returns:
+        The same list object, with section_type filled in wherever one was found
+    """
+    if not blocks:
+        return blocks
+    
+    # Group block indices by heading section; blocks before the first heading form their own group
+    sections: List[List[int]] = []  # list of index lists
+    current_section_indices: List[int] = []
+    
+    for idx, block in enumerate(blocks):
+        if block.kind == "heading" and block.level is not None and block.level <= split_level:
+            # A qualifying heading closes the running group and starts a new section
+            if current_section_indices:
+                sections.append(current_section_indices)
+            current_section_indices = [idx]
+        else:
+            current_section_indices.append(idx)
+    
+    # Append the final section
+    if current_section_indices:
+        sections.append(current_section_indices)
+    
+    # For each section: find its section_type and apply it retroactively
+    for section_indices in sections:
+        # Look for a section_type within this section
+        section_type_found = None
+        for idx in section_indices:
+            if blocks[idx].section_type:
+                section_type_found = blocks[idx].section_type
+                break  # first section_type found wins
+        
+        # If a section_type was found, assign it to every block of the section
+        if section_type_found:
+            for idx in section_indices:
+                blocks[idx].section_type = section_type_found
+    
+    return blocks
+
 def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
    """
    Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
--- a/app/core/chunking/chunking_strategies.py
+++ b/app/core/chunking/chunking_strategies.py
@ -8,7 +8,9 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
             WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
             WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
             WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER einen neuen Chunk (unabhängig vom Profil).
-                         Gleiche section_types folgen dem normalen Chunking-Verhalten.
+             WP-26 v1.3: Parser propagiert section_type rückwirkend für Heading-Sektionen.
+                         Der [!section] Callout kann irgendwo im Abschnitt stehen.
+                         Alle Blöcke einer Heading-Sektion haben den korrekten section_type.
 """
 from typing import List, Dict, Any, Optional
 from .chunking_models import RawBlock, Chunk
@ -54,26 +56,20 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
        ))

    # --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
-    # WP-26 v1.0: Erweitert um section_type und block_id Tracking
-    # WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER eine neue Sektion (unabhängig vom Profil)
+    # WP-26 v1.3: Der Parser propagiert section_type bereits rückwirkend für Heading-Sektionen.
+    # Alle Blöcke einer Heading-Sektion (inkl. Heading selbst) haben bereits den korrekten section_type.
    sections: List[Dict[str, Any]] = []
    curr_blocks = []
-    current_section_type = None  # WP-26 v1.1: Tracking des aktuellen section_type
    
    def _flush_section():
        """Hilfsfunktion zum Abschließen einer Sektion."""
-        nonlocal curr_blocks, current_section_type
+        nonlocal curr_blocks
        if not curr_blocks:
            return
-        # WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
-        # Priorisiere den ersten Block mit section_type, sonst den Heading-Block
-        effective_section_type = None
-        effective_block_id = None
-        for cb in curr_blocks:
-            if cb.section_type and effective_section_type is None:
-                effective_section_type = cb.section_type
-            if cb.block_id and effective_block_id is None:
-                effective_block_id = cb.block_id
+        # WP-26 v1.3: section_type wird vom Parser bereits korrekt gesetzt (rückwirkend)
+        # Alle Blöcke einer Heading-Sektion haben denselben section_type
+        effective_section_type = next((cb.section_type for cb in curr_blocks if cb.section_type), None)
+        effective_block_id = next((cb.block_id for cb in curr_blocks if cb.block_id), None)
        
        sections.append({
            "text": "\n\n".join([x.text for x in curr_blocks]), 
@ -83,39 +79,15 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
            "block_id": effective_block_id
        })
        curr_blocks = []
-        current_section_type = effective_section_type
    
    for b in blocks:
-        # WP-26 v1.1: Prüfe auf Section-Type-Wechsel BEVOR wir den Block hinzufügen
-        # Wenn sich der section_type ändert, muss die aktuelle Sektion abgeschlossen werden
-        block_section_type = b.section_type
-        
-        # Section-Type-Wechsel erkannt?
-        # (Wechsel ist: alter Typ != neuer Typ UND mindestens einer ist nicht None)
-        is_section_type_change = (
-            curr_blocks and  # Es gibt bereits Blöcke
-            block_section_type is not None and  # Neuer Block hat expliziten section_type
-            current_section_type != block_section_type  # Typ hat sich geändert
-        )
-        
        if b.kind == "heading" and b.level <= split_level:
-            # Heading-basierter Split (Standard-Verhalten)
+            # Heading-basierter Split
            _flush_section()
            curr_blocks = [b]
-            current_section_type = block_section_type  # Update tracking
-        elif is_section_type_change:
-            # WP-26 v1.1: Section-Type-Wechsel erzwingt neuen Chunk
-            _flush_section()
-            curr_blocks = [b]
-            current_section_type = block_section_type
        else:
-            # Normales Hinzufügen zum aktuellen Block
            curr_blocks.append(b)
-            # Update section_type wenn Block einen hat und wir noch keinen haben
-            if block_section_type and not current_section_type:
-                current_section_type = block_section_type
    
-    # Letzte Sektion abschließen
    _flush_section()

    # --- SCHRITT 2: Verarbeitung der Queue ---
@ -269,11 +241,11 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
    """
    Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
    WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
-    WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER einen neuen Chunk.
+    WP-26 v1.3: Parser propagiert section_type rückwirkend - vereinfachte Logik.
    """
    target = config.get("target", 400); max_tokens = config.get("max", 600)
    chunks: List[Chunk] = []; buf: List[RawBlock] = []
-    current_section_type = None  # WP-26 v1.1: Tracking des aktuellen section_type
+    current_section_type = None  # Tracking des aktuellen section_type
    
    def _flush_buffer():
        """Hilfsfunktion zum Flushen des Buffers."""
@ -282,7 +254,7 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
            return
        txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
        win = _create_win(context_prefix, buf[0].section_title, txt)
-        # WP-26 v1.0: Finde effektiven section_type und block_id
+        # WP-26 v1.3: section_type wird bereits vom Parser rückwirkend propagiert
        effective_section_type = next((b.section_type for b in buf if b.section_type), None)
        effective_block_id = next((b.block_id for b in buf if b.block_id), None)
        chunks.append(Chunk(
@ -300,10 +272,11 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
        curr_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
        block_section_type = b.section_type
        
-        # WP-26 v1.1: Prüfe auf Section-Type-Wechsel
+        # WP-26 v1.3: Prüfe auf Section-Type-Wechsel
+        # Da der Parser section_type rückwirkend setzt, haben alle Blöcke einer
+        # Heading-Sektion denselben section_type. Ein Wechsel bedeutet neue Sektion.
        is_section_type_change = (
            buf and  # Es gibt bereits Blöcke im Buffer
-            block_section_type is not None and  # Neuer Block hat expliziten section_type
            current_section_type != block_section_type  # Typ hat sich geändert
        )
        
@ -312,8 +285,8 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
            _flush_buffer()
        
        buf.append(b)
-        # Update section_type wenn Block einen hat und wir noch keinen haben
-        if block_section_type and not current_section_type:
+        # Update section_type
+        if block_section_type:
            current_section_type = block_section_type
    
    # Letzten Buffer flushen
--- a/tests/test_wp26_section_types.py
+++ b/tests/test_wp26_section_types.py
@ -2,7 +2,8 @@
 FILE: tests/test_wp26_section_types.py
 DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
             WP-26 v1.1: Erweitert um Tests für Section-Split und automatische Edges
-VERSION: 1.1.0
+             WP-26 v1.3: Erweitert um Tests für rückwirkende section_type Propagation
+VERSION: 1.3.0
 """
 import pytest
 from app.core.chunking.chunking_parser import parse_blocks
@ -50,6 +51,36 @@ Die Geschichte beginnt hier.
        
        # Block-ID sollte "sit" sein
        assert heading_blocks[0].block_id == "sit"
+    
+    def test_section_type_propagated_backwards_to_heading(self):
+        """WP-26 v1.3: section_type wird rückwirkend auf das Heading propagiert."""
+        md = """
+## Lektion ^learning
+
+Einleitender Text ohne section callout.
+
+Noch mehr Text hier...
+
+> [!section] insight
+
+Und dann kommt der eigentliche Insight-Content.
+"""
+        blocks, _ = parse_blocks(md)
+        
+        # Finde den Heading-Block
+        heading_blocks = [b for b in blocks if b.kind == "heading"]
+        assert len(heading_blocks) >= 1
+        
+        # WP-26 v1.3: Das Heading sollte section_type "insight" haben
+        # (rückwirkend propagiert, obwohl [!section] später im Abschnitt steht)
+        assert heading_blocks[0].section_type == "insight", \
+            f"Heading sollte section_type 'insight' haben, hat aber: {heading_blocks[0].section_type}"
+        
+        # Alle Paragraphen in dieser Sektion sollten auch section_type "insight" haben
+        paragraphs = [b for b in blocks if b.kind == "paragraph"]
+        for p in paragraphs:
+            assert p.section_type == "insight", \
+                f"Paragraph sollte section_type 'insight' haben: {p.text[:50]}"


 class TestSectionTypeScope:
@ -475,8 +506,10 @@ Meine positive Einstellung hat mir geholfen.
        # Mindestens ein Chunk mit section_type "insight"
        assert len(insight_chunks) >= 1
        
-        # Der insight-Chunk sollte den Lektions-Inhalt enthalten
+        # WP-26 v1.2: Der insight-Chunk MUSS die Überschrift "💡 Lektion" enthalten!
+        # (Nicht nur den Inhalt nach dem [!section] Callout)
        insight_text = insight_chunks[0].text
+        assert "Lektion" in insight_text, f"Überschrift '💡 Lektion' fehlt im insight-Chunk: {insight_text[:100]}"
        assert "durchdenken" in insight_text.lower() or "positive" in insight_text.lower()
    
    def test_section_type_change_in_smart_mode_forces_split(self):
@ -516,6 +549,42 @@ Inhalt C mit section_type "insight".
        # Der letzte Chunk sollte section_type "insight" haben
        insight_chunks = [c for c in chunks if c.section_type == "insight"]
        assert len(insight_chunks) >= 1, "Kein Chunk mit section_type 'insight' gefunden"
+    
+    def test_heading_belongs_to_new_section_with_section_type(self):
+        """WP-26 v1.2: Heading gehört zur neuen Sektion wenn [!section] folgt"""
+        md = """
+## Section A
+
+Inhalt ohne section_type.
+
+## Section B mit Typ
+> [!section] insight
+
+Inhalt mit section_type.
+"""
+        blocks, _ = parse_blocks(md)
+        
+        config = {
+            "target": 2000,
+            "max": 4000,
+            "split_level": 2,
+            "strict_heading_split": False,
+            "enable_smart_edge_allocation": True
+        }
+        
+        chunks = strategy_by_heading(blocks, config, "test-note")
+        
+        # Es sollten 2 Chunks geben
+        assert len(chunks) == 2, f"Erwartet 2 Chunks, bekommen: {len(chunks)}"
+        
+        # Chunk 1: Section A (section_type = None)
+        assert chunks[0].section_type is None
+        assert "Section A" in chunks[0].text
+        assert "Section B" not in chunks[0].text  # Heading B darf NICHT in Chunk 1 sein!
+        
+        # Chunk 2: Section B (section_type = insight) - MUSS die Überschrift enthalten!
+        assert chunks[1].section_type == "insight"
+        assert "Section B mit Typ" in chunks[1].text, "Überschrift 'Section B mit Typ' muss im insight-Chunk sein!"


 class TestBlockIdParsing: