Implement backward propagation of section_type in chunking parser
- Added a new function `_propagate_section_type_backwards` to ensure that the section_type is correctly assigned to all blocks within a heading section, even if the [!section] callout appears later in the text.
- Updated the `parse_blocks` function to call this new function, improving the accuracy of section-type assignments.
- Modified the chunking strategies to reflect the changes in section-type handling, simplifying the logic related to section-type transitions.
- Expanded the unit tests to validate the backward propagation of section_type, covering the new functionality.
This commit is contained in:
parent
553a41df57
commit
e86e9f2313
|
|
@ -273,9 +273,66 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
section_type=current_section_type,
|
section_type=current_section_type,
|
||||||
block_id=current_block_id
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# WP-26 v1.3: Post-Processing - Section-Type rückwirkend setzen
|
||||||
|
# Der [!section] Callout kann IRGENDWO im Abschnitt stehen und gilt rückwirkend
|
||||||
|
# für die gesamte Heading-Sektion (vom Heading bis zum nächsten Heading gleicher/höherer Ebene)
|
||||||
|
blocks = _propagate_section_type_backwards(blocks, split_level=2)
|
||||||
|
|
||||||
return blocks, h1_title
|
return blocks, h1_title
|
||||||
|
|
||||||
|
|
||||||
|
def _propagate_section_type_backwards(blocks: List[RawBlock], split_level: int = 2) -> List[RawBlock]:
    """
    WP-26 v1.3: Propagate section_type backwards within heading sections.

    The [!section] callout may appear anywhere inside a section, not only
    directly after the heading. This function looks up the first truthy
    section_type among the blocks of a heading section and assigns it to
    ALL blocks of that section, including the heading block itself.

    A new section starts at every block whose kind is "heading" and whose
    level is set and less than or equal to split_level. Blocks that precede
    the first such heading form a section of their own.

    Args:
        blocks: List of RawBlock objects; modified in place.
        split_level: Heading level that starts a new section (default: 2 for H2).

    Returns:
        The same list object, with section_type corrected on its blocks.
        Sections in which no block carries a section_type are left unchanged.
    """
    if not blocks:
        return blocks

    # Index 0 always opens the first section, whether or not it is a heading;
    # every later splitting heading opens the next one.
    section_starts = [0]
    for idx, block in enumerate(blocks):
        starts_section = (
            block.kind == "heading"
            and block.level is not None
            and block.level <= split_level
        )
        if starts_section and idx > 0:
            section_starts.append(idx)

    # Pair each start with the following start (or the end of the list).
    section_ends = section_starts[1:] + [len(blocks)]

    for start, end in zip(section_starts, section_ends):
        section = blocks[start:end]
        # The first section_type found wins for the whole section.
        section_type_found = next(
            (b.section_type for b in section if b.section_type), None
        )
        if section_type_found:
            for b in section:
                b.section_type = section_type_found

    return blocks
|
||||||
|
|
||||||
def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
|
def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
|
Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,9 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
||||||
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
||||||
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
|
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
|
||||||
WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER einen neuen Chunk (unabhängig vom Profil).
|
WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER einen neuen Chunk (unabhängig vom Profil).
|
||||||
Gleiche section_types folgen dem normalen Chunking-Verhalten.
|
WP-26 v1.3: Parser propagiert section_type rückwirkend für Heading-Sektionen.
|
||||||
|
Der [!section] Callout kann irgendwo im Abschnitt stehen.
|
||||||
|
Alle Blöcke einer Heading-Sektion haben den korrekten section_type.
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock, Chunk
|
from .chunking_models import RawBlock, Chunk
|
||||||
|
|
@ -54,26 +56,20 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
))
|
))
|
||||||
|
|
||||||
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
||||||
# WP-26 v1.0: Erweitert um section_type und block_id Tracking
|
# WP-26 v1.3: Der Parser propagiert section_type bereits rückwirkend für Heading-Sektionen.
|
||||||
# WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER eine neue Sektion (unabhängig vom Profil)
|
# Alle Blöcke einer Heading-Sektion (inkl. Heading selbst) haben bereits den korrekten section_type.
|
||||||
sections: List[Dict[str, Any]] = []
|
sections: List[Dict[str, Any]] = []
|
||||||
curr_blocks = []
|
curr_blocks = []
|
||||||
current_section_type = None # WP-26 v1.1: Tracking des aktuellen section_type
|
|
||||||
|
|
||||||
def _flush_section():
|
def _flush_section():
|
||||||
"""Hilfsfunktion zum Abschließen einer Sektion."""
|
"""Hilfsfunktion zum Abschließen einer Sektion."""
|
||||||
nonlocal curr_blocks, current_section_type
|
nonlocal curr_blocks
|
||||||
if not curr_blocks:
|
if not curr_blocks:
|
||||||
return
|
return
|
||||||
# WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
|
# WP-26 v1.3: section_type wird vom Parser bereits korrekt gesetzt (rückwirkend)
|
||||||
# Priorisiere den ersten Block mit section_type, sonst den Heading-Block
|
# Alle Blöcke einer Heading-Sektion haben denselben section_type
|
||||||
effective_section_type = None
|
effective_section_type = next((cb.section_type for cb in curr_blocks if cb.section_type), None)
|
||||||
effective_block_id = None
|
effective_block_id = next((cb.block_id for cb in curr_blocks if cb.block_id), None)
|
||||||
for cb in curr_blocks:
|
|
||||||
if cb.section_type and effective_section_type is None:
|
|
||||||
effective_section_type = cb.section_type
|
|
||||||
if cb.block_id and effective_block_id is None:
|
|
||||||
effective_block_id = cb.block_id
|
|
||||||
|
|
||||||
sections.append({
|
sections.append({
|
||||||
"text": "\n\n".join([x.text for x in curr_blocks]),
|
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||||
|
|
@ -83,39 +79,15 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
"block_id": effective_block_id
|
"block_id": effective_block_id
|
||||||
})
|
})
|
||||||
curr_blocks = []
|
curr_blocks = []
|
||||||
current_section_type = effective_section_type
|
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
# WP-26 v1.1: Prüfe auf Section-Type-Wechsel BEVOR wir den Block hinzufügen
|
|
||||||
# Wenn sich der section_type ändert, muss die aktuelle Sektion abgeschlossen werden
|
|
||||||
block_section_type = b.section_type
|
|
||||||
|
|
||||||
# Section-Type-Wechsel erkannt?
|
|
||||||
# (Wechsel ist: alter Typ != neuer Typ UND mindestens einer ist nicht None)
|
|
||||||
is_section_type_change = (
|
|
||||||
curr_blocks and # Es gibt bereits Blöcke
|
|
||||||
block_section_type is not None and # Neuer Block hat expliziten section_type
|
|
||||||
current_section_type != block_section_type # Typ hat sich geändert
|
|
||||||
)
|
|
||||||
|
|
||||||
if b.kind == "heading" and b.level <= split_level:
|
if b.kind == "heading" and b.level <= split_level:
|
||||||
# Heading-basierter Split (Standard-Verhalten)
|
# Heading-basierter Split
|
||||||
_flush_section()
|
_flush_section()
|
||||||
curr_blocks = [b]
|
curr_blocks = [b]
|
||||||
current_section_type = block_section_type # Update tracking
|
|
||||||
elif is_section_type_change:
|
|
||||||
# WP-26 v1.1: Section-Type-Wechsel erzwingt neuen Chunk
|
|
||||||
_flush_section()
|
|
||||||
curr_blocks = [b]
|
|
||||||
current_section_type = block_section_type
|
|
||||||
else:
|
else:
|
||||||
# Normales Hinzufügen zum aktuellen Block
|
|
||||||
curr_blocks.append(b)
|
curr_blocks.append(b)
|
||||||
# Update section_type wenn Block einen hat und wir noch keinen haben
|
|
||||||
if block_section_type and not current_section_type:
|
|
||||||
current_section_type = block_section_type
|
|
||||||
|
|
||||||
# Letzte Sektion abschließen
|
|
||||||
_flush_section()
|
_flush_section()
|
||||||
|
|
||||||
# --- SCHRITT 2: Verarbeitung der Queue ---
|
# --- SCHRITT 2: Verarbeitung der Queue ---
|
||||||
|
|
@ -269,11 +241,11 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
"""
|
"""
|
||||||
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
|
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
|
||||||
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
|
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
|
||||||
WP-26 v1.1: Section-Type-Wechsel erzwingt IMMER einen neuen Chunk.
|
WP-26 v1.3: Parser propagiert section_type rückwirkend - vereinfachte Logik.
|
||||||
"""
|
"""
|
||||||
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||||
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
||||||
current_section_type = None # WP-26 v1.1: Tracking des aktuellen section_type
|
current_section_type = None # Tracking des aktuellen section_type
|
||||||
|
|
||||||
def _flush_buffer():
|
def _flush_buffer():
|
||||||
"""Hilfsfunktion zum Flushen des Buffers."""
|
"""Hilfsfunktion zum Flushen des Buffers."""
|
||||||
|
|
@ -282,7 +254,7 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
return
|
return
|
||||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
win = _create_win(context_prefix, buf[0].section_title, txt)
|
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||||
# WP-26 v1.0: Finde effektiven section_type und block_id
|
# WP-26 v1.3: section_type wird bereits vom Parser rückwirkend propagiert
|
||||||
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
||||||
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
||||||
chunks.append(Chunk(
|
chunks.append(Chunk(
|
||||||
|
|
@ -300,10 +272,11 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
curr_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
|
curr_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
|
||||||
block_section_type = b.section_type
|
block_section_type = b.section_type
|
||||||
|
|
||||||
# WP-26 v1.1: Prüfe auf Section-Type-Wechsel
|
# WP-26 v1.3: Prüfe auf Section-Type-Wechsel
|
||||||
|
# Da der Parser section_type rückwirkend setzt, haben alle Blöcke einer
|
||||||
|
# Heading-Sektion denselben section_type. Ein Wechsel bedeutet neue Sektion.
|
||||||
is_section_type_change = (
|
is_section_type_change = (
|
||||||
buf and # Es gibt bereits Blöcke im Buffer
|
buf and # Es gibt bereits Blöcke im Buffer
|
||||||
block_section_type is not None and # Neuer Block hat expliziten section_type
|
|
||||||
current_section_type != block_section_type # Typ hat sich geändert
|
current_section_type != block_section_type # Typ hat sich geändert
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -312,8 +285,8 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
_flush_buffer()
|
_flush_buffer()
|
||||||
|
|
||||||
buf.append(b)
|
buf.append(b)
|
||||||
# Update section_type wenn Block einen hat und wir noch keinen haben
|
# Update section_type
|
||||||
if block_section_type and not current_section_type:
|
if block_section_type:
|
||||||
current_section_type = block_section_type
|
current_section_type = block_section_type
|
||||||
|
|
||||||
# Letzten Buffer flushen
|
# Letzten Buffer flushen
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,8 @@
|
||||||
FILE: tests/test_wp26_section_types.py
|
FILE: tests/test_wp26_section_types.py
|
||||||
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
|
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
|
||||||
WP-26 v1.1: Erweitert um Tests für Section-Split und automatische Edges
|
WP-26 v1.1: Erweitert um Tests für Section-Split und automatische Edges
|
||||||
VERSION: 1.1.0
|
WP-26 v1.3: Erweitert um Tests für rückwirkende section_type Propagation
|
||||||
|
VERSION: 1.3.0
|
||||||
"""
|
"""
|
||||||
import pytest
|
import pytest
|
||||||
from app.core.chunking.chunking_parser import parse_blocks
|
from app.core.chunking.chunking_parser import parse_blocks
|
||||||
|
|
@ -50,6 +51,36 @@ Die Geschichte beginnt hier.
|
||||||
|
|
||||||
# Block-ID sollte "sit" sein
|
# Block-ID sollte "sit" sein
|
||||||
assert heading_blocks[0].block_id == "sit"
|
assert heading_blocks[0].block_id == "sit"
|
||||||
|
|
||||||
|
def test_section_type_propagated_backwards_to_heading(self):
|
||||||
|
"""WP-26 v1.3: section_type wird rückwirkend auf das Heading propagiert."""
|
||||||
|
md = """
|
||||||
|
## Lektion ^learning
|
||||||
|
|
||||||
|
Einleitender Text ohne section callout.
|
||||||
|
|
||||||
|
Noch mehr Text hier...
|
||||||
|
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Und dann kommt der eigentliche Insight-Content.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde den Heading-Block
|
||||||
|
heading_blocks = [b for b in blocks if b.kind == "heading"]
|
||||||
|
assert len(heading_blocks) >= 1
|
||||||
|
|
||||||
|
# WP-26 v1.3: Das Heading sollte section_type "insight" haben
|
||||||
|
# (rückwirkend propagiert, obwohl [!section] später im Abschnitt steht)
|
||||||
|
assert heading_blocks[0].section_type == "insight", \
|
||||||
|
f"Heading sollte section_type 'insight' haben, hat aber: {heading_blocks[0].section_type}"
|
||||||
|
|
||||||
|
# Alle Paragraphen in dieser Sektion sollten auch section_type "insight" haben
|
||||||
|
paragraphs = [b for b in blocks if b.kind == "paragraph"]
|
||||||
|
for p in paragraphs:
|
||||||
|
assert p.section_type == "insight", \
|
||||||
|
f"Paragraph sollte section_type 'insight' haben: {p.text[:50]}"
|
||||||
|
|
||||||
|
|
||||||
class TestSectionTypeScope:
|
class TestSectionTypeScope:
|
||||||
|
|
@ -475,8 +506,10 @@ Meine positive Einstellung hat mir geholfen.
|
||||||
# Mindestens ein Chunk mit section_type "insight"
|
# Mindestens ein Chunk mit section_type "insight"
|
||||||
assert len(insight_chunks) >= 1
|
assert len(insight_chunks) >= 1
|
||||||
|
|
||||||
# Der insight-Chunk sollte den Lektions-Inhalt enthalten
|
# WP-26 v1.2: Der insight-Chunk MUSS die Überschrift "💡 Lektion" enthalten!
|
||||||
|
# (Nicht nur den Inhalt nach dem [!section] Callout)
|
||||||
insight_text = insight_chunks[0].text
|
insight_text = insight_chunks[0].text
|
||||||
|
assert "Lektion" in insight_text, f"Überschrift '💡 Lektion' fehlt im insight-Chunk: {insight_text[:100]}"
|
||||||
assert "durchdenken" in insight_text.lower() or "positive" in insight_text.lower()
|
assert "durchdenken" in insight_text.lower() or "positive" in insight_text.lower()
|
||||||
|
|
||||||
def test_section_type_change_in_smart_mode_forces_split(self):
|
def test_section_type_change_in_smart_mode_forces_split(self):
|
||||||
|
|
@ -516,6 +549,42 @@ Inhalt C mit section_type "insight".
|
||||||
# Der letzte Chunk sollte section_type "insight" haben
|
# Der letzte Chunk sollte section_type "insight" haben
|
||||||
insight_chunks = [c for c in chunks if c.section_type == "insight"]
|
insight_chunks = [c for c in chunks if c.section_type == "insight"]
|
||||||
assert len(insight_chunks) >= 1, "Kein Chunk mit section_type 'insight' gefunden"
|
assert len(insight_chunks) >= 1, "Kein Chunk mit section_type 'insight' gefunden"
|
||||||
|
|
||||||
|
def test_heading_belongs_to_new_section_with_section_type(self):
|
||||||
|
"""WP-26 v1.2: Heading gehört zur neuen Sektion wenn [!section] folgt"""
|
||||||
|
md = """
|
||||||
|
## Section A
|
||||||
|
|
||||||
|
Inhalt ohne section_type.
|
||||||
|
|
||||||
|
## Section B mit Typ
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Inhalt mit section_type.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"target": 2000,
|
||||||
|
"max": 4000,
|
||||||
|
"split_level": 2,
|
||||||
|
"strict_heading_split": False,
|
||||||
|
"enable_smart_edge_allocation": True
|
||||||
|
}
|
||||||
|
|
||||||
|
chunks = strategy_by_heading(blocks, config, "test-note")
|
||||||
|
|
||||||
|
# Es sollten 2 Chunks geben
|
||||||
|
assert len(chunks) == 2, f"Erwartet 2 Chunks, bekommen: {len(chunks)}"
|
||||||
|
|
||||||
|
# Chunk 1: Section A (section_type = None)
|
||||||
|
assert chunks[0].section_type is None
|
||||||
|
assert "Section A" in chunks[0].text
|
||||||
|
assert "Section B" not in chunks[0].text # Heading B darf NICHT in Chunk 1 sein!
|
||||||
|
|
||||||
|
# Chunk 2: Section B (section_type = insight) - MUSS die Überschrift enthalten!
|
||||||
|
assert chunks[1].section_type == "insight"
|
||||||
|
assert "Section B mit Typ" in chunks[1].text, "Überschrift 'Section B mit Typ' muss im insight-Chunk sein!"
|
||||||
|
|
||||||
|
|
||||||
class TestBlockIdParsing:
|
class TestBlockIdParsing:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user