Enhance chunking strategies and graph utilities for section-type transitions and block ID extraction
- Implemented WP-26 v1.1: a section-type change now forces a split in SMART MODE (step 2) as well, improving the chunking logic.
- Updated `parse_link_target` to extract block IDs from section strings, ensuring accurate handling of links with block references.
- Added unit tests to validate section-type change behavior and block ID extraction functionality, enhancing overall reliability.
This commit is contained in:
parent
af3cc0a254
commit
52fdc425f7
|
|
@ -191,6 +191,36 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# FALL B: SMART MODE (Regel 1-3)
|
# FALL B: SMART MODE (Regel 1-3)
|
||||||
|
# WP-26 v1.1: Prüfe auf Section-Type-Wechsel AUCH in Schritt 2
|
||||||
|
# Wenn sich der section_type zwischen current_meta und item ändert, muss gesplittet werden
|
||||||
|
item_section_type = item.get("section_type")
|
||||||
|
current_section_type_meta = current_meta.get("section_type")
|
||||||
|
|
||||||
|
# Section-Type-Wechsel: Von None zu einem Typ ODER von einem Typ zu einem anderen
|
||||||
|
is_section_type_change_step2 = (
|
||||||
|
current_chunk_text and # Es gibt bereits Content
|
||||||
|
(
|
||||||
|
# Wechsel von None zu einem Typ
|
||||||
|
(current_section_type_meta is None and item_section_type is not None) or
|
||||||
|
# Wechsel von einem Typ zu None
|
||||||
|
(current_section_type_meta is not None and item_section_type is None) or
|
||||||
|
# Wechsel zwischen verschiedenen Typen
|
||||||
|
(current_section_type_meta is not None and item_section_type is not None
|
||||||
|
and current_section_type_meta != item_section_type)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_section_type_change_step2:
|
||||||
|
# WP-26 v1.1: Section-Type-Wechsel erzwingt Split
|
||||||
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
|
current_chunk_text = ""
|
||||||
|
# Reset Meta für nächsten Chunk
|
||||||
|
current_meta["title"] = item["meta"].section_title
|
||||||
|
current_meta["path"] = item["meta"].section_path
|
||||||
|
current_meta["section_type"] = item_section_type
|
||||||
|
current_meta["block_id"] = item.get("block_id")
|
||||||
|
|
||||||
combined_text = (current_chunk_text + "\n\n" + item_text).strip() if current_chunk_text else item_text
|
combined_text = (current_chunk_text + "\n\n" + item_text).strip() if current_chunk_text else item_text
|
||||||
combined_est = estimate_tokens(combined_text)
|
combined_est = estimate_tokens(combined_text)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -131,6 +131,12 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
|
||||||
Trennt einen Obsidian-Link [[Target#Section]] in seine Bestandteile Target und Section.
|
Trennt einen Obsidian-Link [[Target#Section]] in seine Bestandteile Target und Section.
|
||||||
Behandelt Self-Links (z.B. [[#Ziele]]), indem die aktuelle note_id eingesetzt wird.
|
Behandelt Self-Links (z.B. [[#Ziele]]), indem die aktuelle note_id eingesetzt wird.
|
||||||
|
|
||||||
|
WP-26 v1.1: Extrahiert Block-ID aus Section-Strings.
|
||||||
|
- Wenn Section "^block-id" enthält, wird nur der Block-ID-Teil extrahiert
|
||||||
|
- Beispiel: "📖 Diagnose: Glioblastom ^kontext" -> section = "kontext"
|
||||||
|
- Beispiel: "^learning" -> section = "learning"
|
||||||
|
- Beispiel: " ^sit" (nur Block-ID) -> section = "sit"
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple (target_id, target_section)
|
Tuple (target_id, target_section)
|
||||||
"""
|
"""
|
||||||
|
|
@ -141,6 +147,16 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
|
||||||
target = parts[0].strip()
|
target = parts[0].strip()
|
||||||
section = parts[1].strip() if len(parts) > 1 else None
|
section = parts[1].strip() if len(parts) > 1 else None
|
||||||
|
|
||||||
|
# WP-26 v1.1: Block-ID-Extraktion aus Section
|
||||||
|
# Wenn die Section ein "^" enthält, extrahiere nur den Block-ID-Teil
|
||||||
|
if section and "^" in section:
|
||||||
|
# Finde den ^block-id Teil
|
||||||
|
import re
|
||||||
|
block_id_match = re.search(r'\^([a-zA-Z0-9_-]+)', section)
|
||||||
|
if block_id_match:
|
||||||
|
# Ersetze die gesamte Section durch nur die Block-ID
|
||||||
|
section = block_id_match.group(1)
|
||||||
|
|
||||||
# Spezialfall: Self-Link innerhalb derselben Datei
|
# Spezialfall: Self-Link innerhalb derselben Datei
|
||||||
if not target and section and current_note_id:
|
if not target and section and current_note_id:
|
||||||
target = current_note_id
|
target = current_note_id
|
||||||
|
|
|
||||||
|
|
@ -478,6 +478,81 @@ Meine positive Einstellung hat mir geholfen.
|
||||||
insight_text = insight_chunks[0].text
|
insight_text = insight_chunks[0].text
|
||||||
assert "durchdenken" in insight_text.lower() or "positive" in insight_text.lower()
|
assert "durchdenken" in insight_text.lower() or "positive" in insight_text.lower()
|
||||||
|
|
||||||
|
def test_section_type_change_in_smart_mode_forces_split(self):
|
||||||
|
"""WP-26 v1.1 Fix: Section-Type-Wechsel erzwingt Split auch in SMART MODE (Schritt 2)"""
|
||||||
|
md = """
|
||||||
|
## Section A ohne Typ
|
||||||
|
|
||||||
|
Inhalt A ohne section_type.
|
||||||
|
|
||||||
|
## Section B ohne Typ
|
||||||
|
|
||||||
|
Inhalt B ohne section_type.
|
||||||
|
|
||||||
|
## Section C mit Typ
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Inhalt C mit section_type "insight".
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# SMART MODE: strict=False, smart_edge=True
|
||||||
|
# Token-Limit hoch genug, dass alles zusammengefasst werden KÖNNTE
|
||||||
|
config = {
|
||||||
|
"target": 2000,
|
||||||
|
"max": 4000,
|
||||||
|
"split_level": 2,
|
||||||
|
"strict_heading_split": False,
|
||||||
|
"enable_smart_edge_allocation": True
|
||||||
|
}
|
||||||
|
|
||||||
|
chunks = strategy_by_heading(blocks, config, "test-note")
|
||||||
|
|
||||||
|
# Trotz hohem Token-Limit sollte Section C ein separater Chunk sein
|
||||||
|
# wegen Section-Type-Wechsel (None -> insight)
|
||||||
|
assert len(chunks) >= 2, f"Erwartet mindestens 2 Chunks, bekommen: {len(chunks)}"
|
||||||
|
|
||||||
|
# Der letzte Chunk sollte section_type "insight" haben
|
||||||
|
insight_chunks = [c for c in chunks if c.section_type == "insight"]
|
||||||
|
assert len(insight_chunks) >= 1, "Kein Chunk mit section_type 'insight' gefunden"
|
||||||
|
|
||||||
|
|
||||||
|
class TestBlockIdParsing:
|
||||||
|
"""UT-18: Block-ID-Extraktion aus Section-Referenzen"""
|
||||||
|
|
||||||
|
def test_block_id_extraction_from_section(self):
|
||||||
|
"""Block-ID wird aus Section-String extrahiert"""
|
||||||
|
from app.core.graph.graph_utils import parse_link_target
|
||||||
|
|
||||||
|
# Test: Überschrift mit Block-ID
|
||||||
|
target, section = parse_link_target("#📖 Diagnose: Glioblastom ^kontext", "note1")
|
||||||
|
assert target == "note1" # Self-Link
|
||||||
|
assert section == "kontext", f"Erwartet 'kontext', bekommen: {section}"
|
||||||
|
|
||||||
|
def test_block_id_extraction_only_caret(self):
|
||||||
|
"""Nur Block-ID mit ^"""
|
||||||
|
from app.core.graph.graph_utils import parse_link_target
|
||||||
|
|
||||||
|
target, section = parse_link_target("#^learning", "note1")
|
||||||
|
assert target == "note1"
|
||||||
|
assert section == "learning"
|
||||||
|
|
||||||
|
def test_block_id_extraction_with_spaces(self):
|
||||||
|
"""Block-ID mit Text davor"""
|
||||||
|
from app.core.graph.graph_utils import parse_link_target
|
||||||
|
|
||||||
|
target, section = parse_link_target("OtherNote#🎭 Emotions-Check ^emotionen", None)
|
||||||
|
assert target == "OtherNote"
|
||||||
|
assert section == "emotionen"
|
||||||
|
|
||||||
|
def test_section_without_block_id(self):
|
||||||
|
"""Section ohne Block-ID bleibt unverändert"""
|
||||||
|
from app.core.graph.graph_utils import parse_link_target
|
||||||
|
|
||||||
|
target, section = parse_link_target("Note#Normale Überschrift", None)
|
||||||
|
assert target == "Note"
|
||||||
|
assert section == "Normale Überschrift"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pytest.main([__file__, "-v"])
|
pytest.main([__file__, "-v"])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user