Refactor provenance handling in EdgeDTO and graph utilities

- Updated provenance priorities and introduced a mapping from internal provenance values to EdgeDTO-compliant literals.
- Added a new function `normalize_provenance` to standardize internal provenance strings.
- Enhanced the `_edge` function to include an `is_internal` flag and provenance normalization.
- Modified the `EdgeDTO` model to include a new `source_hint` field for detailed provenance information and an `is_internal` flag for intra-note edges.
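- Reduced the provenance options in `EdgeDTO` to four valid literals (`explicit`, `rule`, `structure`, `smart`), improving data integrity; finer-grained origin details now live in `source_hint`.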
This commit is contained in:
Lars 2026-01-25 16:27:09 +01:00
parent 0d61a9e191
commit cc258008dc
9 changed files with 2337 additions and 52 deletions

View File

@ -1,13 +1,17 @@
""" """
FILE: app/core/chunking/chunking_models.py FILE: app/core/chunking/chunking_models.py
DESCRIPTION: Datenklassen für das Chunking-System. DESCRIPTION: Datenklassen für das Chunking-System.
WP-26 v1.0: Erweiterung um section_type für typ-spezifische Sektionen.
""" """
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import List, Dict, Optional, Any from typing import List, Dict, Optional, Any
@dataclass @dataclass
class RawBlock: class RawBlock:
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing.""" """
Repräsentiert einen logischen Block aus dem Markdown-Parsing.
WP-26 v1.0: Erweitert um section_type für typ-spezifische Sektionen.
"""
kind: str kind: str
text: str text: str
level: Optional[int] level: Optional[int]
@ -15,10 +19,17 @@ class RawBlock:
section_title: Optional[str] section_title: Optional[str]
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
section_type: Optional[str] = None # z.B. "insight", "decision", "experience"
# WP-26 v1.0: Block-ID für Intra-Note-Links (z.B. "^sit" aus "## Situation ^sit")
block_id: Optional[str] = None
@dataclass @dataclass
class Chunk: class Chunk:
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung.""" """
Das finale Chunk-Objekt für Embedding und Graph-Speicherung.
WP-26 v1.0: Erweitert um section_type für effektiven Typ.
"""
id: str id: str
note_id: str note_id: str
index: int index: int
@ -31,3 +42,8 @@ class Chunk:
neighbors_next: Optional[str] neighbors_next: Optional[str]
candidate_pool: List[Dict[str, Any]] = field(default_factory=list) candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
suggested_edges: Optional[List[str]] = None suggested_edges: Optional[List[str]] = None
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
# Wenn gesetzt, wird dieser als "effektiver Typ" verwendet statt note_type
section_type: Optional[str] = None
# WP-26 v1.0: Block-ID für Intra-Note-Links
block_id: Optional[str] = None

View File

@ -5,16 +5,28 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
Stellt die Funktion parse_edges_robust zur Verfügung. Stellt die Funktion parse_edges_robust zur Verfügung.
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss. WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen. WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
""" """
import re import re
import os import os
import logging
from typing import List, Tuple, Set, Dict, Any, Optional from typing import List, Tuple, Set, Dict, Any, Optional
from .chunking_models import RawBlock from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text from .chunking_utils import extract_frontmatter_from_text
logger = logging.getLogger(__name__)
_WS = re.compile(r'\s+') _WS = re.compile(r'\s+')
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])') _SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
# WP-26 v1.0: Pattern für [!section]-Callouts
# Matches: > [!section] type-name
_SECTION_CALLOUT_PATTERN = re.compile(r'^\s*>\s*\[!section\]\s*(\w+)', re.IGNORECASE)
# WP-26 v1.0: Pattern für Block-IDs in Überschriften
# Matches: ## Titel ^block-id
_BLOCK_ID_PATTERN = re.compile(r'\^([a-zA-Z0-9_-]+)\s*$')
def split_sentences(text: str) -> list[str]: def split_sentences(text: str) -> list[str]:
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion.""" """Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
text = _WS.sub(' ', text.strip()) text = _WS.sub(' ', text.strip())
@ -27,12 +39,18 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6. Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss. WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt). WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
""" """
blocks = [] blocks = []
h1_title = "Dokument" h1_title = "Dokument"
section_path = "/" section_path = "/"
current_section_title = None current_section_title = None
# WP-26 v1.0: State-Machine für Section-Type-Tracking
current_section_type: Optional[str] = None # Aktueller Section-Type (oder None für note_type Fallback)
section_introduced_at_level: Optional[int] = None # Ebene, auf der erste Section eingeführt wurde
current_block_id: Optional[str] = None # Block-ID der aktuellen Sektion
# Frontmatter entfernen # Frontmatter entfernen
fm, text_without_fm = extract_frontmatter_from_text(md_text) fm, text_without_fm = extract_frontmatter_from_text(md_text)
@ -70,8 +88,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
buffer = [] buffer = []
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>) # WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >) # WP-26 v1.0: Erweitert um [!section]-Callouts
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE) # Regex für Callouts: >\s*[!edge], >\s*[!abstract], >\s*[!section] (auch mit mehreren >)
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract|section)\]', re.IGNORECASE)
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen # WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
processed_indices = set() processed_indices = set()
@ -86,13 +105,39 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
# Prüfe, ob diese Zeile ein Callout startet # Prüfe, ob diese Zeile ein Callout startet
callout_match = callout_pattern.match(line) callout_match = callout_pattern.match(line)
if callout_match: if callout_match:
callout_type = callout_match.group(1).lower() # "edge", "abstract", oder "section"
# WP-26 v1.0: [!section] Callout-Behandlung
if callout_type == "section":
# Extrahiere Section-Type aus dem Callout
section_match = _SECTION_CALLOUT_PATTERN.match(line)
if section_match:
new_section_type = section_match.group(1).lower()
current_section_type = new_section_type
# Tracke die Ebene, auf der die erste Section eingeführt wurde
# Wir nehmen die Ebene der letzten Überschrift (section_path basiert)
if section_introduced_at_level is None:
# Bestimme Ebene aus section_path
# "/" = H1, "/Title" = H2, "/Title/Sub" = H3, etc.
path_depth = section_path.count('/') if section_path else 1
section_introduced_at_level = max(1, path_depth + 1)
logger.debug(f"WP-26: Section-Type erkannt: '{new_section_type}' bei '{current_section_title}' (Level: {section_introduced_at_level})")
# [!section] Callout wird nicht als Block hinzugefügt (ist nur Metadaten)
processed_indices.add(i)
continue
# Vorherigen Text-Block abschließen # Vorherigen Text-Block abschließen
if buffer: if buffer:
content = "\n".join(buffer).strip() content = "\n".join(buffer).strip()
if content: if content:
blocks.append(RawBlock( blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title, "paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
buffer = [] buffer = []
@ -120,7 +165,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
blocks.append(RawBlock( blocks.append(RawBlock(
"callout", callout_content, None, section_path, current_section_title, "callout", callout_content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung is_meta_content=True, # WP-24c v4.2.6: Markierung für spätere Entfernung
section_type=current_section_type,
block_id=current_block_id
)) ))
continue continue
@ -133,13 +180,32 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content: if content:
blocks.append(RawBlock( blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title, "paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
buffer = [] buffer = []
level = len(heading_match.group(1)) level = len(heading_match.group(1))
title = heading_match.group(2).strip() title = heading_match.group(2).strip()
# WP-26 v1.0: Block-ID aus Überschrift extrahieren (z.B. "## Titel ^block-id")
block_id_match = _BLOCK_ID_PATTERN.search(title)
if block_id_match:
current_block_id = block_id_match.group(1)
# Entferne Block-ID aus dem Titel für saubere Anzeige
title = _BLOCK_ID_PATTERN.sub('', title).strip()
else:
current_block_id = None
# WP-26 v1.0: Section-Type State-Machine
# Wenn eine Section eingeführt wurde und wir auf gleicher oder höherer Ebene sind:
# -> Automatisch neue Section erkennen (FA-02b)
if section_introduced_at_level is not None and level <= section_introduced_at_level:
# Neue Überschrift auf gleicher oder höherer Ebene -> Reset auf None (note_type Fallback)
current_section_type = None
logger.debug(f"WP-26: Neue Section erkannt bei H{level} '{title}' -> Reset auf note_type")
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet # WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
is_llm_validation_zone = ( is_llm_validation_zone = (
level == llm_validation_level and level == llm_validation_level and
@ -170,7 +236,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone) # Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
blocks.append(RawBlock( blocks.append(RawBlock(
"heading", stripped, level, section_path, current_section_title, "heading", stripped, level, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
continue continue
@ -181,13 +249,17 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content: if content:
blocks.append(RawBlock( blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title, "paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
buffer = [] buffer = []
if stripped == "---": if stripped == "---":
blocks.append(RawBlock( blocks.append(RawBlock(
"separator", "---", None, section_path, current_section_title, "separator", "---", None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
else: else:
buffer.append(line) buffer.append(line)
@ -197,7 +269,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content: if content:
blocks.append(RawBlock( blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title, "paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
)) ))
return blocks, h1_title return blocks, h1_title

View File

@ -6,6 +6,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead. - Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix). - Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht. WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
""" """
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from .chunking_models import RawBlock, Chunk from .chunking_models import RawBlock, Chunk
@ -36,41 +37,70 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
chunks: List[Chunk] = [] chunks: List[Chunk] = []
def _emit(txt, title, path): def _emit(txt, title, path, section_type=None, block_id=None):
"""Schreibt den finalen Chunk ohne Text-Modifikationen.""" """
Schreibt den finalen Chunk ohne Text-Modifikationen.
WP-26 v1.0: Erweitert um section_type und block_id.
"""
idx = len(chunks) idx = len(chunks)
win = _create_win(context_prefix, title, txt) win = _create_win(context_prefix, title, txt)
chunks.append(Chunk( chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt), text=txt, window=win, token_count=estimate_tokens(txt),
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None,
section_type=section_type, block_id=block_id
)) ))
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten --- # --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
# WP-26 v1.0: Erweitert um section_type und block_id Tracking
sections: List[Dict[str, Any]] = [] sections: List[Dict[str, Any]] = []
curr_blocks = [] curr_blocks = []
for b in blocks: for b in blocks:
if b.kind == "heading" and b.level <= split_level: if b.kind == "heading" and b.level <= split_level:
if curr_blocks: if curr_blocks:
# WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
# Priorisiere den ersten Block mit section_type, sonst den Heading-Block
effective_section_type = None
effective_block_id = None
for cb in curr_blocks:
if cb.section_type and effective_section_type is None:
effective_section_type = cb.section_type
if cb.block_id and effective_block_id is None:
effective_block_id = cb.block_id
sections.append({ sections.append({
"text": "\n\n".join([x.text for x in curr_blocks]), "text": "\n\n".join([x.text for x in curr_blocks]),
"meta": curr_blocks[0], "meta": curr_blocks[0],
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading" "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
"section_type": effective_section_type,
"block_id": effective_block_id
}) })
curr_blocks = [b] curr_blocks = [b]
else: else:
curr_blocks.append(b) curr_blocks.append(b)
if curr_blocks: if curr_blocks:
# WP-26 v1.0: Gleiche Logik für den letzten Block
effective_section_type = None
effective_block_id = None
for cb in curr_blocks:
if cb.section_type and effective_section_type is None:
effective_section_type = cb.section_type
if cb.block_id and effective_block_id is None:
effective_block_id = cb.block_id
sections.append({ sections.append({
"text": "\n\n".join([x.text for x in curr_blocks]), "text": "\n\n".join([x.text for x in curr_blocks]),
"meta": curr_blocks[0], "meta": curr_blocks[0],
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading" "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
"section_type": effective_section_type,
"block_id": effective_block_id
}) })
# --- SCHRITT 2: Verarbeitung der Queue --- # --- SCHRITT 2: Verarbeitung der Queue ---
queue = list(sections) queue = list(sections)
current_chunk_text = "" current_chunk_text = ""
current_meta = {"title": None, "path": "/"} # WP-26 v1.0: Erweitert um section_type und block_id
current_meta = {"title": None, "path": "/", "section_type": None, "block_id": None}
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True # Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
is_hard_split_mode = (not smart_edge) or (strict) is_hard_split_mode = (not smart_edge) or (strict)
@ -83,6 +113,9 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
if not current_chunk_text: if not current_chunk_text:
current_meta["title"] = item["meta"].section_title current_meta["title"] = item["meta"].section_title
current_meta["path"] = item["meta"].section_path current_meta["path"] = item["meta"].section_path
# WP-26 v1.0: section_type und block_id aus Item übernehmen
current_meta["section_type"] = item.get("section_type")
current_meta["block_id"] = item.get("block_id")
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over) # FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
if is_hard_split_mode: if is_hard_split_mode:
@ -90,18 +123,23 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften # Kein Carry-Over erlaubt, auch nicht für leere Überschriften
if current_chunk_text: if current_chunk_text:
# Flashe vorherigen Chunk # Flashe vorherigen Chunk
_emit(current_chunk_text, current_meta["title"], current_meta["path"]) _emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = "" current_chunk_text = ""
# Neue Sektion: Initialisiere Meta # Neue Sektion: Initialisiere Meta
current_meta["title"] = item["meta"].section_title current_meta["title"] = item["meta"].section_title
current_meta["path"] = item["meta"].section_path current_meta["path"] = item["meta"].section_path
# WP-26 v1.0: section_type und block_id aus Item übernehmen
current_meta["section_type"] = item.get("section_type")
current_meta["block_id"] = item.get("block_id")
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt # WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
# (nur Überschrift, kein Inhalt) # (nur Überschrift, kein Inhalt)
if item.get("is_empty", False): if item.get("is_empty", False):
# Leere Sektion: Nur Überschrift als Chunk # Leere Sektion: Nur Überschrift als Chunk
_emit(item_text, current_meta["title"], current_meta["path"]) _emit(item_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
else: else:
# Normale Sektion: Prüfe auf Token-Limit # Normale Sektion: Prüfe auf Token-Limit
if estimate_tokens(item_text) > max_tokens: if estimate_tokens(item_text) > max_tokens:
@ -113,16 +151,19 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
while sents: while sents:
s = sents.pop(0); slen = estimate_tokens(s) s = sents.pop(0); slen = estimate_tokens(s)
if take_len + slen > target and take_sents: if take_len + slen > target and take_sents:
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"]) _emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
take_sents = [s]; take_len = slen take_sents = [s]; take_len = slen
else: else:
take_sents.append(s); take_len += slen take_sents.append(s); take_len += slen
if take_sents: if take_sents:
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"]) _emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
else: else:
# Sektion passt: Direkt als Chunk # Sektion passt: Direkt als Chunk
_emit(item_text, current_meta["title"], current_meta["path"]) _emit(item_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = "" current_chunk_text = ""
continue continue
@ -137,7 +178,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
else: else:
if current_chunk_text: if current_chunk_text:
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen # Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
_emit(current_chunk_text, current_meta["title"], current_meta["path"]) _emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = "" current_chunk_text = ""
queue.insert(0, item) queue.insert(0, item)
else: else:
@ -152,7 +194,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
sents.insert(0, s); break sents.insert(0, s); break
take_sents.append(s); take_len += slen take_sents.append(s); take_len += slen
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"]) _emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
if sents: if sents:
remainder = " ".join(sents) remainder = " ".join(sents)
@ -160,15 +203,21 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
if header_prefix and not remainder.startswith(header_prefix): if header_prefix and not remainder.startswith(header_prefix):
remainder = header_prefix + "\n\n" + remainder remainder = header_prefix + "\n\n" + remainder
# Carry-Over: Rest wird vorne in die Queue geschoben # Carry-Over: Rest wird vorne in die Queue geschoben
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True}) # WP-26 v1.0: section_type und block_id weitergeben
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True,
"section_type": item.get("section_type"), "block_id": item.get("block_id")})
if current_chunk_text: if current_chunk_text:
_emit(current_chunk_text, current_meta["title"], current_meta["path"]) _emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
return chunks return chunks
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]: def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus.""" """
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
"""
target = config.get("target", 400); max_tokens = config.get("max", 600) target = config.get("target", 400); max_tokens = config.get("max", 600)
chunks: List[Chunk] = []; buf: List[RawBlock] = [] chunks: List[Chunk] = []; buf: List[RawBlock] = []
@ -178,13 +227,31 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
if curr_tokens + b_tokens > max_tokens and buf: if curr_tokens + b_tokens > max_tokens and buf:
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks) txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
win = _create_win(context_prefix, buf[0].section_title, txt) win = _create_win(context_prefix, buf[0].section_title, txt)
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=curr_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None)) # WP-26 v1.0: Finde effektiven section_type und block_id
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=curr_tokens,
section_title=buf[0].section_title, section_path=buf[0].section_path,
neighbors_prev=None, neighbors_next=None,
section_type=effective_section_type, block_id=effective_block_id
))
buf = [] buf = []
buf.append(b) buf.append(b)
if buf: if buf:
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks) txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
win = _create_win(context_prefix, buf[0].section_title, txt) win = _create_win(context_prefix, buf[0].section_title, txt)
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None)) # WP-26 v1.0: Finde effektiven section_type und block_id
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=buf[0].section_title, section_path=buf[0].section_path,
neighbors_prev=None, neighbors_next=None,
section_type=effective_section_type, block_id=effective_block_id
))
return chunks return chunks

View File

@ -12,28 +12,85 @@ STATUS: Active
import os import os
import uuid import uuid
import hashlib import hashlib
from typing import Iterable, List, Optional, Set, Any, Tuple from typing import Dict, Iterable, List, Optional, Set, Any, Tuple
try: try:
import yaml import yaml
except ImportError: except ImportError:
yaml = None yaml = None
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft # WP-26 v1.0: Provenance-Literale auf valide EdgeDTO-Werte reduziert
# Legacy-Prioritäten für interne Verarbeitung (werden zu source_hint gemappt)
PROVENANCE_PRIORITY = { PROVENANCE_PRIORITY = {
# Explizite Kanten (provenance: "explicit")
"explicit:wikilink": 1.00, "explicit:wikilink": 1.00,
"inline:rel": 0.95, "inline:rel": 0.95,
"callout:edge": 0.90, "callout:edge": 0.90,
"explicit:callout": 0.90, # WP-24c v4.2.7: Callout-Kanten aus candidate_pool "explicit:callout": 0.90,
"semantic_ai": 0.90, # Validierte KI-Kanten
"structure:belongs_to": 1.00,
"structure:order": 0.95, # next/prev
"explicit:note_scope": 1.00, "explicit:note_scope": 1.00,
"explicit:note_zone": 1.00, # WP-24c v4.2.0: Note-Scope Zonen (höchste Priorität) "explicit:note_zone": 1.00,
# Regel-basierte Kanten (provenance: "rule")
"derived:backlink": 0.90, "derived:backlink": 0.90,
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml "edge_defaults": 0.70,
"schema_default": 0.85,
# Struktur-Kanten (provenance: "structure")
"structure:belongs_to": 1.00,
"structure:order": 0.95,
# KI-generierte Kanten (provenance: "smart")
"semantic_ai": 0.90,
"global_pool": 0.80,
} }
# WP-26 v1.0: Mapping from internal provenance values to EdgeDTO-compliant
# literals. Each value is a (provenance, source_hint) tuple; source_hint is
# None when the internal value carries no detail beyond its category.
PROVENANCE_TO_DTO = {
    # explicit
    "explicit:wikilink": ("explicit", "wikilink"),
    "explicit:callout": ("explicit", "callout"),
    "explicit:note_scope": ("explicit", "note_scope"),
    "explicit:note_zone": ("explicit", "note_zone"),
    "inline:rel": ("explicit", "inline_rel"),
    "callout:edge": ("explicit", "callout"),
    "explicit": ("explicit", None),
    # rule
    "derived:backlink": ("rule", "backlink"),
    "edge_defaults": ("rule", "edge_defaults"),
    "schema_default": ("rule", "schema_default"),
    "inferred:schema": ("rule", "schema_default"),
    "rule": ("rule", None),
    # structure
    "structure:belongs_to": ("structure", "belongs_to"),
    "structure:order": ("structure", "order"),
    "structure": ("structure", None),
    # smart
    "semantic_ai": ("smart", None),
    "global_pool": ("smart", "global_pool"),
    "smart": ("smart", None),
}

# Prefix fallback for values missing from PROVENANCE_TO_DTO, as
# (internal prefix, DTO literal) pairs checked in order. "inferred" and
# "smart" are included so that unmapped inferred/AI-generated values are not
# silently reported as "explicit" (the exact-match table already files
# "inferred:schema" under "rule" and "smart" under "smart").
_PROVENANCE_PREFIX_FALLBACK = (
    ("explicit", "explicit"),
    ("structure", "structure"),
    ("rule", "rule"),
    ("derived", "rule"),
    ("inferred", "rule"),
    ("smart", "smart"),
)


def normalize_provenance(internal_provenance: str) -> Tuple[str, Optional[str]]:
    """
    WP-26 v1.0: Normalize an internal provenance value to EdgeDTO literals.

    Resolution order:
      1. Exact lookup in PROVENANCE_TO_DTO.
      2. Prefix match against _PROVENANCE_PREFIX_FALLBACK; the text after the
         last ":" becomes the source_hint (None if there is no ":" or the
         text after it is empty).
      3. Default ("explicit", None).

    Args:
        internal_provenance: Internal provenance string
            (e.g. "explicit:callout"). None, non-string and empty values are
            tolerated and resolve to the default instead of raising.

    Returns:
        Tuple (provenance, source_hint) where provenance is one of
        "explicit", "rule", "structure", "smart".
    """
    # Guard first: a payload may carry provenance=None, which previously
    # crashed on .startswith().
    if not isinstance(internal_provenance, str) or not internal_provenance:
        return ("explicit", None)

    exact = PROVENANCE_TO_DTO.get(internal_provenance)
    if exact is not None:
        return exact

    for prefix, dto_literal in _PROVENANCE_PREFIX_FALLBACK:
        if internal_provenance.startswith(prefix):
            # Keep only the last ":"-separated segment as hint (same as the
            # previous split(":")[-1]), but map an empty segment to None.
            _, sep, hint = internal_provenance.rpartition(":")
            return (dto_literal, hint if sep and hint else None)

    # Default: explicit without source_hint
    return ("explicit", None)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Pfad-Auflösung (Integration der .env Umgebungsvariablen) # Pfad-Auflösung (Integration der .env Umgebungsvariablen)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -123,7 +180,15 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
""" """
Konstruiert ein standardisiertes Kanten-Payload für Qdrant. Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
Wird von graph_derive_edges.py benötigt. WP-26 v1.0: Erweitert um is_internal Flag und Provenance-Normalisierung.
Args:
kind: Kantentyp (z.B. "derives", "caused_by")
scope: Granularität ("chunk" oder "note")
source_id: ID der Quelle (Chunk oder Note)
target_id: ID des Ziels (Chunk oder Note)
note_id: ID der Note (für Kontext)
extra: Zusätzliche Payload-Felder
""" """
pl = { pl = {
"kind": kind, "kind": kind,
@ -134,8 +199,24 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
"note_id": note_id, "note_id": note_id,
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt "virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
} }
# WP-26 v1.0: is_internal Flag berechnen
# Intra-Note-Edge: Source und Target gehören zur gleichen Note
source_note = source_id.split("#")[0] if "#" in source_id else source_id
target_note = target_id.split("#")[0] if "#" in target_id else target_id
pl["is_internal"] = (source_note == target_note) or (source_note == note_id and target_note == note_id)
if extra: if extra:
pl.update(extra) pl.update(extra)
# WP-26 v1.0: Provenance normalisieren, falls vorhanden
if "provenance" in extra:
internal_prov = extra["provenance"]
dto_prov, source_hint = normalize_provenance(internal_prov)
pl["provenance"] = dto_prov
if source_hint:
pl["source_hint"] = source_hint
return pl return pl
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@ -3,7 +3,8 @@ FILE: app/core/ingestion/ingestion_chunk_payload.py
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults. Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution. WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
VERSION: 2.4.4 (WP-24c v4.3.0) WP-26 v1.0: Erweiterung um effective_type (section_type || note_type) und note_type-Feld.
VERSION: 2.5.0 (WP-26 v1.0)
STATUS: Active STATUS: Active
""" """
from __future__ import annotations from __future__ import annotations
@ -92,13 +93,34 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
# WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution # WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution
candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", []) candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
section_type = getattr(ch, "section_type", None) if not is_dict else ch.get("section_type")
# WP-26 v1.0: Block-ID für Intra-Note-Links
block_id = getattr(ch, "block_id", None) if not is_dict else ch.get("block_id")
# WP-26 v1.0: Effektiver Typ = section_type || note_type (FA-03)
effective_type = section_type if section_type else note_type
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ (FA-09b)
# Wenn section_type vorhanden, nutze dessen retriever_weight
effective_rw = rw
if section_type:
effective_rw = _resolve_val(section_type, reg, "retriever_weight", rw)
try:
effective_rw = float(effective_rw)
except:
effective_rw = rw
pl: Dict[str, Any] = { pl: Dict[str, Any] = {
"note_id": nid or fm.get("id"), "note_id": nid or fm.get("id"),
"chunk_id": cid, "chunk_id": cid,
"title": title, "title": title,
"index": int(index), "index": int(index),
"ord": int(index) + 1, "ord": int(index) + 1,
"type": note_type, # WP-26 v1.0: type enthält den effektiven Typ (section_type || note_type)
"type": effective_type,
# WP-26 v1.0: note_type ist immer der ursprüngliche Note-Typ (für Filterung)
"note_type": note_type,
"tags": tags, "tags": tags,
"text": text, "text": text,
"window": window, "window": window,
@ -107,9 +129,13 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
"section": section, "section": section,
"path": note_path, "path": note_path,
"source_path": kwargs.get("file_path") or note_path, "source_path": kwargs.get("file_path") or note_path,
"retriever_weight": rw, # WP-26 v1.0: retriever_weight basiert auf effektivem Typ
"retriever_weight": effective_rw,
"chunk_profile": cp, "chunk_profile": cp,
"candidate_pool": candidate_pool # WP-24c v4.3.0: Kritisch für Chunk-Attribution "candidate_pool": candidate_pool, # WP-24c v4.3.0: Kritisch für Chunk-Attribution
# WP-26 v1.0: Optionale Felder für Section-Type-Tracking
"section_type": section_type, # Expliziter Section-Type (oder None)
"block_id": block_id, # Block-ID für Intra-Note-Links (oder None)
} }
# Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern) # Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern)

View File

@ -46,16 +46,18 @@ class EdgeDTO(BaseModel):
target: str target: str
weight: float weight: float
direction: Literal["out", "in", "undirected"] = "out" direction: Literal["out", "in", "undirected"] = "out"
# WP-24c v4.5.3: Erweiterte Provenance-Werte für Chunk-Aware Edges # WP-26 v1.0: Provenance auf valide Literale reduziert (EdgeDTO-Constraint)
# Unterstützt alle tatsächlich verwendeten Provenance-Typen im System # Detail-Informationen werden über source_hint transportiert
provenance: Optional[Literal[ provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
"explicit", "rule", "smart", "structure", # WP-26 v1.0: Neues Feld für Detail-Informationen zur Herkunft
"explicit:callout", "explicit:wikilink", "explicit:note_zone", "explicit:note_scope", source_hint: Optional[Literal[
"inline:rel", "callout:edge", "semantic_ai", "structure:belongs_to", "structure:order", "callout", "wikilink", "inline_rel", "schema_default", "note_scope",
"derived:backlink", "edge_defaults", "global_pool" "note_zone", "belongs_to", "order", "backlink", "edge_defaults", "global_pool"
]] = "explicit" ]] = None
confidence: float = 1.0 confidence: float = 1.0
target_section: Optional[str] = None target_section: Optional[str] = None
# WP-26 v1.0: Flag für Intra-Note-Edges
is_internal: Optional[bool] = None
# --- Request Models --- # --- Request Models ---

View File

@ -0,0 +1,284 @@
# WP-26 Manuelle Testszenarien
**Version:** 1.0
**Datum:** 25. Januar 2026
**Status:** Phase 1 Implementierung abgeschlossen
---
## 1. Überblick
Dieses Dokument beschreibt die manuellen Testszenarien für WP-26 Phase 1: Section-Types und Intra-Note-Edges.
---
## 2. Voraussetzungen
1. **Python-Umgebung** mit allen Dependencies aus `requirements.txt`
2. **Qdrant-Instanz** erreichbar (lokal oder Docker)
3. **Vault mit Test-Note** (siehe Abschnitt 3)
---
## 3. Test-Note erstellen
Erstelle eine neue Markdown-Datei im Vault mit folgendem Inhalt:
```markdown
---
id: wp26-test-experience
title: WP-26 Test Experience
type: experience
tags: [test, wp26]
---
# WP-26 Test Experience
## Situation ^sit
> [!section] experience
Am 25. Januar 2026 testete ich das neue Section-Type Feature.
Dies ist der Experience-Teil der Note.
## Meine Reaktion ^react
> [!section] experience
> [!edge] followed_by
> [[#^sit]]
Ich war zunächst skeptisch, aber die Implementierung sah solide aus.
## Reflexion ^ref
> [!section] insight
Diese Erfahrung zeigt mir, dass typ-spezifische Sektionen
die semantische Präzision des Retrievals verbessern können.
> [!abstract] Semantic Edges
>> [!edge] derives
>> [[#^sit]]
>> [[#^react]]
## Nächste Schritte ^next
> [!section] decision
Ich werde:
1. Die Tests ausführen
2. Die Ergebnisse dokumentieren
> [!edge] caused_by
> [[#^ref]]
```
---
## 4. Testszenarien
### 4.1 TS-01: Section-Type-Erkennung
**Ziel:** Prüfen, ob `[!section]`-Callouts korrekt erkannt werden.
**Schritte:**
1. Importiere die Test-Note via `scripts/import_markdown.py`
2. Prüfe die Chunks in Qdrant via API oder Debug-Skript
**Prüfkriterien:**
| Chunk | Erwarteter `type` | Erwarteter `note_type` | Erwarteter `section` |
|-------|-------------------|------------------------|----------------------|
| #c00 | experience | experience | Situation |
| #c01 | experience | experience | Meine Reaktion |
| #c02 | insight | experience | Reflexion |
| #c03 | decision | experience | Nächste Schritte |
**Prüf-Script:**
```python
# scripts/check_wp26_chunks.py
from qdrant_client import QdrantClient
client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"
# Hole alle Chunks der Note
result = client.scroll(
collection_name="mindnet_chunks",
scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
with_payload=True,
limit=100
)
for point in result[0]:
p = point.payload
print(f"Chunk: {p.get('chunk_id')}")
print(f" type: {p.get('type')}")
print(f" note_type: {p.get('note_type')}")
print(f" section: {p.get('section')}")
print(f" section_type: {p.get('section_type')}")
print(f" block_id: {p.get('block_id')}")
print()
```
---
### 4.2 TS-02: Block-ID-Erkennung
**Ziel:** Prüfen, ob Block-IDs (`^id`) aus Überschriften korrekt extrahiert werden.
**Prüfkriterien:**
| Chunk | Erwartete `block_id` |
|-------|---------------------|
| #c00 | sit |
| #c01 | react |
| #c02 | ref |
| #c03 | next |
---
### 4.3 TS-03: is_internal Flag für Edges
**Ziel:** Prüfen, ob Intra-Note-Edges das `is_internal: true` Flag erhalten.
**Schritte:**
1. Importiere die Test-Note
2. Prüfe die Edges in Qdrant
**Prüfkriterien:**
| Edge | `is_internal` |
|------|---------------|
| #c01 → #c00 (followed_by) | `true` |
| #c02 → #c00 (derives) | `true` |
| #c02 → #c01 (derives) | `true` |
| #c03 → #c02 (caused_by) | `true` |
| Alle structure edges (next/prev) | `true` |
**Prüf-Script:**
```python
# scripts/check_wp26_edges.py
from qdrant_client import QdrantClient
client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"
# Hole alle Edges der Note
result = client.scroll(
collection_name="mindnet_edges",
scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
with_payload=True,
limit=100
)
for point in result[0]:
p = point.payload
kind = p.get('kind', 'unknown')
source = p.get('source_id', '?')
target = p.get('target_id', '?')
is_internal = p.get('is_internal', 'MISSING')
provenance = p.get('provenance', '?')
source_hint = p.get('source_hint', '-')
print(f"{source} --[{kind}]--> {target}")
print(f" is_internal: {is_internal}")
print(f" provenance: {provenance}")
print(f" source_hint: {source_hint}")
print()
```
---
### 4.4 TS-04: Provenance-Normalisierung
**Ziel:** Prüfen, ob Provenance-Werte korrekt normalisiert werden.
**Prüfkriterien:**
| Altes Provenance | Neues `provenance` | `source_hint` |
|------------------|-------------------|---------------|
| explicit:callout | explicit | callout |
| explicit:wikilink | explicit | wikilink |
| structure:belongs_to | structure | belongs_to |
| structure:order | structure | order |
| edge_defaults | rule | edge_defaults |
---
### 4.5 TS-05: Automatische Section-Erkennung
**Ziel:** Prüfen, ob neue Überschriften ohne `[!section]` automatisch eine neue Sektion beginnen und der `type` dabei auf den `note_type` zurückfällt.
**Test-Note:**
```markdown
---
id: wp26-test-auto-section
type: experience
---
# Test Auto Section
## Section A ^a
> [!section] insight
Content A (insight).
## Section B ^b
Content B (sollte experience sein - Fallback).
## Section C ^c
> [!section] decision
Content C (decision).
```
**Prüfkriterien:**
| Chunk | `type` | Grund |
|-------|--------|-------|
| Section A | insight | Explizites `[!section]` |
| Section B | experience | Fallback auf `note_type` |
| Section C | decision | Explizites `[!section]` |
---
## 5. Unit-Tests ausführen
```bash
# Im Projekt-Root
cd c:\Dev\cursor\mindnet
# Aktiviere virtuelle Umgebung (falls vorhanden)
# .venv\Scripts\activate
# Führe WP-26 Tests aus
python -m pytest tests/test_wp26_section_types.py -v
```
**Erwartetes Ergebnis:** Alle Tests grün.
---
## 6. Bekannte Einschränkungen
1. **Block-ID-Stability:** Obsidian aktualisiert Block-IDs nicht automatisch bei Umbenennung von Überschriften.
2. **Heading-Links:** Links wie `[[#Section Name]]` werden unterstützt, aber Block-References (`[[#^id]]`) werden bevorzugt.
3. **Nested Callouts:** Verschachtelte Callouts (`>> [!edge]`) werden korrekt verarbeitet.
---
## 7. Nächste Schritte (Phase 2)
Nach erfolgreicher Validierung von Phase 1:
1. **Retriever-Anpassung:** Path-Bonus für Intra-Note-Edges
2. **Graph-Exploration:** Navigation entlang `typical edges` aus `graph_schema.md`
3. **Schema-Validierung:** Agentic Validation gegen effektive Chunk-Typen
---
**Ende der Testdokumentation**

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,265 @@
"""
FILE: tests/test_wp26_section_types.py
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
VERSION: 1.0.0
"""
import pytest
from app.core.chunking.chunking_parser import parse_blocks
from app.core.chunking.chunking_models import RawBlock, Chunk
from app.core.graph.graph_utils import normalize_provenance, _edge
class TestSectionTypeRecognition:
    """UT-01: parser recognition of ``[!section]`` callouts and heading block IDs."""

    def test_section_type_recognition(self):
        """A paragraph after a ``[!section] insight`` callout carries that type."""
        markdown = (
            "\n"
            "## Reflexion ^ref\n"
            "> [!section] insight\n"
            "Content here about insights.\n"
        )
        parsed, _ = parse_blocks(markdown)

        # Only paragraph blocks matter here; the fixture has its text after
        # the section callout.
        paragraphs = [blk for blk in parsed if blk.kind == "paragraph"]
        assert len(paragraphs) >= 1
        assert paragraphs[0].section_type == "insight"

    def test_section_type_with_block_id(self):
        """The ``^sit`` suffix of a heading is exposed as the heading's block_id."""
        markdown = (
            "\n"
            "## Situation ^sit\n"
            "> [!section] experience\n"
            "Die Geschichte beginnt hier.\n"
        )
        parsed, _ = parse_blocks(markdown)

        headings = [blk for blk in parsed if blk.kind == "heading"]
        assert len(headings) >= 1
        assert headings[0].block_id == "sit"
class TestSectionTypeScope:
    """UT-02: the scope of a section type ends at the next same-level heading."""

    def test_section_type_scope_ends_at_same_level_heading(self):
        """A following H2 without its own callout resets section_type to None."""
        markdown = (
            "\n"
            "## Section A\n"
            "> [!section] insight\n"
            "Content A with insight.\n"
            "## Section B\n"
            "Content B without section callout.\n"
        )
        parsed, _ = parse_blocks(markdown)
        body_blocks = [blk for blk in parsed if blk.kind == "paragraph"]

        # Section A declares "insight" explicitly.
        assert body_blocks[0].section_type == "insight"
        # Section B has no callout, so the type must have been reset.
        assert body_blocks[1].section_type is None
class TestProvenanceNormalization:
    """Unit tests for ``normalize_provenance`` (WP-26 v1.0).

    Each case feeds an internal provenance string and checks the returned
    ``(provenance, source_hint)`` pair.
    """

    def test_normalize_explicit_callout(self):
        """explicit:callout -> (explicit, callout)"""
        dto_value, detail = normalize_provenance("explicit:callout")
        assert (dto_value, detail) == ("explicit", "callout")

    def test_normalize_explicit_wikilink(self):
        """explicit:wikilink -> (explicit, wikilink)"""
        dto_value, detail = normalize_provenance("explicit:wikilink")
        assert (dto_value, detail) == ("explicit", "wikilink")

    def test_normalize_structure_belongs_to(self):
        """structure:belongs_to -> (structure, belongs_to)"""
        dto_value, detail = normalize_provenance("structure:belongs_to")
        assert (dto_value, detail) == ("structure", "belongs_to")

    def test_normalize_schema_default(self):
        """inferred:schema -> (rule, schema_default)"""
        dto_value, detail = normalize_provenance("inferred:schema")
        assert (dto_value, detail) == ("rule", "schema_default")

    def test_normalize_unknown_fallback(self):
        """An unrecognised provenance falls back to (explicit, None)."""
        dto_value, detail = normalize_provenance("unknown_provenance")
        assert dto_value == "explicit"
        assert detail is None
class TestIsInternalFlag:
    """UT-13: the ``is_internal`` flag that ``_edge`` puts on edge payloads."""

    @staticmethod
    def _internal_flag(kind, source_id, target_id):
        """Build a chunk-scoped edge owned by ``note1`` and return its is_internal value."""
        payload = _edge(
            kind=kind,
            scope="chunk",
            source_id=source_id,
            target_id=target_id,
            note_id="note1",
        )
        return payload["is_internal"]

    def test_is_internal_true_for_same_note(self):
        """Edges between two chunks of the same note are internal."""
        assert self._internal_flag("derives", "note1#c01", "note1#c02") is True

    def test_is_internal_false_for_different_notes(self):
        """Edges between chunks of different notes are not internal."""
        assert self._internal_flag("references", "note1#c01", "note2#c01") is False

    def test_is_internal_true_for_note_to_chunk(self):
        """An edge from a chunk to its own parent note is internal."""
        assert self._internal_flag("belongs_to", "note1#c01", "note1") is True
class TestEdgeProvenanceInPayload:
    """Checks that ``_edge`` normalises a provenance passed in via ``extra``."""

    def test_edge_provenance_normalized(self):
        """explicit:callout in ``extra`` ends up as provenance + source_hint."""
        extra_fields = {"provenance": "explicit:callout"}
        payload = _edge(
            kind="derives",
            scope="chunk",
            source_id="note1#c01",
            target_id="note1#c02",
            note_id="note1",
            extra=extra_fields,
        )

        assert payload["provenance"] == "explicit"
        assert payload["source_hint"] == "callout"
class TestAutomaticSectionRecognition:
    """UT-09: new headings start a new section automatically."""

    def test_automatic_section_recognition_at_same_heading_level(self):
        """Same-level headings without a callout yield section_type None."""
        markdown = (
            "\n"
            "## Situation ^sit\n"
            "> [!section] experience\n"
            "Content A.\n"
            "## Reflexion ^ref\n"
            "Content B.\n"
            "## Learnings ^learn\n"
            "> [!section] insight\n"
            "Content C.\n"
            "## Ausblick ^out\n"
            "Content D.\n"
        )
        parsed, _ = parse_blocks(markdown)

        # One paragraph per heading, in document order.
        body_blocks = [blk for blk in parsed if blk.kind == "paragraph"]
        assert len(body_blocks) == 4
        situation, reflexion, learnings, ausblick = body_blocks

        # Explicit callout.
        assert situation.section_type == "experience"
        # No callout: None here, so the chunk payload can fall back to note_type.
        assert reflexion.section_type is None
        # Explicit callout.
        assert learnings.section_type == "insight"
        # No callout again.
        assert ausblick.section_type is None
class TestSeparateSectionCallout:
    """UT-10: a section callout placed anywhere inside a section."""

    def test_section_callout_separate_from_edge_callout(self):
        """A section callout may stand apart from the edge callouts."""
        markdown = (
            "\n"
            "## Reflexion ^ref\n"
            "Einleitender Text hier...\n"
            "> [!section] insight\n"
            "Weiterer normaler Inhalt...\n"
            "> [!edge] derives\n"
            "> [[#^sit]]\n"
        )
        parsed, _ = parse_blocks(markdown)
        body_blocks = [blk for blk in parsed if blk.kind == "paragraph"]

        # Text exists both before and after the section callout.
        assert len(body_blocks) >= 2

        # Which paragraph receives the type depends on the parser, so the
        # check is deliberately loose: at least one paragraph must be "insight".
        seen_types = tuple(blk.section_type for blk in body_blocks)
        assert "insight" in seen_types
class TestNestedEdgeCallouts:
    """UT-08: edge callouts nested inside a container callout."""

    def test_nested_callouts_recognized(self):
        """Nested ``>> [!edge]`` callouts are reported as callout blocks."""
        markdown = (
            "\n"
            "> [!abstract] Semantic Edges\n"
            ">> [!edge] derived_from\n"
            ">> [[Target1#Section]]\n"
            ">\n"
            ">> [!edge] solves\n"
            ">> [[Target2]]\n"
        )
        parsed, _ = parse_blocks(markdown)

        # At least one callout block must come out of the container.
        callout_blocks = [blk for blk in parsed if blk.kind == "callout"]
        assert len(callout_blocks) >= 1
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of exiting. Propagate it so
    # that running this file directly fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))