Refactor provenance handling in EdgeDTO and graph utilities

- Updated provenance priorities and introduced a mapping from internal provenance values to EdgeDTO-compliant literals.
- Added a new function `normalize_provenance` to standardize internal provenance strings.
- Enhanced the `_edge` function to include an `is_internal` flag and provenance normalization.
- Modified the `EdgeDTO` model to include a new `source_hint` field for detailed provenance information and an `is_internal` flag for intra-note edges.
- Reduced the provenance options in `EdgeDTO` to valid literals, improving data integrity.
This commit is contained in:
Lars 2026-01-25 16:27:09 +01:00
parent 0d61a9e191
commit cc258008dc
9 changed files with 2337 additions and 52 deletions

View File

@ -1,13 +1,17 @@
"""
FILE: app/core/chunking/chunking_models.py
DESCRIPTION: Datenklassen für das Chunking-System.
WP-26 v1.0: Erweiterung um section_type für typ-spezifische Sektionen.
"""
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Any
@dataclass
class RawBlock:
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing."""
"""
Repräsentiert einen logischen Block aus dem Markdown-Parsing.
WP-26 v1.0: Erweitert um section_type für typ-spezifische Sektionen.
"""
kind: str
text: str
level: Optional[int]
@ -15,10 +19,17 @@ class RawBlock:
section_title: Optional[str]
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
section_type: Optional[str] = None # z.B. "insight", "decision", "experience"
# WP-26 v1.0: Block-ID für Intra-Note-Links (z.B. "^sit" aus "## Situation ^sit")
block_id: Optional[str] = None
@dataclass
class Chunk:
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung."""
"""
Das finale Chunk-Objekt für Embedding und Graph-Speicherung.
WP-26 v1.0: Erweitert um section_type für effektiven Typ.
"""
id: str
note_id: str
index: int
@ -30,4 +41,9 @@ class Chunk:
neighbors_prev: Optional[str]
neighbors_next: Optional[str]
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
suggested_edges: Optional[List[str]] = None
suggested_edges: Optional[List[str]] = None
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
# Wenn gesetzt, wird dieser als "effektiver Typ" verwendet statt note_type
section_type: Optional[str] = None
# WP-26 v1.0: Block-ID für Intra-Note-Links
block_id: Optional[str] = None

View File

@ -5,16 +5,28 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
Stellt die Funktion parse_edges_robust zur Verfügung.
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
"""
import re
import os
import logging
from typing import List, Tuple, Set, Dict, Any, Optional
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text
logger = logging.getLogger(__name__)
_WS = re.compile(r'\s+')
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
# WP-26 v1.0: Pattern for [!section] callouts.
# Matches "> [!section] type-name". One or more ">" markers are accepted so
# that nested callouts (">>") are recognised the same way as by the generic
# callout detection in parse_blocks. Group 1 is the section type; hyphenated
# names such as "type-name" are captured in full instead of being cut at "-".
_SECTION_CALLOUT_PATTERN = re.compile(r'^\s*>+\s*\[!section\]\s*(\w+(?:-\w+)*)', re.IGNORECASE)

# WP-26 v1.0: Pattern for block IDs at the end of a heading.
# Matches "## Title ^block-id"; group 1 is the ID without the leading caret.
_BLOCK_ID_PATTERN = re.compile(r'\^([a-zA-Z0-9_-]+)\s*$')
def split_sentences(text: str) -> list[str]:
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
text = _WS.sub(' ', text.strip())
@ -27,12 +39,18 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
"""
blocks = []
h1_title = "Dokument"
section_path = "/"
current_section_title = None
# WP-26 v1.0: State-Machine für Section-Type-Tracking
current_section_type: Optional[str] = None # Aktueller Section-Type (oder None für note_type Fallback)
section_introduced_at_level: Optional[int] = None # Ebene, auf der erste Section eingeführt wurde
current_block_id: Optional[str] = None # Block-ID der aktuellen Sektion
# Frontmatter entfernen
fm, text_without_fm = extract_frontmatter_from_text(md_text)
@ -70,8 +88,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
buffer = []
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
# WP-26 v1.0: Erweitert um [!section]-Callouts
# Regex für Callouts: >\s*[!edge], >\s*[!abstract], >\s*[!section] (auch mit mehreren >)
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract|section)\]', re.IGNORECASE)
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
processed_indices = set()
@ -86,13 +105,39 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
# Prüfe, ob diese Zeile ein Callout startet
callout_match = callout_pattern.match(line)
if callout_match:
callout_type = callout_match.group(1).lower() # "edge", "abstract", oder "section"
# WP-26 v1.0: [!section] Callout-Behandlung
if callout_type == "section":
# Extrahiere Section-Type aus dem Callout
section_match = _SECTION_CALLOUT_PATTERN.match(line)
if section_match:
new_section_type = section_match.group(1).lower()
current_section_type = new_section_type
# Tracke die Ebene, auf der die erste Section eingeführt wurde
# Wir nehmen die Ebene der letzten Überschrift (section_path basiert)
if section_introduced_at_level is None:
# Bestimme Ebene aus section_path
# "/" = H1, "/Title" = H2, "/Title/Sub" = H3, etc.
path_depth = section_path.count('/') if section_path else 1
section_introduced_at_level = max(1, path_depth + 1)
logger.debug(f"WP-26: Section-Type erkannt: '{new_section_type}' bei '{current_section_title}' (Level: {section_introduced_at_level})")
# [!section] Callout wird nicht als Block hinzugefügt (ist nur Metadaten)
processed_indices.add(i)
continue
# Vorherigen Text-Block abschließen
if buffer:
content = "\n".join(buffer).strip()
if content:
blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
buffer = []
@ -120,7 +165,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
blocks.append(RawBlock(
"callout", callout_content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung
is_meta_content=True, # WP-24c v4.2.6: Markierung für spätere Entfernung
section_type=current_section_type,
block_id=current_block_id
))
continue
@ -133,13 +180,32 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content:
blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
buffer = []
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
# WP-26 v1.0: Block-ID aus Überschrift extrahieren (z.B. "## Titel ^block-id")
block_id_match = _BLOCK_ID_PATTERN.search(title)
if block_id_match:
current_block_id = block_id_match.group(1)
# Entferne Block-ID aus dem Titel für saubere Anzeige
title = _BLOCK_ID_PATTERN.sub('', title).strip()
else:
current_block_id = None
# WP-26 v1.0: Section-Type State-Machine
# Wenn eine Section eingeführt wurde und wir auf gleicher oder höherer Ebene sind:
# -> Automatisch neue Section erkennen (FA-02b)
if section_introduced_at_level is not None and level <= section_introduced_at_level:
# Neue Überschrift auf gleicher oder höherer Ebene -> Reset auf None (note_type Fallback)
current_section_type = None
logger.debug(f"WP-26: Neue Section erkannt bei H{level} '{title}' -> Reset auf note_type")
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
is_llm_validation_zone = (
level == llm_validation_level and
@ -170,7 +236,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
blocks.append(RawBlock(
"heading", stripped, level, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
continue
@ -181,13 +249,17 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content:
blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
buffer = []
if stripped == "---":
blocks.append(RawBlock(
"separator", "---", None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
else:
buffer.append(line)
@ -197,7 +269,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
if content:
blocks.append(RawBlock(
"paragraph", content, None, section_path, current_section_title,
exclude_from_chunking=in_exclusion_zone
exclude_from_chunking=in_exclusion_zone,
section_type=current_section_type,
block_id=current_block_id
))
return blocks, h1_title

View File

@ -6,6 +6,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
"""
from typing import List, Dict, Any, Optional
from .chunking_models import RawBlock, Chunk
@ -36,41 +37,70 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
chunks: List[Chunk] = []
def _emit(txt, title, path):
"""Schreibt den finalen Chunk ohne Text-Modifikationen."""
def _emit(txt, title, path, section_type=None, block_id=None):
"""
Schreibt den finalen Chunk ohne Text-Modifikationen.
WP-26 v1.0: Erweitert um section_type und block_id.
"""
idx = len(chunks)
win = _create_win(context_prefix, title, txt)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None,
section_type=section_type, block_id=block_id
))
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
# WP-26 v1.0: Erweitert um section_type und block_id Tracking
sections: List[Dict[str, Any]] = []
curr_blocks = []
for b in blocks:
if b.kind == "heading" and b.level <= split_level:
if curr_blocks:
# WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
# Priorisiere den ersten Block mit section_type, sonst den Heading-Block
effective_section_type = None
effective_block_id = None
for cb in curr_blocks:
if cb.section_type and effective_section_type is None:
effective_section_type = cb.section_type
if cb.block_id and effective_block_id is None:
effective_block_id = cb.block_id
sections.append({
"text": "\n\n".join([x.text for x in curr_blocks]),
"meta": curr_blocks[0],
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
"section_type": effective_section_type,
"block_id": effective_block_id
})
curr_blocks = [b]
else:
curr_blocks.append(b)
if curr_blocks:
# WP-26 v1.0: Gleiche Logik für den letzten Block
effective_section_type = None
effective_block_id = None
for cb in curr_blocks:
if cb.section_type and effective_section_type is None:
effective_section_type = cb.section_type
if cb.block_id and effective_block_id is None:
effective_block_id = cb.block_id
sections.append({
"text": "\n\n".join([x.text for x in curr_blocks]),
"meta": curr_blocks[0],
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
"section_type": effective_section_type,
"block_id": effective_block_id
})
# --- SCHRITT 2: Verarbeitung der Queue ---
queue = list(sections)
current_chunk_text = ""
current_meta = {"title": None, "path": "/"}
# WP-26 v1.0: Erweitert um section_type und block_id
current_meta = {"title": None, "path": "/", "section_type": None, "block_id": None}
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
is_hard_split_mode = (not smart_edge) or (strict)
@ -83,6 +113,9 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
if not current_chunk_text:
current_meta["title"] = item["meta"].section_title
current_meta["path"] = item["meta"].section_path
# WP-26 v1.0: section_type und block_id aus Item übernehmen
current_meta["section_type"] = item.get("section_type")
current_meta["block_id"] = item.get("block_id")
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
if is_hard_split_mode:
@ -90,18 +123,23 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften
if current_chunk_text:
# Flashe vorherigen Chunk
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = ""
# Neue Sektion: Initialisiere Meta
current_meta["title"] = item["meta"].section_title
current_meta["path"] = item["meta"].section_path
# WP-26 v1.0: section_type und block_id aus Item übernehmen
current_meta["section_type"] = item.get("section_type")
current_meta["block_id"] = item.get("block_id")
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
# (nur Überschrift, kein Inhalt)
if item.get("is_empty", False):
# Leere Sektion: Nur Überschrift als Chunk
_emit(item_text, current_meta["title"], current_meta["path"])
_emit(item_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
else:
# Normale Sektion: Prüfe auf Token-Limit
if estimate_tokens(item_text) > max_tokens:
@ -113,16 +151,19 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
while sents:
s = sents.pop(0); slen = estimate_tokens(s)
if take_len + slen > target and take_sents:
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
take_sents = [s]; take_len = slen
else:
take_sents.append(s); take_len += slen
if take_sents:
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
else:
# Sektion passt: Direkt als Chunk
_emit(item_text, current_meta["title"], current_meta["path"])
_emit(item_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = ""
continue
@ -137,7 +178,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
else:
if current_chunk_text:
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
current_chunk_text = ""
queue.insert(0, item)
else:
@ -152,7 +194,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
sents.insert(0, s); break
take_sents.append(s); take_len += slen
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
if sents:
remainder = " ".join(sents)
@ -160,15 +203,21 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
if header_prefix and not remainder.startswith(header_prefix):
remainder = header_prefix + "\n\n" + remainder
# Carry-Over: Rest wird vorne in die Queue geschoben
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})
# WP-26 v1.0: section_type und block_id weitergeben
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True,
"section_type": item.get("section_type"), "block_id": item.get("block_id")})
if current_chunk_text:
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
current_meta["section_type"], current_meta["block_id"])
return chunks
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus."""
"""
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
"""
target = config.get("target", 400); max_tokens = config.get("max", 600)
chunks: List[Chunk] = []; buf: List[RawBlock] = []
@ -178,13 +227,31 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
if curr_tokens + b_tokens > max_tokens and buf:
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
win = _create_win(context_prefix, buf[0].section_title, txt)
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=curr_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
# WP-26 v1.0: Finde effektiven section_type und block_id
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=curr_tokens,
section_title=buf[0].section_title, section_path=buf[0].section_path,
neighbors_prev=None, neighbors_next=None,
section_type=effective_section_type, block_id=effective_block_id
))
buf = []
buf.append(b)
if buf:
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
win = _create_win(context_prefix, buf[0].section_title, txt)
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
# WP-26 v1.0: Finde effektiven section_type und block_id
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
chunks.append(Chunk(
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
text=txt, window=win, token_count=estimate_tokens(txt),
section_title=buf[0].section_title, section_path=buf[0].section_path,
neighbors_prev=None, neighbors_next=None,
section_type=effective_section_type, block_id=effective_block_id
))
return chunks

View File

@ -12,28 +12,85 @@ STATUS: Active
import os
import uuid
import hashlib
from typing import Iterable, List, Optional, Set, Any, Tuple
from typing import Dict, Iterable, List, Optional, Set, Any, Tuple
try:
import yaml
except ImportError:
yaml = None
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
# WP-26 v1.0: Provenance-Literale auf valide EdgeDTO-Werte reduziert
# Legacy-Prioritäten für interne Verarbeitung (werden zu source_hint gemappt)
PROVENANCE_PRIORITY = {
# Explizite Kanten (provenance: "explicit")
"explicit:wikilink": 1.00,
"inline:rel": 0.95,
"callout:edge": 0.90,
"explicit:callout": 0.90, # WP-24c v4.2.7: Callout-Kanten aus candidate_pool
"semantic_ai": 0.90, # Validierte KI-Kanten
"structure:belongs_to": 1.00,
"structure:order": 0.95, # next/prev
"explicit:callout": 0.90,
"explicit:note_scope": 1.00,
"explicit:note_zone": 1.00, # WP-24c v4.2.0: Note-Scope Zonen (höchste Priorität)
"explicit:note_zone": 1.00,
# Regel-basierte Kanten (provenance: "rule")
"derived:backlink": 0.90,
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
"edge_defaults": 0.70,
"schema_default": 0.85,
# Struktur-Kanten (provenance: "structure")
"structure:belongs_to": 1.00,
"structure:order": 0.95,
# KI-generierte Kanten (provenance: "smart")
"semantic_ai": 0.90,
"global_pool": 0.80,
}
# WP-26 v1.0: Mapping from internal provenance values to EdgeDTO literals.
# Each value is a (provenance, source_hint) tuple; source_hint is None when the
# internal value carries no detail beyond the provenance category itself.
PROVENANCE_TO_DTO: Dict[str, Tuple[str, Optional[str]]] = {
    # explicit
    "explicit:wikilink": ("explicit", "wikilink"),
    "explicit:callout": ("explicit", "callout"),
    "explicit:note_scope": ("explicit", "note_scope"),
    "explicit:note_zone": ("explicit", "note_zone"),
    "inline:rel": ("explicit", "inline_rel"),
    "callout:edge": ("explicit", "callout"),
    "explicit": ("explicit", None),
    # rule
    "derived:backlink": ("rule", "backlink"),
    "edge_defaults": ("rule", "edge_defaults"),
    "schema_default": ("rule", "schema_default"),
    "inferred:schema": ("rule", "schema_default"),
    "rule": ("rule", None),
    # structure
    "structure:belongs_to": ("structure", "belongs_to"),
    "structure:order": ("structure", "order"),
    "structure": ("structure", None),
    # smart
    "semantic_ai": ("smart", None),
    "global_pool": ("smart", "global_pool"),
    "smart": ("smart", None),
}
def normalize_provenance(internal_provenance: str) -> Tuple[str, Optional[str]]:
    """
    WP-26 v1.0: Normalize an internal provenance value to EdgeDTO literals.

    Exact matches are resolved via PROVENANCE_TO_DTO. Unknown values fall back
    to prefix matching, where the text after the last ":" becomes the
    source hint.

    Args:
        internal_provenance: Internal provenance string (e.g. "explicit:callout").
            None, non-string and empty values are tolerated and yield the default.

    Returns:
        Tuple (provenance, source_hint). provenance is one of "explicit",
        "rule", "structure" or "smart"; source_hint may be None.
    """
    default = ("explicit", None)

    # Payloads may carry provenance=None (or another non-string); treat that
    # like a missing value instead of failing on .startswith().
    if not isinstance(internal_provenance, str) or not internal_provenance:
        return default

    mapped = PROVENANCE_TO_DTO.get(internal_provenance)
    if mapped is not None:
        return mapped

    # Fallback: prefix matching. "smart" and "inferred" are included so that
    # unknown sub-variants keep their category instead of silently becoming
    # "explicit" ("inferred:schema" is already mapped to "rule" in the table).
    prefix_fallbacks = (
        ("explicit", "explicit"),
        ("structure", "structure"),
        (("rule", "derived", "inferred"), "rule"),
        ("smart", "smart"),
    )
    # NOTE(review): the hint derived here is free text and is not checked
    # against the literals EdgeDTO.source_hint accepts - confirm consumers
    # tolerate unknown hints.
    hint = internal_provenance.split(":")[-1] if ":" in internal_provenance else None
    for prefixes, dto_provenance in prefix_fallbacks:
        if internal_provenance.startswith(prefixes):
            # An empty tail ("explicit:") carries no information -> None.
            return (dto_provenance, hint or None)

    # Default: explicit without source_hint.
    return default
# ---------------------------------------------------------------------------
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
# ---------------------------------------------------------------------------
@ -123,7 +180,15 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
"""
Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
Wird von graph_derive_edges.py benötigt.
WP-26 v1.0: Erweitert um is_internal Flag und Provenance-Normalisierung.
Args:
kind: Kantentyp (z.B. "derives", "caused_by")
scope: Granularität ("chunk" oder "note")
source_id: ID der Quelle (Chunk oder Note)
target_id: ID des Ziels (Chunk oder Note)
note_id: ID der Note (für Kontext)
extra: Zusätzliche Payload-Felder
"""
pl = {
"kind": kind,
@ -134,8 +199,24 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
"note_id": note_id,
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
}
# WP-26 v1.0: is_internal Flag berechnen
# Intra-Note-Edge: Source und Target gehören zur gleichen Note
source_note = source_id.split("#")[0] if "#" in source_id else source_id
target_note = target_id.split("#")[0] if "#" in target_id else target_id
pl["is_internal"] = (source_note == target_note) or (source_note == note_id and target_note == note_id)
if extra:
pl.update(extra)
# WP-26 v1.0: Provenance normalisieren, falls vorhanden
if "provenance" in extra:
internal_prov = extra["provenance"]
dto_prov, source_hint = normalize_provenance(internal_prov)
pl["provenance"] = dto_prov
if source_hint:
pl["source_hint"] = source_hint
return pl
# ---------------------------------------------------------------------------

View File

@ -3,7 +3,8 @@ FILE: app/core/ingestion/ingestion_chunk_payload.py
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
VERSION: 2.4.4 (WP-24c v4.3.0)
WP-26 v1.0: Erweiterung um effective_type (section_type || note_type) und note_type-Feld.
VERSION: 2.5.0 (WP-26 v1.0)
STATUS: Active
"""
from __future__ import annotations
@ -91,14 +92,35 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")
# WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution
candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
section_type = getattr(ch, "section_type", None) if not is_dict else ch.get("section_type")
# WP-26 v1.0: Block-ID für Intra-Note-Links
block_id = getattr(ch, "block_id", None) if not is_dict else ch.get("block_id")
# WP-26 v1.0: Effektiver Typ = section_type || note_type (FA-03)
effective_type = section_type if section_type else note_type
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ (FA-09b)
# Wenn section_type vorhanden, nutze dessen retriever_weight
effective_rw = rw
if section_type:
effective_rw = _resolve_val(section_type, reg, "retriever_weight", rw)
try:
effective_rw = float(effective_rw)
except:
effective_rw = rw
pl: Dict[str, Any] = {
"note_id": nid or fm.get("id"),
"chunk_id": cid,
"title": title,
"index": int(index),
"ord": int(index) + 1,
"type": note_type,
# WP-26 v1.0: type enthält den effektiven Typ (section_type || note_type)
"type": effective_type,
# WP-26 v1.0: note_type ist immer der ursprüngliche Note-Typ (für Filterung)
"note_type": note_type,
"tags": tags,
"text": text,
"window": window,
@ -107,9 +129,13 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
"section": section,
"path": note_path,
"source_path": kwargs.get("file_path") or note_path,
"retriever_weight": rw,
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ
"retriever_weight": effective_rw,
"chunk_profile": cp,
"candidate_pool": candidate_pool # WP-24c v4.3.0: Kritisch für Chunk-Attribution
"candidate_pool": candidate_pool, # WP-24c v4.3.0: Kritisch für Chunk-Attribution
# WP-26 v1.0: Optionale Felder für Section-Type-Tracking
"section_type": section_type, # Expliziter Section-Type (oder None)
"block_id": block_id, # Block-ID für Intra-Note-Links (oder None)
}
# Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern)

View File

@ -46,16 +46,18 @@ class EdgeDTO(BaseModel):
target: str
weight: float
direction: Literal["out", "in", "undirected"] = "out"
# WP-24c v4.5.3: Erweiterte Provenance-Werte für Chunk-Aware Edges
# Unterstützt alle tatsächlich verwendeten Provenance-Typen im System
provenance: Optional[Literal[
"explicit", "rule", "smart", "structure",
"explicit:callout", "explicit:wikilink", "explicit:note_zone", "explicit:note_scope",
"inline:rel", "callout:edge", "semantic_ai", "structure:belongs_to", "structure:order",
"derived:backlink", "edge_defaults", "global_pool"
]] = "explicit"
# WP-26 v1.0: Provenance auf valide Literale reduziert (EdgeDTO-Constraint)
# Detail-Informationen werden über source_hint transportiert
provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
# WP-26 v1.0: Neues Feld für Detail-Informationen zur Herkunft
source_hint: Optional[Literal[
"callout", "wikilink", "inline_rel", "schema_default", "note_scope",
"note_zone", "belongs_to", "order", "backlink", "edge_defaults", "global_pool"
]] = None
confidence: float = 1.0
target_section: Optional[str] = None
target_section: Optional[str] = None
# WP-26 v1.0: Flag für Intra-Note-Edges
is_internal: Optional[bool] = None
# --- Request Models ---

View File

@ -0,0 +1,284 @@
# WP-26 Manuelle Testszenarien
**Version:** 1.0
**Datum:** 25. Januar 2026
**Status:** Phase 1 Implementierung abgeschlossen
---
## 1. Überblick
Dieses Dokument beschreibt die manuellen Testszenarien für WP-26 Phase 1: Section-Types und Intra-Note-Edges.
---
## 2. Voraussetzungen
1. **Python-Umgebung** mit allen Dependencies aus `requirements.txt`
2. **Qdrant-Instanz** erreichbar (lokal oder Docker)
3. **Vault mit Test-Note** (siehe Abschnitt 3)
---
## 3. Test-Note erstellen
Erstelle eine neue Markdown-Datei im Vault mit folgendem Inhalt:
```markdown
---
id: wp26-test-experience
title: WP-26 Test Experience
type: experience
tags: [test, wp26]
---
# WP-26 Test Experience
## Situation ^sit
> [!section] experience
Am 25. Januar 2026 testete ich das neue Section-Type Feature.
Dies ist der Experience-Teil der Note.
## Meine Reaktion ^react
> [!section] experience
> [!edge] followed_by
> [[#^sit]]
Ich war zunächst skeptisch, aber die Implementierung sah solide aus.
## Reflexion ^ref
> [!section] insight
Diese Erfahrung zeigt mir, dass typ-spezifische Sektionen
die semantische Präzision des Retrievals verbessern können.
> [!abstract] Semantic Edges
>> [!edge] derives
>> [[#^sit]]
>> [[#^react]]
## Nächste Schritte ^next
> [!section] decision
Ich werde:
1. Die Tests ausführen
2. Die Ergebnisse dokumentieren
> [!edge] caused_by
> [[#^ref]]
```
---
## 4. Testszenarien
### 4.1 TS-01: Section-Type-Erkennung
**Ziel:** Prüfen, ob `[!section]`-Callouts korrekt erkannt werden.
**Schritte:**
1. Importiere die Test-Note via `scripts/import_markdown.py`
2. Prüfe die Chunks in Qdrant via API oder Debug-Skript
**Prüfkriterien:**
| Chunk | Erwarteter `type` | Erwarteter `note_type` | Erwarteter `section` |
|-------|-------------------|------------------------|----------------------|
| #c00 | experience | experience | Situation |
| #c01 | experience | experience | Meine Reaktion |
| #c02 | insight | experience | Reflexion |
| #c03 | decision | experience | Nächste Schritte |
**Prüf-Script:**
```python
# scripts/check_wp26_chunks.py
from qdrant_client import QdrantClient
client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"
# Hole alle Chunks der Note
result = client.scroll(
collection_name="mindnet_chunks",
scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
with_payload=True,
limit=100
)
for point in result[0]:
p = point.payload
print(f"Chunk: {p.get('chunk_id')}")
print(f" type: {p.get('type')}")
print(f" note_type: {p.get('note_type')}")
print(f" section: {p.get('section')}")
print(f" section_type: {p.get('section_type')}")
print(f" block_id: {p.get('block_id')}")
print()
```
---
### 4.2 TS-02: Block-ID-Erkennung
**Ziel:** Prüfen, ob Block-IDs (`^id`) aus Überschriften korrekt extrahiert werden.
**Prüfkriterien:**
| Chunk | Erwartete `block_id` |
|-------|---------------------|
| #c00 | sit |
| #c01 | react |
| #c02 | ref |
| #c03 | next |
---
### 4.3 TS-03: is_internal Flag für Edges
**Ziel:** Prüfen, ob Intra-Note-Edges das `is_internal: true` Flag erhalten.
**Schritte:**
1. Importiere die Test-Note
2. Prüfe die Edges in Qdrant
**Prüfkriterien:**
| Edge | `is_internal` |
|------|---------------|
| #c01 → #c00 (followed_by) | `true` |
| #c02 → #c00 (derives) | `true` |
| #c02 → #c01 (derives) | `true` |
| #c03 → #c02 (caused_by) | `true` |
| Alle structure edges (next/prev) | `true` |
**Prüf-Script:**
```python
# scripts/check_wp26_edges.py
from qdrant_client import QdrantClient

client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"

# Fetch the edges stored for the test note (first page, up to 100 points).
points, _next_offset = client.scroll(
    collection_name="mindnet_edges",
    scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
    with_payload=True,
    limit=100,
)

# Payload fields to report per edge, each with the placeholder printed when
# the field is absent from the payload.
detail_fields = (
    ("is_internal", "MISSING"),
    ("provenance", "?"),
    ("source_hint", "-"),
)

for point in points:
    payload = point.payload
    source = payload.get('source_id', '?')
    kind = payload.get('kind', 'unknown')
    target = payload.get('target_id', '?')
    print(f"{source} --[{kind}]--> {target}")
    for field, placeholder in detail_fields:
        print(f" {field}: {payload.get(field, placeholder)}")
    print()
```
---
### 4.4 TS-04: Provenance-Normalisierung
**Ziel:** Prüfen, ob Provenance-Werte korrekt normalisiert werden.
**Prüfkriterien:**
| Altes Provenance | Neues `provenance` | `source_hint` |
|------------------|-------------------|---------------|
| explicit:callout | explicit | callout |
| explicit:wikilink | explicit | wikilink |
| structure:belongs_to | structure | belongs_to |
| structure:order | structure | order |
| edge_defaults | rule | edge_defaults |
---
### 4.5 TS-05: Automatische Section-Erkennung
**Ziel:** Prüfen, ob neue Überschriften ohne `[!section]` automatisch neue Chunks erstellen.
**Test-Note:**
```markdown
---
id: wp26-test-auto-section
type: experience
---
# Test Auto Section
## Section A ^a
> [!section] insight
Content A (insight).
## Section B ^b
Content B (sollte experience sein - Fallback).
## Section C ^c
> [!section] decision
Content C (decision).
```
**Prüfkriterien:**
| Chunk | `type` | Grund |
|-------|--------|-------|
| Section A | insight | Explizites `[!section]` |
| Section B | experience | Fallback auf `note_type` |
| Section C | decision | Explizites `[!section]` |
---
## 5. Unit-Tests ausführen
```bash
# Im Projekt-Root (Beispielpfad – an die lokale Umgebung anpassen)
cd c:\Dev\cursor\mindnet
# Aktiviere virtuelle Umgebung (falls vorhanden)
# .venv\Scripts\activate
# Führe WP-26 Tests aus
python -m pytest tests/test_wp26_section_types.py -v
```
**Erwartetes Ergebnis:** Alle Tests grün.
---
## 6. Bekannte Einschränkungen
1. **Block-ID-Stability:** Obsidian aktualisiert Block-IDs nicht automatisch bei Umbenennung von Überschriften.
2. **Heading-Links:** Links wie `[[#Section Name]]` werden unterstützt, aber Block-References (`[[#^id]]`) werden bevorzugt.
3. **Nested Callouts:** Verschachtelte Callouts (`>> [!edge]`) werden korrekt verarbeitet.
---
## 7. Nächste Schritte (Phase 2)
Nach erfolgreicher Validierung von Phase 1:
1. **Retriever-Anpassung:** Path-Bonus für Intra-Note-Edges
2. **Graph-Exploration:** Navigation entlang `typical edges` aus `graph_schema.md`
3. **Schema-Validierung:** Agentic Validation gegen effektive Chunk-Typen
---
**Ende der Testdokumentation**

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,265 @@
"""
FILE: tests/test_wp26_section_types.py
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
VERSION: 1.0.0
"""
import pytest
from app.core.chunking.chunking_parser import parse_blocks
from app.core.chunking.chunking_models import RawBlock, Chunk
from app.core.graph.graph_utils import normalize_provenance, _edge
class TestSectionTypeRecognition:
    """UT-01: parser recognition of section types and heading block IDs."""

    def test_section_type_recognition(self):
        """The first paragraph block carries the type named by ``[!section]``."""
        md = """
## Reflexion ^ref
> [!section] insight
Content here about insights.
"""
        parsed, _ = parse_blocks(md)

        # Only paragraph blocks are relevant for this check.
        paragraphs = [blk for blk in parsed if blk.kind == "paragraph"]
        assert len(paragraphs) >= 1

        first_paragraph = paragraphs[0]
        assert first_paragraph.section_type == "insight"

    def test_section_type_with_block_id(self):
        """The ``^sit`` suffix of a heading is exposed as block_id ``sit``."""
        md = """
## Situation ^sit
> [!section] experience
Die Geschichte beginnt hier.
"""
        parsed, _ = parse_blocks(md)

        # Only heading blocks are relevant for this check.
        headings = [blk for blk in parsed if blk.kind == "heading"]
        assert len(headings) >= 1

        first_heading = headings[0]
        assert first_heading.block_id == "sit"
class TestSectionTypeScope:
    """UT-02: the scope of a section type ends at the next same-level heading."""

    def test_section_type_scope_ends_at_same_level_heading(self):
        """A section type set under one H2 does not carry over to the next H2."""
        md = """
## Section A
> [!section] insight
Content A with insight.
## Section B
Content B without section callout.
"""
        blocks, _ = parse_blocks(md)

        paragraphs = [b for b in blocks if b.kind == "paragraph"]

        # Guard before indexing, like the sibling tests do: if the parser
        # yields fewer paragraph blocks, fail with a readable assertion
        # instead of an IndexError.
        assert len(paragraphs) >= 2, (
            f"expected at least 2 paragraph blocks, got {len(paragraphs)}"
        )

        # The paragraph under "Section A" carries the explicit type.
        assert paragraphs[0].section_type == "insight"
        # The paragraph under "Section B" has no section type (reset).
        assert paragraphs[1].section_type is None
class TestProvenanceNormalization:
    """Unit tests for provenance normalization (WP-26 v1.0)."""

    def test_normalize_explicit_callout(self):
        """explicit:callout -> (explicit, callout)"""
        provenance, source_hint = normalize_provenance("explicit:callout")
        assert (provenance, source_hint) == ("explicit", "callout")

    def test_normalize_explicit_wikilink(self):
        """explicit:wikilink -> (explicit, wikilink)"""
        provenance, source_hint = normalize_provenance("explicit:wikilink")
        assert (provenance, source_hint) == ("explicit", "wikilink")

    def test_normalize_structure_belongs_to(self):
        """structure:belongs_to -> (structure, belongs_to)"""
        provenance, source_hint = normalize_provenance("structure:belongs_to")
        assert (provenance, source_hint) == ("structure", "belongs_to")

    def test_normalize_schema_default(self):
        """inferred:schema -> (rule, schema_default)"""
        provenance, source_hint = normalize_provenance("inferred:schema")
        assert (provenance, source_hint) == ("rule", "schema_default")

    def test_normalize_unknown_fallback(self):
        """An unknown provenance value -> (explicit, None)"""
        provenance, source_hint = normalize_provenance("unknown_provenance")
        assert provenance == "explicit"
        assert source_hint is None
class TestIsInternalFlag:
    """UT-13: the is_internal flag on intra-note edges."""

    def test_is_internal_true_for_same_note(self):
        """An edge between two chunks of the same note has is_internal=True."""
        params = {
            "kind": "derives",
            "scope": "chunk",
            "source_id": "note1#c01",
            "target_id": "note1#c02",
            "note_id": "note1",
        }
        payload = _edge(**params)
        assert payload["is_internal"] is True

    def test_is_internal_false_for_different_notes(self):
        """An edge between chunks of different notes has is_internal=False."""
        params = {
            "kind": "references",
            "scope": "chunk",
            "source_id": "note1#c01",
            "target_id": "note2#c01",
            "note_id": "note1",
        }
        payload = _edge(**params)
        assert payload["is_internal"] is False

    def test_is_internal_true_for_note_to_chunk(self):
        """A belongs_to edge from a chunk to its own note has is_internal=True."""
        params = {
            "kind": "belongs_to",
            "scope": "chunk",
            "source_id": "note1#c01",
            "target_id": "note1",
            "note_id": "note1",
        }
        payload = _edge(**params)
        assert payload["is_internal"] is True
class TestEdgeProvenanceInPayload:
    """Provenance normalization as seen in the edge payload."""

    def test_edge_provenance_normalized(self):
        """A raw ``explicit:callout`` is split into provenance and source_hint."""
        raw_extra = {"provenance": "explicit:callout"}
        payload = _edge(
            kind="derives",
            scope="chunk",
            source_id="note1#c01",
            target_id="note1#c02",
            note_id="note1",
            extra=raw_extra,
        )

        assert payload["provenance"] == "explicit"
        assert payload["source_hint"] == "callout"
class TestAutomaticSectionRecognition:
    """UT-09: automatic section recognition at new headings."""

    def test_automatic_section_recognition_at_same_heading_level(self):
        """Each same-level heading starts a section with its own section type."""
        md = """
## Situation ^sit
> [!section] experience
Content A.
## Reflexion ^ref
Content B.
## Learnings ^learn
> [!section] insight
Content C.
## Ausblick ^out
Content D.
"""
        parsed, _ = parse_blocks(md)

        # Section types of the paragraph blocks, in document order.
        observed = [blk.section_type for blk in parsed if blk.kind == "paragraph"]
        assert len(observed) == 4

        # Sections with a callout carry its type; sections without one are
        # None (the original test notes this as the fallback to note_type).
        assert observed == ["experience", None, "insight", None]
class TestSeparateSectionCallout:
    """UT-10: a section callout placed away from the heading."""

    def test_section_callout_separate_from_edge_callout(self):
        """A section callout may stand apart from the edge callouts."""
        md = """
## Reflexion ^ref
Einleitender Text hier...
> [!section] insight
Weiterer normaler Inhalt...
> [!edge] derives
> [[#^sit]]
"""
        parsed, _ = parse_blocks(md)

        paragraphs = [blk for blk in parsed if blk.kind == "paragraph"]
        # At least two paragraph blocks are expected for this input.
        assert len(paragraphs) >= 2

        # Which paragraph receives the type depends on the parser
        # implementation, so only require that "insight" appears on at
        # least one of them.
        assert any(blk.section_type == "insight" for blk in paragraphs)
class TestNestedEdgeCallouts:
    """UT-08: nested edge callouts inside a container callout."""

    def test_nested_callouts_recognized(self):
        """Nested callouts produce at least one block of kind ``callout``."""
        md = """
> [!abstract] Semantic Edges
>> [!edge] derived_from
>> [[Target1#Section]]
>
>> [!edge] solves
>> [[Target2]]
"""
        parsed, _ = parse_blocks(md)

        callout_count = sum(1 for blk in parsed if blk.kind == "callout")
        assert callout_count >= 1
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports failures to the calling shell or CI instead of exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))