Refactor provenance handling in EdgeDTO and graph utilities
- Updated provenance priorities and introduced a mapping from internal provenance values to EdgeDTO-compliant literals.
- Added a new function `normalize_provenance` to standardize internal provenance strings.
- Enhanced the `_edge` function to include an `is_internal` flag and provenance normalization.
- Modified the `EdgeDTO` model to include a new `source_hint` field for detailed provenance information and an `is_internal` flag for intra-note edges.
- Reduced the provenance options in `EdgeDTO` to valid literals, improving data integrity.
This commit is contained in:
parent
0d61a9e191
commit
cc258008dc
|
|
@ -1,13 +1,17 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_models.py
|
||||
DESCRIPTION: Datenklassen für das Chunking-System.
|
||||
WP-26 v1.0: Erweiterung um section_type für typ-spezifische Sektionen.
|
||||
"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Any
|
||||
|
||||
@dataclass
|
||||
class RawBlock:
|
||||
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing."""
|
||||
"""
|
||||
Repräsentiert einen logischen Block aus dem Markdown-Parsing.
|
||||
WP-26 v1.0: Erweitert um section_type für typ-spezifische Sektionen.
|
||||
"""
|
||||
kind: str
|
||||
text: str
|
||||
level: Optional[int]
|
||||
|
|
@ -15,10 +19,17 @@ class RawBlock:
|
|||
section_title: Optional[str]
|
||||
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
|
||||
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
|
||||
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||
section_type: Optional[str] = None # z.B. "insight", "decision", "experience"
|
||||
# WP-26 v1.0: Block-ID für Intra-Note-Links (z.B. "^sit" aus "## Situation ^sit")
|
||||
block_id: Optional[str] = None
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung."""
|
||||
"""
|
||||
Das finale Chunk-Objekt für Embedding und Graph-Speicherung.
|
||||
WP-26 v1.0: Erweitert um section_type für effektiven Typ.
|
||||
"""
|
||||
id: str
|
||||
note_id: str
|
||||
index: int
|
||||
|
|
@ -30,4 +41,9 @@ class Chunk:
|
|||
neighbors_prev: Optional[str]
|
||||
neighbors_next: Optional[str]
|
||||
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
||||
suggested_edges: Optional[List[str]] = None
|
||||
suggested_edges: Optional[List[str]] = None
|
||||
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||
# Wenn gesetzt, wird dieser als "effektiver Typ" verwendet statt note_type
|
||||
section_type: Optional[str] = None
|
||||
# WP-26 v1.0: Block-ID für Intra-Note-Links
|
||||
block_id: Optional[str] = None
|
||||
|
|
@ -5,16 +5,28 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
|||
Stellt die Funktion parse_edges_robust zur Verfügung.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
|
||||
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
import logging
|
||||
from typing import List, Tuple, Set, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock
|
||||
from .chunking_utils import extract_frontmatter_from_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_WS = re.compile(r'\s+')
|
||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||
|
||||
# WP-26 v1.0: Pattern für [!section]-Callouts
|
||||
# Matches: > [!section] type-name
|
||||
_SECTION_CALLOUT_PATTERN = re.compile(r'^\s*>\s*\[!section\]\s*(\w+)', re.IGNORECASE)
|
||||
|
||||
# WP-26 v1.0: Pattern für Block-IDs in Überschriften
|
||||
# Matches: ## Titel ^block-id
|
||||
_BLOCK_ID_PATTERN = re.compile(r'\^([a-zA-Z0-9_-]+)\s*$')
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
|
||||
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
|
||||
text = _WS.sub(' ', text.strip())
|
||||
|
|
@ -27,12 +39,18 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
|
||||
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
|
||||
"""
|
||||
blocks = []
|
||||
h1_title = "Dokument"
|
||||
section_path = "/"
|
||||
current_section_title = None
|
||||
|
||||
# WP-26 v1.0: State-Machine für Section-Type-Tracking
|
||||
current_section_type: Optional[str] = None # Aktueller Section-Type (oder None für note_type Fallback)
|
||||
section_introduced_at_level: Optional[int] = None # Ebene, auf der erste Section eingeführt wurde
|
||||
current_block_id: Optional[str] = None # Block-ID der aktuellen Sektion
|
||||
|
||||
# Frontmatter entfernen
|
||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||
|
||||
|
|
@ -70,8 +88,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
buffer = []
|
||||
|
||||
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
|
||||
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
|
||||
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
|
||||
# WP-26 v1.0: Erweitert um [!section]-Callouts
|
||||
# Regex für Callouts: >\s*[!edge], >\s*[!abstract], >\s*[!section] (auch mit mehreren >)
|
||||
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract|section)\]', re.IGNORECASE)
|
||||
|
||||
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
|
||||
processed_indices = set()
|
||||
|
|
@ -86,13 +105,39 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
# Prüfe, ob diese Zeile ein Callout startet
|
||||
callout_match = callout_pattern.match(line)
|
||||
if callout_match:
|
||||
callout_type = callout_match.group(1).lower() # "edge", "abstract", oder "section"
|
||||
|
||||
# WP-26 v1.0: [!section] Callout-Behandlung
|
||||
if callout_type == "section":
|
||||
# Extrahiere Section-Type aus dem Callout
|
||||
section_match = _SECTION_CALLOUT_PATTERN.match(line)
|
||||
if section_match:
|
||||
new_section_type = section_match.group(1).lower()
|
||||
current_section_type = new_section_type
|
||||
|
||||
# Tracke die Ebene, auf der die erste Section eingeführt wurde
|
||||
# Wir nehmen die Ebene der letzten Überschrift (section_path basiert)
|
||||
if section_introduced_at_level is None:
|
||||
# Bestimme Ebene aus section_path
|
||||
# "/" = H1, "/Title" = H2, "/Title/Sub" = H3, etc.
|
||||
path_depth = section_path.count('/') if section_path else 1
|
||||
section_introduced_at_level = max(1, path_depth + 1)
|
||||
|
||||
logger.debug(f"WP-26: Section-Type erkannt: '{new_section_type}' bei '{current_section_title}' (Level: {section_introduced_at_level})")
|
||||
|
||||
# [!section] Callout wird nicht als Block hinzugefügt (ist nur Metadaten)
|
||||
processed_indices.add(i)
|
||||
continue
|
||||
|
||||
# Vorherigen Text-Block abschließen
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
buffer = []
|
||||
|
||||
|
|
@ -120,7 +165,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
blocks.append(RawBlock(
|
||||
"callout", callout_content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
|
||||
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung
|
||||
is_meta_content=True, # WP-24c v4.2.6: Markierung für spätere Entfernung
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
continue
|
||||
|
||||
|
|
@ -133,13 +180,32 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
buffer = []
|
||||
|
||||
level = len(heading_match.group(1))
|
||||
title = heading_match.group(2).strip()
|
||||
|
||||
# WP-26 v1.0: Block-ID aus Überschrift extrahieren (z.B. "## Titel ^block-id")
|
||||
block_id_match = _BLOCK_ID_PATTERN.search(title)
|
||||
if block_id_match:
|
||||
current_block_id = block_id_match.group(1)
|
||||
# Entferne Block-ID aus dem Titel für saubere Anzeige
|
||||
title = _BLOCK_ID_PATTERN.sub('', title).strip()
|
||||
else:
|
||||
current_block_id = None
|
||||
|
||||
# WP-26 v1.0: Section-Type State-Machine
|
||||
# Wenn eine Section eingeführt wurde und wir auf gleicher oder höherer Ebene sind:
|
||||
# -> Automatisch neue Section erkennen (FA-02b)
|
||||
if section_introduced_at_level is not None and level <= section_introduced_at_level:
|
||||
# Neue Überschrift auf gleicher oder höherer Ebene -> Reset auf None (note_type Fallback)
|
||||
current_section_type = None
|
||||
logger.debug(f"WP-26: Neue Section erkannt bei H{level} '{title}' -> Reset auf note_type")
|
||||
|
||||
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
|
||||
is_llm_validation_zone = (
|
||||
level == llm_validation_level and
|
||||
|
|
@ -170,7 +236,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
|
||||
blocks.append(RawBlock(
|
||||
"heading", stripped, level, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
continue
|
||||
|
||||
|
|
@ -181,13 +249,17 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
buffer = []
|
||||
if stripped == "---":
|
||||
blocks.append(RawBlock(
|
||||
"separator", "---", None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
else:
|
||||
buffer.append(line)
|
||||
|
|
@ -197,7 +269,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
exclude_from_chunking=in_exclusion_zone,
|
||||
section_type=current_section_type,
|
||||
block_id=current_block_id
|
||||
))
|
||||
|
||||
return blocks, h1_title
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
|||
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
||||
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
||||
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
||||
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
|
||||
"""
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock, Chunk
|
||||
|
|
@ -36,41 +37,70 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
|
||||
chunks: List[Chunk] = []
|
||||
|
||||
def _emit(txt, title, path):
|
||||
"""Schreibt den finalen Chunk ohne Text-Modifikationen."""
|
||||
def _emit(txt, title, path, section_type=None, block_id=None):
|
||||
"""
|
||||
Schreibt den finalen Chunk ohne Text-Modifikationen.
|
||||
WP-26 v1.0: Erweitert um section_type und block_id.
|
||||
"""
|
||||
idx = len(chunks)
|
||||
win = _create_win(context_prefix, title, txt)
|
||||
chunks.append(Chunk(
|
||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None
|
||||
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None,
|
||||
section_type=section_type, block_id=block_id
|
||||
))
|
||||
|
||||
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
||||
# WP-26 v1.0: Erweitert um section_type und block_id Tracking
|
||||
sections: List[Dict[str, Any]] = []
|
||||
curr_blocks = []
|
||||
for b in blocks:
|
||||
if b.kind == "heading" and b.level <= split_level:
|
||||
if curr_blocks:
|
||||
# WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
|
||||
# Priorisiere den ersten Block mit section_type, sonst den Heading-Block
|
||||
effective_section_type = None
|
||||
effective_block_id = None
|
||||
for cb in curr_blocks:
|
||||
if cb.section_type and effective_section_type is None:
|
||||
effective_section_type = cb.section_type
|
||||
if cb.block_id and effective_block_id is None:
|
||||
effective_block_id = cb.block_id
|
||||
|
||||
sections.append({
|
||||
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||
"meta": curr_blocks[0],
|
||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
|
||||
"section_type": effective_section_type,
|
||||
"block_id": effective_block_id
|
||||
})
|
||||
curr_blocks = [b]
|
||||
else:
|
||||
curr_blocks.append(b)
|
||||
if curr_blocks:
|
||||
# WP-26 v1.0: Gleiche Logik für den letzten Block
|
||||
effective_section_type = None
|
||||
effective_block_id = None
|
||||
for cb in curr_blocks:
|
||||
if cb.section_type and effective_section_type is None:
|
||||
effective_section_type = cb.section_type
|
||||
if cb.block_id and effective_block_id is None:
|
||||
effective_block_id = cb.block_id
|
||||
|
||||
sections.append({
|
||||
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||
"meta": curr_blocks[0],
|
||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
|
||||
"section_type": effective_section_type,
|
||||
"block_id": effective_block_id
|
||||
})
|
||||
|
||||
# --- SCHRITT 2: Verarbeitung der Queue ---
|
||||
queue = list(sections)
|
||||
current_chunk_text = ""
|
||||
current_meta = {"title": None, "path": "/"}
|
||||
# WP-26 v1.0: Erweitert um section_type und block_id
|
||||
current_meta = {"title": None, "path": "/", "section_type": None, "block_id": None}
|
||||
|
||||
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
|
||||
is_hard_split_mode = (not smart_edge) or (strict)
|
||||
|
|
@ -83,6 +113,9 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
if not current_chunk_text:
|
||||
current_meta["title"] = item["meta"].section_title
|
||||
current_meta["path"] = item["meta"].section_path
|
||||
# WP-26 v1.0: section_type und block_id aus Item übernehmen
|
||||
current_meta["section_type"] = item.get("section_type")
|
||||
current_meta["block_id"] = item.get("block_id")
|
||||
|
||||
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
|
||||
if is_hard_split_mode:
|
||||
|
|
@ -90,18 +123,23 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften
|
||||
if current_chunk_text:
|
||||
# Flashe vorherigen Chunk
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
current_chunk_text = ""
|
||||
|
||||
# Neue Sektion: Initialisiere Meta
|
||||
current_meta["title"] = item["meta"].section_title
|
||||
current_meta["path"] = item["meta"].section_path
|
||||
# WP-26 v1.0: section_type und block_id aus Item übernehmen
|
||||
current_meta["section_type"] = item.get("section_type")
|
||||
current_meta["block_id"] = item.get("block_id")
|
||||
|
||||
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
|
||||
# (nur Überschrift, kein Inhalt)
|
||||
if item.get("is_empty", False):
|
||||
# Leere Sektion: Nur Überschrift als Chunk
|
||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
||||
_emit(item_text, current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
else:
|
||||
# Normale Sektion: Prüfe auf Token-Limit
|
||||
if estimate_tokens(item_text) > max_tokens:
|
||||
|
|
@ -113,16 +151,19 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
while sents:
|
||||
s = sents.pop(0); slen = estimate_tokens(s)
|
||||
if take_len + slen > target and take_sents:
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
take_sents = [s]; take_len = slen
|
||||
else:
|
||||
take_sents.append(s); take_len += slen
|
||||
|
||||
if take_sents:
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
else:
|
||||
# Sektion passt: Direkt als Chunk
|
||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
||||
_emit(item_text, current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
|
||||
current_chunk_text = ""
|
||||
continue
|
||||
|
|
@ -137,7 +178,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
else:
|
||||
if current_chunk_text:
|
||||
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
current_chunk_text = ""
|
||||
queue.insert(0, item)
|
||||
else:
|
||||
|
|
@ -152,7 +194,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
sents.insert(0, s); break
|
||||
take_sents.append(s); take_len += slen
|
||||
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
|
||||
if sents:
|
||||
remainder = " ".join(sents)
|
||||
|
|
@ -160,15 +203,21 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
if header_prefix and not remainder.startswith(header_prefix):
|
||||
remainder = header_prefix + "\n\n" + remainder
|
||||
# Carry-Over: Rest wird vorne in die Queue geschoben
|
||||
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})
|
||||
# WP-26 v1.0: section_type und block_id weitergeben
|
||||
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True,
|
||||
"section_type": item.get("section_type"), "block_id": item.get("block_id")})
|
||||
|
||||
if current_chunk_text:
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||
current_meta["section_type"], current_meta["block_id"])
|
||||
|
||||
return chunks
|
||||
|
||||
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus."""
|
||||
"""
|
||||
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
|
||||
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
|
||||
"""
|
||||
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
||||
|
||||
|
|
@ -178,13 +227,31 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
|||
if curr_tokens + b_tokens > max_tokens and buf:
|
||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=curr_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||
# WP-26 v1.0: Finde effektiven section_type und block_id
|
||||
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
||||
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
||||
chunks.append(Chunk(
|
||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||
text=txt, window=win, token_count=curr_tokens,
|
||||
section_title=buf[0].section_title, section_path=buf[0].section_path,
|
||||
neighbors_prev=None, neighbors_next=None,
|
||||
section_type=effective_section_type, block_id=effective_block_id
|
||||
))
|
||||
buf = []
|
||||
buf.append(b)
|
||||
|
||||
if buf:
|
||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||
# WP-26 v1.0: Finde effektiven section_type und block_id
|
||||
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
||||
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
||||
chunks.append(Chunk(
|
||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||
section_title=buf[0].section_title, section_path=buf[0].section_path,
|
||||
neighbors_prev=None, neighbors_next=None,
|
||||
section_type=effective_section_type, block_id=effective_block_id
|
||||
))
|
||||
|
||||
return chunks
|
||||
|
|
@ -12,28 +12,85 @@ STATUS: Active
|
|||
import os
|
||||
import uuid
|
||||
import hashlib
|
||||
from typing import Iterable, List, Optional, Set, Any, Tuple
|
||||
from typing import Dict, Iterable, List, Optional, Set, Any, Tuple
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
|
||||
# WP-26 v1.0: Provenance-Literale auf valide EdgeDTO-Werte reduziert
|
||||
# Legacy-Prioritäten für interne Verarbeitung (werden zu source_hint gemappt)
|
||||
PROVENANCE_PRIORITY = {
|
||||
# Explizite Kanten (provenance: "explicit")
|
||||
"explicit:wikilink": 1.00,
|
||||
"inline:rel": 0.95,
|
||||
"callout:edge": 0.90,
|
||||
"explicit:callout": 0.90, # WP-24c v4.2.7: Callout-Kanten aus candidate_pool
|
||||
"semantic_ai": 0.90, # Validierte KI-Kanten
|
||||
"structure:belongs_to": 1.00,
|
||||
"structure:order": 0.95, # next/prev
|
||||
"explicit:callout": 0.90,
|
||||
"explicit:note_scope": 1.00,
|
||||
"explicit:note_zone": 1.00, # WP-24c v4.2.0: Note-Scope Zonen (höchste Priorität)
|
||||
"explicit:note_zone": 1.00,
|
||||
# Regel-basierte Kanten (provenance: "rule")
|
||||
"derived:backlink": 0.90,
|
||||
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
||||
"edge_defaults": 0.70,
|
||||
"schema_default": 0.85,
|
||||
# Struktur-Kanten (provenance: "structure")
|
||||
"structure:belongs_to": 1.00,
|
||||
"structure:order": 0.95,
|
||||
# KI-generierte Kanten (provenance: "smart")
|
||||
"semantic_ai": 0.90,
|
||||
"global_pool": 0.80,
|
||||
}
|
||||
|
||||
# WP-26 v1.0: Mapping von internen Provenance-Werten zu EdgeDTO-konformen Literalen
|
||||
PROVENANCE_TO_DTO = {
    # explicit
    "explicit:wikilink": ("explicit", "wikilink"),
    "explicit:callout": ("explicit", "callout"),
    "explicit:note_scope": ("explicit", "note_scope"),
    "explicit:note_zone": ("explicit", "note_zone"),
    "inline:rel": ("explicit", "inline_rel"),
    "callout:edge": ("explicit", "callout"),
    "explicit": ("explicit", None),
    # rule
    "derived:backlink": ("rule", "backlink"),
    "edge_defaults": ("rule", "edge_defaults"),
    "schema_default": ("rule", "schema_default"),
    "inferred:schema": ("rule", "schema_default"),
    "rule": ("rule", None),
    # structure
    "structure:belongs_to": ("structure", "belongs_to"),
    "structure:order": ("structure", "order"),
    "structure": ("structure", None),
    # smart
    "semantic_ai": ("smart", None),
    "global_pool": ("smart", "global_pool"),
    "smart": ("smart", None),
}

# Detail values the EdgeDTO `source_hint` field accepts. A hint outside this
# set would be rejected by the DTO, so the fallback path below drops it.
_VALID_SOURCE_HINTS = frozenset({
    "callout", "wikilink", "inline_rel", "schema_default", "note_scope",
    "note_zone", "belongs_to", "order", "backlink", "edge_defaults", "global_pool",
})

# Prefix -> DTO provenance literal, used only when the exact value is not in
# PROVENANCE_TO_DTO. The prefixes are mutually exclusive, so order is irrelevant.
_PROVENANCE_PREFIXES = (
    ("explicit", "explicit"),
    ("structure", "structure"),
    ("rule", "rule"),
    ("derived", "rule"),
    ("smart", "smart"),
)


def normalize_provenance(internal_provenance: str) -> Tuple[str, Optional[str]]:
    """
    WP-26 v1.0: Normalize an internal provenance value to EdgeDTO-compliant literals.

    Exact matches are resolved through PROVENANCE_TO_DTO. Unknown values fall
    back to prefix matching; the text after the last ":" is used as the
    source hint only if it is one of the hints in _VALID_SOURCE_HINTS.

    Args:
        internal_provenance: Internal provenance string (e.g. "explicit:callout").
            Empty or non-string values are treated as unknown.

    Returns:
        Tuple (provenance, source_hint). provenance is one of "explicit",
        "rule", "structure", "smart"; source_hint is a member of
        _VALID_SOURCE_HINTS or None. Unknown input yields ("explicit", None).
    """
    # Callers may pass whatever sits in an edge payload; never crash on it.
    if not isinstance(internal_provenance, str) or not internal_provenance:
        return ("explicit", None)

    mapped = PROVENANCE_TO_DTO.get(internal_provenance)
    if mapped is not None:
        return mapped

    # Fallback: prefix matching for values that are not in the mapping table.
    for prefix, dto_provenance in _PROVENANCE_PREFIXES:
        if internal_provenance.startswith(prefix):
            hint = internal_provenance.split(":")[-1] if ":" in internal_provenance else None
            # Only pass through hints the DTO can validate.
            return (dto_provenance, hint if hint in _VALID_SOURCE_HINTS else None)

    # Default: explicit without a source hint.
    return ("explicit", None)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -123,7 +180,15 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[
|
|||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||
"""
|
||||
Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
|
||||
Wird von graph_derive_edges.py benötigt.
|
||||
WP-26 v1.0: Erweitert um is_internal Flag und Provenance-Normalisierung.
|
||||
|
||||
Args:
|
||||
kind: Kantentyp (z.B. "derives", "caused_by")
|
||||
scope: Granularität ("chunk" oder "note")
|
||||
source_id: ID der Quelle (Chunk oder Note)
|
||||
target_id: ID des Ziels (Chunk oder Note)
|
||||
note_id: ID der Note (für Kontext)
|
||||
extra: Zusätzliche Payload-Felder
|
||||
"""
|
||||
pl = {
|
||||
"kind": kind,
|
||||
|
|
@ -134,8 +199,24 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
|
|||
"note_id": note_id,
|
||||
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
|
||||
}
|
||||
|
||||
# WP-26 v1.0: is_internal Flag berechnen
|
||||
# Intra-Note-Edge: Source und Target gehören zur gleichen Note
|
||||
source_note = source_id.split("#")[0] if "#" in source_id else source_id
|
||||
target_note = target_id.split("#")[0] if "#" in target_id else target_id
|
||||
pl["is_internal"] = (source_note == target_note) or (source_note == note_id and target_note == note_id)
|
||||
|
||||
if extra:
|
||||
pl.update(extra)
|
||||
|
||||
# WP-26 v1.0: Provenance normalisieren, falls vorhanden
|
||||
if "provenance" in extra:
|
||||
internal_prov = extra["provenance"]
|
||||
dto_prov, source_hint = normalize_provenance(internal_prov)
|
||||
pl["provenance"] = dto_prov
|
||||
if source_hint:
|
||||
pl["source_hint"] = source_hint
|
||||
|
||||
return pl
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -3,7 +3,8 @@ FILE: app/core/ingestion/ingestion_chunk_payload.py
|
|||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
||||
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
|
||||
VERSION: 2.4.4 (WP-24c v4.3.0)
|
||||
WP-26 v1.0: Erweiterung um effective_type (section_type || note_type) und note_type-Feld.
|
||||
VERSION: 2.5.0 (WP-26 v1.0)
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
|
@ -91,14 +92,35 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
|
|||
section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")
|
||||
# WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution
|
||||
candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])
|
||||
|
||||
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||
section_type = getattr(ch, "section_type", None) if not is_dict else ch.get("section_type")
|
||||
# WP-26 v1.0: Block-ID für Intra-Note-Links
|
||||
block_id = getattr(ch, "block_id", None) if not is_dict else ch.get("block_id")
|
||||
|
||||
# WP-26 v1.0: Effektiver Typ = section_type || note_type (FA-03)
|
||||
effective_type = section_type if section_type else note_type
|
||||
|
||||
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ (FA-09b)
|
||||
# Wenn section_type vorhanden, nutze dessen retriever_weight
|
||||
effective_rw = rw
|
||||
if section_type:
|
||||
effective_rw = _resolve_val(section_type, reg, "retriever_weight", rw)
|
||||
try:
|
||||
effective_rw = float(effective_rw)
|
||||
except:
|
||||
effective_rw = rw
|
||||
|
||||
pl: Dict[str, Any] = {
|
||||
"note_id": nid or fm.get("id"),
|
||||
"chunk_id": cid,
|
||||
"title": title,
|
||||
"index": int(index),
|
||||
"ord": int(index) + 1,
|
||||
"type": note_type,
|
||||
# WP-26 v1.0: type enthält den effektiven Typ (section_type || note_type)
|
||||
"type": effective_type,
|
||||
# WP-26 v1.0: note_type ist immer der ursprüngliche Note-Typ (für Filterung)
|
||||
"note_type": note_type,
|
||||
"tags": tags,
|
||||
"text": text,
|
||||
"window": window,
|
||||
|
|
@ -107,9 +129,13 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
|
|||
"section": section,
|
||||
"path": note_path,
|
||||
"source_path": kwargs.get("file_path") or note_path,
|
||||
"retriever_weight": rw,
|
||||
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ
|
||||
"retriever_weight": effective_rw,
|
||||
"chunk_profile": cp,
|
||||
"candidate_pool": candidate_pool # WP-24c v4.3.0: Kritisch für Chunk-Attribution
|
||||
"candidate_pool": candidate_pool, # WP-24c v4.3.0: Kritisch für Chunk-Attribution
|
||||
# WP-26 v1.0: Optionale Felder für Section-Type-Tracking
|
||||
"section_type": section_type, # Expliziter Section-Type (oder None)
|
||||
"block_id": block_id, # Block-ID für Intra-Note-Links (oder None)
|
||||
}
|
||||
|
||||
# Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern)
|
||||
|
|
|
|||
|
|
@ -46,16 +46,18 @@ class EdgeDTO(BaseModel):
|
|||
target: str
|
||||
weight: float
|
||||
direction: Literal["out", "in", "undirected"] = "out"
|
||||
# WP-24c v4.5.3: Erweiterte Provenance-Werte für Chunk-Aware Edges
|
||||
# Unterstützt alle tatsächlich verwendeten Provenance-Typen im System
|
||||
provenance: Optional[Literal[
|
||||
"explicit", "rule", "smart", "structure",
|
||||
"explicit:callout", "explicit:wikilink", "explicit:note_zone", "explicit:note_scope",
|
||||
"inline:rel", "callout:edge", "semantic_ai", "structure:belongs_to", "structure:order",
|
||||
"derived:backlink", "edge_defaults", "global_pool"
|
||||
]] = "explicit"
|
||||
# WP-26 v1.0: Provenance auf valide Literale reduziert (EdgeDTO-Constraint)
|
||||
# Detail-Informationen werden über source_hint transportiert
|
||||
provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
|
||||
# WP-26 v1.0: Neues Feld für Detail-Informationen zur Herkunft
|
||||
source_hint: Optional[Literal[
|
||||
"callout", "wikilink", "inline_rel", "schema_default", "note_scope",
|
||||
"note_zone", "belongs_to", "order", "backlink", "edge_defaults", "global_pool"
|
||||
]] = None
|
||||
confidence: float = 1.0
|
||||
target_section: Optional[str] = None
|
||||
target_section: Optional[str] = None
|
||||
# WP-26 v1.0: Flag für Intra-Note-Edges
|
||||
is_internal: Optional[bool] = None
|
||||
|
||||
|
||||
# --- Request Models ---
|
||||
|
|
|
|||
284
docs/05_Development/05_WP26_Manual_Testing.md
Normal file
284
docs/05_Development/05_WP26_Manual_Testing.md
Normal file
|
|
@ -0,0 +1,284 @@
|
|||
# WP-26 Manuelle Testszenarien
|
||||
|
||||
**Version:** 1.0
|
||||
**Datum:** 25. Januar 2026
|
||||
**Status:** Phase 1 Implementierung abgeschlossen
|
||||
|
||||
---
|
||||
|
||||
## 1. Überblick
|
||||
|
||||
Dieses Dokument beschreibt die manuellen Testszenarien für WP-26 Phase 1: Section-Types und Intra-Note-Edges.
|
||||
|
||||
---
|
||||
|
||||
## 2. Voraussetzungen
|
||||
|
||||
1. **Python-Umgebung** mit allen Dependencies aus `requirements.txt`
|
||||
2. **Qdrant-Instanz** erreichbar (lokal oder Docker)
|
||||
3. **Vault mit Test-Note** (siehe Abschnitt 3)
|
||||
|
||||
---
|
||||
|
||||
## 3. Test-Note erstellen
|
||||
|
||||
Erstelle eine neue Markdown-Datei im Vault mit folgendem Inhalt:
|
||||
|
||||
```markdown
|
||||
---
|
||||
id: wp26-test-experience
|
||||
title: WP-26 Test Experience
|
||||
type: experience
|
||||
tags: [test, wp26]
|
||||
---
|
||||
|
||||
# WP-26 Test Experience
|
||||
|
||||
## Situation ^sit
|
||||
> [!section] experience
|
||||
|
||||
Am 25. Januar 2026 testete ich das neue Section-Type Feature.
|
||||
Dies ist der Experience-Teil der Note.
|
||||
|
||||
## Meine Reaktion ^react
|
||||
> [!section] experience
|
||||
|
||||
> [!edge] followed_by
|
||||
> [[#^sit]]
|
||||
|
||||
Ich war zunächst skeptisch, aber die Implementierung sah solide aus.
|
||||
|
||||
## Reflexion ^ref
|
||||
> [!section] insight
|
||||
|
||||
Diese Erfahrung zeigt mir, dass typ-spezifische Sektionen
|
||||
die semantische Präzision des Retrievals verbessern können.
|
||||
|
||||
> [!abstract] Semantic Edges
|
||||
>> [!edge] derives
|
||||
>> [[#^sit]]
|
||||
>> [[#^react]]
|
||||
|
||||
## Nächste Schritte ^next
|
||||
> [!section] decision
|
||||
|
||||
Ich werde:
|
||||
1. Die Tests ausführen
|
||||
2. Die Ergebnisse dokumentieren
|
||||
|
||||
> [!edge] caused_by
|
||||
> [[#^ref]]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Testszenarien
|
||||
|
||||
### 4.1 TS-01: Section-Type-Erkennung
|
||||
|
||||
**Ziel:** Prüfen, ob `[!section]`-Callouts korrekt erkannt werden.
|
||||
|
||||
**Schritte:**
|
||||
|
||||
1. Importiere die Test-Note via `scripts/import_markdown.py`
|
||||
2. Prüfe die Chunks in Qdrant via API oder Debug-Skript
|
||||
|
||||
**Prüfkriterien:**
|
||||
|
||||
| Chunk | Erwarteter `type` | Erwarteter `note_type` | Erwarteter `section` |
|
||||
|-------|-------------------|------------------------|----------------------|
|
||||
| #c00 | experience | experience | Situation |
|
||||
| #c01 | experience | experience | Meine Reaktion |
|
||||
| #c02 | insight | experience | Reflexion |
|
||||
| #c03 | decision | experience | Nächste Schritte |
|
||||
|
||||
**Prüf-Script:**
|
||||
|
||||
```python
|
||||
# scripts/check_wp26_chunks.py
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# Connect to the Qdrant instance that received the imported test note.
client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"

# Select every chunk whose payload points at the test note (single page, max. 100).
chunk_filter = {"must": [{"key": "note_id", "match": {"value": note_id}}]}
chunk_points, _next_offset = client.scroll(
    collection_name="mindnet_chunks",
    scroll_filter=chunk_filter,
    with_payload=True,
    limit=100,
)

# Dump the payload fields that the WP-26 checks look at, one chunk per paragraph.
for point in chunk_points:
    payload = point.payload
    print(f"Chunk: {payload.get('chunk_id')}")
    for field in ('type', 'note_type', 'section', 'section_type', 'block_id'):
        print(f" {field}: {payload.get(field)}")
    print()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.2 TS-02: Block-ID-Erkennung
|
||||
|
||||
**Ziel:** Prüfen, ob Block-IDs (`^id`) aus Überschriften korrekt extrahiert werden.
|
||||
|
||||
**Prüfkriterien:**
|
||||
|
||||
| Chunk | Erwartete `block_id` |
|
||||
|-------|---------------------|
|
||||
| #c00 | sit |
|
||||
| #c01 | react |
|
||||
| #c02 | ref |
|
||||
| #c03 | next |
|
||||
|
||||
---
|
||||
|
||||
### 4.3 TS-03: is_internal Flag für Edges
|
||||
|
||||
**Ziel:** Prüfen, ob Intra-Note-Edges das `is_internal: true` Flag erhalten.
|
||||
|
||||
**Schritte:**
|
||||
|
||||
1. Importiere die Test-Note
|
||||
2. Prüfe die Edges in Qdrant
|
||||
|
||||
**Prüfkriterien:**
|
||||
|
||||
| Edge | `is_internal` |
|
||||
|------|---------------|
|
||||
| #c01 → #c00 (followed_by) | `true` |
|
||||
| #c02 → #c00 (derives) | `true` |
|
||||
| #c02 → #c01 (derives) | `true` |
|
||||
| #c03 → #c02 (caused_by) | `true` |
|
||||
| Alle structure edges (next/prev) | `true` |
|
||||
|
||||
**Prüf-Script:**
|
||||
|
||||
```python
|
||||
# scripts/check_wp26_edges.py
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# Connect to the Qdrant instance that received the imported test note.
client = QdrantClient("http://localhost:6333")
note_id = "wp26-test-experience"

# Select every edge whose payload points at the test note (single page, max. 100).
edge_filter = {"must": [{"key": "note_id", "match": {"value": note_id}}]}
edge_points, _next_offset = client.scroll(
    collection_name="mindnet_edges",
    scroll_filter=edge_filter,
    with_payload=True,
    limit=100,
)

# Print each edge as "source --[kind]--> target" followed by the WP-26 fields;
# the fallback values make absent payload keys visible in the output.
for point in edge_points:
    payload = point.payload
    edge_kind = payload.get('kind', 'unknown')
    src = payload.get('source_id', '?')
    dst = payload.get('target_id', '?')

    print(f"{src} --[{edge_kind}]--> {dst}")
    for field, fallback in (
        ('is_internal', 'MISSING'),
        ('provenance', '?'),
        ('source_hint', '-'),
    ):
        print(f" {field}: {payload.get(field, fallback)}")
    print()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.4 TS-04: Provenance-Normalisierung
|
||||
|
||||
**Ziel:** Prüfen, ob Provenance-Werte korrekt normalisiert werden.
|
||||
|
||||
**Prüfkriterien:**
|
||||
|
||||
| Altes Provenance | Neues `provenance` | `source_hint` |
|
||||
|------------------|-------------------|---------------|
|
||||
| explicit:callout | explicit | callout |
|
||||
| explicit:wikilink | explicit | wikilink |
|
||||
| structure:belongs_to | structure | belongs_to |
|
||||
| structure:order | structure | order |
|
||||
| edge_defaults | rule | edge_defaults |
|
||||
|
||||
---
|
||||
|
||||
### 4.5 TS-05: Automatische Section-Erkennung
|
||||
|
||||
**Ziel:** Prüfen, ob neue Überschriften ohne `[!section]` automatisch neue Chunks erstellen.
|
||||
|
||||
**Test-Note:**
|
||||
|
||||
```markdown
|
||||
---
|
||||
id: wp26-test-auto-section
|
||||
type: experience
|
||||
---
|
||||
|
||||
# Test Auto Section
|
||||
|
||||
## Section A ^a
|
||||
> [!section] insight
|
||||
|
||||
Content A (insight).
|
||||
|
||||
## Section B ^b
|
||||
|
||||
Content B (sollte experience sein - Fallback).
|
||||
|
||||
## Section C ^c
|
||||
> [!section] decision
|
||||
|
||||
Content C (decision).
|
||||
```
|
||||
|
||||
**Prüfkriterien:**
|
||||
|
||||
| Chunk | `type` | Grund |
|
||||
|-------|--------|-------|
|
||||
| Section A | insight | Explizites `[!section]` |
|
||||
| Section B | experience | Fallback auf `note_type` |
|
||||
| Section C | decision | Explizites `[!section]` |
|
||||
|
||||
---
|
||||
|
||||
## 5. Unit-Tests ausführen
|
||||
|
||||
```bash
|
||||
# Im Projekt-Root
|
||||
cd c:/Dev/cursor/mindnet  # Pfad an die lokale Umgebung anpassen
|
||||
|
||||
# Aktiviere virtuelle Umgebung (falls vorhanden)
|
||||
# .venv\Scripts\activate
|
||||
|
||||
# Führe WP-26 Tests aus
|
||||
python -m pytest tests/test_wp26_section_types.py -v
|
||||
```
|
||||
|
||||
**Erwartetes Ergebnis:** Alle Tests grün.
|
||||
|
||||
---
|
||||
|
||||
## 6. Bekannte Einschränkungen
|
||||
|
||||
1. **Block-ID-Stability:** Obsidian aktualisiert Block-IDs nicht automatisch bei Umbenennung von Überschriften.
|
||||
2. **Heading-Links:** Links wie `[[#Section Name]]` werden unterstützt, aber Block-References (`[[#^id]]`) werden bevorzugt.
|
||||
3. **Nested Callouts:** Verschachtelte Callouts (`>> [!edge]`) werden korrekt verarbeitet.
|
||||
|
||||
---
|
||||
|
||||
## 7. Nächste Schritte (Phase 2)
|
||||
|
||||
Nach erfolgreicher Validierung von Phase 1:
|
||||
|
||||
1. **Retriever-Anpassung:** Path-Bonus für Intra-Note-Edges
|
||||
2. **Graph-Exploration:** Navigation entlang `typical edges` aus `graph_schema.md`
|
||||
3. **Schema-Validierung:** Agentic Validation gegen effektive Chunk-Typen
|
||||
|
||||
---
|
||||
|
||||
**Ende der Testdokumentation**
|
||||
1470
docs/06_Roadmap/06_LH_Section_Types_Intra_Note_Edges.md
Normal file
1470
docs/06_Roadmap/06_LH_Section_Types_Intra_Note_Edges.md
Normal file
File diff suppressed because it is too large
Load Diff
265
tests/test_wp26_section_types.py
Normal file
265
tests/test_wp26_section_types.py
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
"""
|
||||
FILE: tests/test_wp26_section_types.py
|
||||
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
|
||||
VERSION: 1.0.0
|
||||
"""
|
||||
import pytest
|
||||
from app.core.chunking.chunking_parser import parse_blocks
|
||||
from app.core.chunking.chunking_models import RawBlock, Chunk
|
||||
from app.core.graph.graph_utils import normalize_provenance, _edge
|
||||
|
||||
|
||||
class TestSectionTypeRecognition:
    """UT-01: parser recognition of section types and heading block IDs."""

    def test_section_type_recognition(self):
        """A paragraph below a ``[!section]`` callout reports that section type."""
        markdown = (
            "\n"
            "## Reflexion ^ref\n"
            "> [!section] insight\n"
            "\n"
            "Content here about insights.\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        # Only paragraph blocks are of interest for the section-type check.
        body_blocks = [blk for blk in parsed_blocks if blk.kind == "paragraph"]
        assert len(body_blocks) >= 1

        # The first paragraph block must carry the type named in the callout.
        first_paragraph = body_blocks[0]
        assert first_paragraph.section_type == "insight"

    def test_section_type_with_block_id(self):
        """The ``^sit`` suffix of a heading is exposed as block ID ``sit``."""
        markdown = (
            "\n"
            "## Situation ^sit\n"
            "> [!section] experience\n"
            "\n"
            "Die Geschichte beginnt hier.\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        # Look at heading blocks only; the first one is checked for its block ID.
        headings = [blk for blk in parsed_blocks if blk.kind == "heading"]
        assert len(headings) >= 1

        first_heading = headings[0]
        assert first_heading.block_id == "sit"
|
||||
|
||||
|
||||
class TestSectionTypeScope:
    """UT-02: where the scope of a section type ends."""

    def test_section_type_scope_ends_at_same_level_heading(self):
        """A section type does not leak past the next H2 heading."""
        markdown = (
            "\n"
            "## Section A\n"
            "> [!section] insight\n"
            "\n"
            "Content A with insight.\n"
            "\n"
            "## Section B\n"
            "\n"
            "Content B without section callout.\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        body_blocks = [blk for blk in parsed_blocks if blk.kind == "paragraph"]
        under_section_a = body_blocks[0]
        under_section_b = body_blocks[1]

        # Paragraph under "Section A" inherits the explicit callout type ...
        assert under_section_a.section_type == "insight"

        # ... while the paragraph under "Section B" has been reset to None.
        assert under_section_b.section_type is None
|
||||
|
||||
|
||||
class TestProvenanceNormalization:
    """Unit tests for provenance normalisation (WP-26 v1.0)."""

    def test_normalize_explicit_callout(self):
        """``explicit:callout`` maps to provenance ``explicit`` with hint ``callout``."""
        normalized, source_hint = normalize_provenance("explicit:callout")
        assert normalized == "explicit"
        assert source_hint == "callout"

    def test_normalize_explicit_wikilink(self):
        """``explicit:wikilink`` maps to provenance ``explicit`` with hint ``wikilink``."""
        normalized, source_hint = normalize_provenance("explicit:wikilink")
        assert normalized == "explicit"
        assert source_hint == "wikilink"

    def test_normalize_structure_belongs_to(self):
        """``structure:belongs_to`` maps to ``structure`` with hint ``belongs_to``."""
        normalized, source_hint = normalize_provenance("structure:belongs_to")
        assert normalized == "structure"
        assert source_hint == "belongs_to"

    def test_normalize_schema_default(self):
        """``inferred:schema`` maps to provenance ``rule`` with hint ``schema_default``."""
        normalized, source_hint = normalize_provenance("inferred:schema")
        assert normalized == "rule"
        assert source_hint == "schema_default"

    def test_normalize_unknown_fallback(self):
        """An unrecognised provenance falls back to ``explicit`` without a hint."""
        normalized, source_hint = normalize_provenance("unknown_provenance")
        assert normalized == "explicit"
        assert source_hint is None
|
||||
|
||||
|
||||
class TestIsInternalFlag:
    """UT-13: the ``is_internal`` flag on edge payloads."""

    def test_is_internal_true_for_same_note(self):
        """An edge between two chunks of the same note is flagged internal."""
        payload = _edge(
            note_id="note1",
            source_id="note1#c01",
            target_id="note1#c02",
            kind="derives",
            scope="chunk",
        )
        assert payload["is_internal"] is True

    def test_is_internal_false_for_different_notes(self):
        """An edge whose target chunk lives in another note is not internal."""
        payload = _edge(
            note_id="note1",
            source_id="note1#c01",
            target_id="note2#c01",
            kind="references",
            scope="chunk",
        )
        assert payload["is_internal"] is False

    def test_is_internal_true_for_note_to_chunk(self):
        """An edge from a chunk (``note1#c01``) to its own note (``note1``) is internal."""
        payload = _edge(
            note_id="note1",
            source_id="note1#c01",
            target_id="note1",
            kind="belongs_to",
            scope="chunk",
        )
        assert payload["is_internal"] is True
|
||||
|
||||
|
||||
class TestEdgeProvenanceInPayload:
    """Provenance normalisation as applied to edge payloads."""

    def test_edge_provenance_normalized(self):
        """A detailed provenance passed via ``extra`` is split into provenance + hint."""
        raw_extra = {"provenance": "explicit:callout"}
        payload = _edge(
            note_id="note1",
            source_id="note1#c01",
            target_id="note1#c02",
            kind="derives",
            scope="chunk",
            extra=raw_extra,
        )

        # The payload exposes the coarse literal and keeps the detail as source_hint.
        assert payload["provenance"] == "explicit"
        assert payload["source_hint"] == "callout"
|
||||
|
||||
|
||||
class TestAutomaticSectionRecognition:
    """UT-09: every new heading starts a new section automatically."""

    def test_automatic_section_recognition_at_same_heading_level(self):
        """Headings on the same level open new sections, with or without a callout."""
        markdown = (
            "\n"
            "## Situation ^sit\n"
            "> [!section] experience\n"
            "\n"
            "Content A.\n"
            "\n"
            "## Reflexion ^ref\n"
            "\n"
            "Content B.\n"
            "\n"
            "## Learnings ^learn\n"
            "> [!section] insight\n"
            "\n"
            "Content C.\n"
            "\n"
            "## Ausblick ^out\n"
            "\n"
            "Content D.\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        # Paragraph blocks in document order, one per section of the input.
        body_blocks = [blk for blk in parsed_blocks if blk.kind == "paragraph"]
        assert len(body_blocks) == 4
        para_a, para_b, para_c, para_d = body_blocks

        # Section 1: explicit callout -> "experience".
        assert para_a.section_type == "experience"

        # Section 2: no callout -> None (consumers fall back to the note type).
        assert para_b.section_type is None

        # Section 3: explicit callout -> "insight".
        assert para_c.section_type == "insight"

        # Section 4: no callout -> None again.
        assert para_d.section_type is None
|
||||
|
||||
|
||||
class TestSeparateSectionCallout:
    """UT-10: a section callout placed away from the heading."""

    def test_section_callout_separate_from_edge_callout(self):
        """A ``[!section]`` callout may stand apart from ``[!edge]`` callouts."""
        markdown = (
            "\n"
            "## Reflexion ^ref\n"
            "\n"
            "Einleitender Text hier...\n"
            "\n"
            "> [!section] insight\n"
            "\n"
            "Weiterer normaler Inhalt...\n"
            "\n"
            "> [!edge] derives\n"
            "> [[#^sit]]\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        body_blocks = [blk for blk in parsed_blocks if blk.kind == "paragraph"]

        # The input has text both before and after the section callout.
        assert len(body_blocks) >= 2

        # Which paragraph receives the type depends on the parser implementation,
        # so the test only requires that "insight" shows up on some paragraph.
        seen_types = [blk.section_type for blk in body_blocks]
        assert "insight" in seen_types
|
||||
|
||||
|
||||
class TestNestedEdgeCallouts:
    """UT-08: edge callouts nested inside a container callout."""

    def test_nested_callouts_recognized(self):
        """Nested callouts are reported as callout blocks by the parser."""
        markdown = (
            "\n"
            "> [!abstract] Semantic Edges\n"
            ">> [!edge] derived_from\n"
            ">> [[Target1#Section]]\n"
            ">\n"
            ">> [!edge] solves\n"
            ">> [[Target2]]\n"
        )
        parsed_blocks, _ = parse_blocks(markdown)

        # At least one block of kind "callout" has to come out of the container.
        callout_blocks = [blk for blk in parsed_blocks if blk.kind == "callout"]
        assert len(callout_blocks) >= 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main() returns the exit code instead of terminating the process.
    # Propagate it so that running this file directly ends with a non-zero
    # status when a test fails (previously the result was silently discarded).
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Loading…
Reference in New Issue
Block a user