Refactor provenance handling in EdgeDTO and graph utilities
- Updated provenance priorities and introduced a mapping from internal provenance values to EdgeDTO-compliant literals.
- Added a new function `normalize_provenance` to standardize internal provenance strings.
- Enhanced the `_edge` function to include an `is_internal` flag and provenance normalization.
- Modified the `EdgeDTO` model to include a new `source_hint` field for detailed provenance information and an `is_internal` flag for intra-note edges.
- Reduced the provenance options in `EdgeDTO` to valid literals, improving data integrity.
This commit is contained in:
parent
0d61a9e191
commit
cc258008dc
|
|
@ -1,13 +1,17 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_models.py
|
FILE: app/core/chunking/chunking_models.py
|
||||||
DESCRIPTION: Datenklassen für das Chunking-System.
|
DESCRIPTION: Datenklassen für das Chunking-System.
|
||||||
|
WP-26 v1.0: Erweiterung um section_type für typ-spezifische Sektionen.
|
||||||
"""
|
"""
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Dict, Optional, Any
|
from typing import List, Dict, Optional, Any
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RawBlock:
|
class RawBlock:
|
||||||
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing."""
|
"""
|
||||||
|
Repräsentiert einen logischen Block aus dem Markdown-Parsing.
|
||||||
|
WP-26 v1.0: Erweitert um section_type für typ-spezifische Sektionen.
|
||||||
|
"""
|
||||||
kind: str
|
kind: str
|
||||||
text: str
|
text: str
|
||||||
level: Optional[int]
|
level: Optional[int]
|
||||||
|
|
@ -15,10 +19,17 @@ class RawBlock:
|
||||||
section_title: Optional[str]
|
section_title: Optional[str]
|
||||||
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
|
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
|
||||||
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
|
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
|
||||||
|
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||||
|
section_type: Optional[str] = None # z.B. "insight", "decision", "experience"
|
||||||
|
# WP-26 v1.0: Block-ID für Intra-Note-Links (z.B. "^sit" aus "## Situation ^sit")
|
||||||
|
block_id: Optional[str] = None
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Chunk:
|
class Chunk:
|
||||||
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung."""
|
"""
|
||||||
|
Das finale Chunk-Objekt für Embedding und Graph-Speicherung.
|
||||||
|
WP-26 v1.0: Erweitert um section_type für effektiven Typ.
|
||||||
|
"""
|
||||||
id: str
|
id: str
|
||||||
note_id: str
|
note_id: str
|
||||||
index: int
|
index: int
|
||||||
|
|
@ -31,3 +42,8 @@ class Chunk:
|
||||||
neighbors_next: Optional[str]
|
neighbors_next: Optional[str]
|
||||||
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
||||||
suggested_edges: Optional[List[str]] = None
|
suggested_edges: Optional[List[str]] = None
|
||||||
|
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||||
|
# Wenn gesetzt, wird dieser als "effektiver Typ" verwendet statt note_type
|
||||||
|
section_type: Optional[str] = None
|
||||||
|
# WP-26 v1.0: Block-ID für Intra-Note-Links
|
||||||
|
block_id: Optional[str] = None
|
||||||
|
|
@ -5,16 +5,28 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
||||||
Stellt die Funktion parse_edges_robust zur Verfügung.
|
Stellt die Funktion parse_edges_robust zur Verfügung.
|
||||||
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
|
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
|
||||||
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
|
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
|
||||||
|
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
from typing import List, Tuple, Set, Dict, Any, Optional
|
from typing import List, Tuple, Set, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock
|
from .chunking_models import RawBlock
|
||||||
from .chunking_utils import extract_frontmatter_from_text
|
from .chunking_utils import extract_frontmatter_from_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_WS = re.compile(r'\s+')
|
_WS = re.compile(r'\s+')
|
||||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||||
|
|
||||||
|
# WP-26 v1.0: Pattern für [!section]-Callouts
|
||||||
|
# Matches: > [!section] type-name
|
||||||
|
_SECTION_CALLOUT_PATTERN = re.compile(r'^\s*>\s*\[!section\]\s*(\w+)', re.IGNORECASE)
|
||||||
|
|
||||||
|
# WP-26 v1.0: Pattern für Block-IDs in Überschriften
|
||||||
|
# Matches: ## Titel ^block-id
|
||||||
|
_BLOCK_ID_PATTERN = re.compile(r'\^([a-zA-Z0-9_-]+)\s*$')
|
||||||
|
|
||||||
def split_sentences(text: str) -> list[str]:
|
def split_sentences(text: str) -> list[str]:
|
||||||
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
|
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
|
||||||
text = _WS.sub(' ', text.strip())
|
text = _WS.sub(' ', text.strip())
|
||||||
|
|
@ -27,12 +39,18 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
|
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
|
||||||
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
|
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
|
||||||
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
|
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
|
||||||
|
WP-26 v1.0: Section-Type-Erkennung via [!section]-Callouts und automatische Section-Erkennung.
|
||||||
"""
|
"""
|
||||||
blocks = []
|
blocks = []
|
||||||
h1_title = "Dokument"
|
h1_title = "Dokument"
|
||||||
section_path = "/"
|
section_path = "/"
|
||||||
current_section_title = None
|
current_section_title = None
|
||||||
|
|
||||||
|
# WP-26 v1.0: State-Machine für Section-Type-Tracking
|
||||||
|
current_section_type: Optional[str] = None # Aktueller Section-Type (oder None für note_type Fallback)
|
||||||
|
section_introduced_at_level: Optional[int] = None # Ebene, auf der erste Section eingeführt wurde
|
||||||
|
current_block_id: Optional[str] = None # Block-ID der aktuellen Sektion
|
||||||
|
|
||||||
# Frontmatter entfernen
|
# Frontmatter entfernen
|
||||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||||
|
|
||||||
|
|
@ -70,8 +88,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
|
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
|
||||||
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
|
# WP-26 v1.0: Erweitert um [!section]-Callouts
|
||||||
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
|
# Regex für Callouts: >\s*[!edge], >\s*[!abstract], >\s*[!section] (auch mit mehreren >)
|
||||||
|
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract|section)\]', re.IGNORECASE)
|
||||||
|
|
||||||
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
|
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
|
||||||
processed_indices = set()
|
processed_indices = set()
|
||||||
|
|
@ -86,13 +105,39 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
# Prüfe, ob diese Zeile ein Callout startet
|
# Prüfe, ob diese Zeile ein Callout startet
|
||||||
callout_match = callout_pattern.match(line)
|
callout_match = callout_pattern.match(line)
|
||||||
if callout_match:
|
if callout_match:
|
||||||
|
callout_type = callout_match.group(1).lower() # "edge", "abstract", oder "section"
|
||||||
|
|
||||||
|
# WP-26 v1.0: [!section] Callout-Behandlung
|
||||||
|
if callout_type == "section":
|
||||||
|
# Extrahiere Section-Type aus dem Callout
|
||||||
|
section_match = _SECTION_CALLOUT_PATTERN.match(line)
|
||||||
|
if section_match:
|
||||||
|
new_section_type = section_match.group(1).lower()
|
||||||
|
current_section_type = new_section_type
|
||||||
|
|
||||||
|
# Tracke die Ebene, auf der die erste Section eingeführt wurde
|
||||||
|
# Wir nehmen die Ebene der letzten Überschrift (section_path basiert)
|
||||||
|
if section_introduced_at_level is None:
|
||||||
|
# Bestimme Ebene aus section_path
|
||||||
|
# "/" = H1, "/Title" = H2, "/Title/Sub" = H3, etc.
|
||||||
|
path_depth = section_path.count('/') if section_path else 1
|
||||||
|
section_introduced_at_level = max(1, path_depth + 1)
|
||||||
|
|
||||||
|
logger.debug(f"WP-26: Section-Type erkannt: '{new_section_type}' bei '{current_section_title}' (Level: {section_introduced_at_level})")
|
||||||
|
|
||||||
|
# [!section] Callout wird nicht als Block hinzugefügt (ist nur Metadaten)
|
||||||
|
processed_indices.add(i)
|
||||||
|
continue
|
||||||
|
|
||||||
# Vorherigen Text-Block abschließen
|
# Vorherigen Text-Block abschließen
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content:
|
if content:
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"paragraph", content, None, section_path, current_section_title,
|
"paragraph", content, None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
|
|
@ -120,7 +165,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"callout", callout_content, None, section_path, current_section_title,
|
"callout", callout_content, None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
|
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
|
||||||
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung
|
is_meta_content=True, # WP-24c v4.2.6: Markierung für spätere Entfernung
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -133,13 +180,32 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
if content:
|
if content:
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"paragraph", content, None, section_path, current_section_title,
|
"paragraph", content, None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
level = len(heading_match.group(1))
|
level = len(heading_match.group(1))
|
||||||
title = heading_match.group(2).strip()
|
title = heading_match.group(2).strip()
|
||||||
|
|
||||||
|
# WP-26 v1.0: Block-ID aus Überschrift extrahieren (z.B. "## Titel ^block-id")
|
||||||
|
block_id_match = _BLOCK_ID_PATTERN.search(title)
|
||||||
|
if block_id_match:
|
||||||
|
current_block_id = block_id_match.group(1)
|
||||||
|
# Entferne Block-ID aus dem Titel für saubere Anzeige
|
||||||
|
title = _BLOCK_ID_PATTERN.sub('', title).strip()
|
||||||
|
else:
|
||||||
|
current_block_id = None
|
||||||
|
|
||||||
|
# WP-26 v1.0: Section-Type State-Machine
|
||||||
|
# Wenn eine Section eingeführt wurde und wir auf gleicher oder höherer Ebene sind:
|
||||||
|
# -> Automatisch neue Section erkennen (FA-02b)
|
||||||
|
if section_introduced_at_level is not None and level <= section_introduced_at_level:
|
||||||
|
# Neue Überschrift auf gleicher oder höherer Ebene -> Reset auf None (note_type Fallback)
|
||||||
|
current_section_type = None
|
||||||
|
logger.debug(f"WP-26: Neue Section erkannt bei H{level} '{title}' -> Reset auf note_type")
|
||||||
|
|
||||||
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
|
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
|
||||||
is_llm_validation_zone = (
|
is_llm_validation_zone = (
|
||||||
level == llm_validation_level and
|
level == llm_validation_level and
|
||||||
|
|
@ -170,7 +236,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
|
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"heading", stripped, level, section_path, current_section_title,
|
"heading", stripped, level, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -181,13 +249,17 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
if content:
|
if content:
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"paragraph", content, None, section_path, current_section_title,
|
"paragraph", content, None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
buffer = []
|
buffer = []
|
||||||
if stripped == "---":
|
if stripped == "---":
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"separator", "---", None, section_path, current_section_title,
|
"separator", "---", None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
buffer.append(line)
|
buffer.append(line)
|
||||||
|
|
@ -197,7 +269,9 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
if content:
|
if content:
|
||||||
blocks.append(RawBlock(
|
blocks.append(RawBlock(
|
||||||
"paragraph", content, None, section_path, current_section_title,
|
"paragraph", content, None, section_path, current_section_title,
|
||||||
exclude_from_chunking=in_exclusion_zone
|
exclude_from_chunking=in_exclusion_zone,
|
||||||
|
section_type=current_section_type,
|
||||||
|
block_id=current_block_id
|
||||||
))
|
))
|
||||||
|
|
||||||
return blocks, h1_title
|
return blocks, h1_title
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
||||||
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
||||||
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
||||||
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
||||||
|
WP-26 v1.0: section_type und block_id werden an Chunks weitergegeben.
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock, Chunk
|
from .chunking_models import RawBlock, Chunk
|
||||||
|
|
@ -36,41 +37,70 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
|
|
||||||
chunks: List[Chunk] = []
|
chunks: List[Chunk] = []
|
||||||
|
|
||||||
def _emit(txt, title, path):
|
def _emit(txt, title, path, section_type=None, block_id=None):
|
||||||
"""Schreibt den finalen Chunk ohne Text-Modifikationen."""
|
"""
|
||||||
|
Schreibt den finalen Chunk ohne Text-Modifikationen.
|
||||||
|
WP-26 v1.0: Erweitert um section_type und block_id.
|
||||||
|
"""
|
||||||
idx = len(chunks)
|
idx = len(chunks)
|
||||||
win = _create_win(context_prefix, title, txt)
|
win = _create_win(context_prefix, title, txt)
|
||||||
chunks.append(Chunk(
|
chunks.append(Chunk(
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||||
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None
|
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None,
|
||||||
|
section_type=section_type, block_id=block_id
|
||||||
))
|
))
|
||||||
|
|
||||||
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
||||||
|
# WP-26 v1.0: Erweitert um section_type und block_id Tracking
|
||||||
sections: List[Dict[str, Any]] = []
|
sections: List[Dict[str, Any]] = []
|
||||||
curr_blocks = []
|
curr_blocks = []
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
if b.kind == "heading" and b.level <= split_level:
|
if b.kind == "heading" and b.level <= split_level:
|
||||||
if curr_blocks:
|
if curr_blocks:
|
||||||
|
# WP-26 v1.0: Finde den effektiven section_type und block_id für diese Sektion
|
||||||
|
# Priorisiere den ersten Block mit section_type, sonst den Heading-Block
|
||||||
|
effective_section_type = None
|
||||||
|
effective_block_id = None
|
||||||
|
for cb in curr_blocks:
|
||||||
|
if cb.section_type and effective_section_type is None:
|
||||||
|
effective_section_type = cb.section_type
|
||||||
|
if cb.block_id and effective_block_id is None:
|
||||||
|
effective_block_id = cb.block_id
|
||||||
|
|
||||||
sections.append({
|
sections.append({
|
||||||
"text": "\n\n".join([x.text for x in curr_blocks]),
|
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||||
"meta": curr_blocks[0],
|
"meta": curr_blocks[0],
|
||||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
|
||||||
|
"section_type": effective_section_type,
|
||||||
|
"block_id": effective_block_id
|
||||||
})
|
})
|
||||||
curr_blocks = [b]
|
curr_blocks = [b]
|
||||||
else:
|
else:
|
||||||
curr_blocks.append(b)
|
curr_blocks.append(b)
|
||||||
if curr_blocks:
|
if curr_blocks:
|
||||||
|
# WP-26 v1.0: Gleiche Logik für den letzten Block
|
||||||
|
effective_section_type = None
|
||||||
|
effective_block_id = None
|
||||||
|
for cb in curr_blocks:
|
||||||
|
if cb.section_type and effective_section_type is None:
|
||||||
|
effective_section_type = cb.section_type
|
||||||
|
if cb.block_id and effective_block_id is None:
|
||||||
|
effective_block_id = cb.block_id
|
||||||
|
|
||||||
sections.append({
|
sections.append({
|
||||||
"text": "\n\n".join([x.text for x in curr_blocks]),
|
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||||
"meta": curr_blocks[0],
|
"meta": curr_blocks[0],
|
||||||
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading",
|
||||||
|
"section_type": effective_section_type,
|
||||||
|
"block_id": effective_block_id
|
||||||
})
|
})
|
||||||
|
|
||||||
# --- SCHRITT 2: Verarbeitung der Queue ---
|
# --- SCHRITT 2: Verarbeitung der Queue ---
|
||||||
queue = list(sections)
|
queue = list(sections)
|
||||||
current_chunk_text = ""
|
current_chunk_text = ""
|
||||||
current_meta = {"title": None, "path": "/"}
|
# WP-26 v1.0: Erweitert um section_type und block_id
|
||||||
|
current_meta = {"title": None, "path": "/", "section_type": None, "block_id": None}
|
||||||
|
|
||||||
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
|
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
|
||||||
is_hard_split_mode = (not smart_edge) or (strict)
|
is_hard_split_mode = (not smart_edge) or (strict)
|
||||||
|
|
@ -83,6 +113,9 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
if not current_chunk_text:
|
if not current_chunk_text:
|
||||||
current_meta["title"] = item["meta"].section_title
|
current_meta["title"] = item["meta"].section_title
|
||||||
current_meta["path"] = item["meta"].section_path
|
current_meta["path"] = item["meta"].section_path
|
||||||
|
# WP-26 v1.0: section_type und block_id aus Item übernehmen
|
||||||
|
current_meta["section_type"] = item.get("section_type")
|
||||||
|
current_meta["block_id"] = item.get("block_id")
|
||||||
|
|
||||||
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
|
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
|
||||||
if is_hard_split_mode:
|
if is_hard_split_mode:
|
||||||
|
|
@ -90,18 +123,23 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften
|
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften
|
||||||
if current_chunk_text:
|
if current_chunk_text:
|
||||||
# Flashe vorherigen Chunk
|
# Flashe vorherigen Chunk
|
||||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
current_chunk_text = ""
|
current_chunk_text = ""
|
||||||
|
|
||||||
# Neue Sektion: Initialisiere Meta
|
# Neue Sektion: Initialisiere Meta
|
||||||
current_meta["title"] = item["meta"].section_title
|
current_meta["title"] = item["meta"].section_title
|
||||||
current_meta["path"] = item["meta"].section_path
|
current_meta["path"] = item["meta"].section_path
|
||||||
|
# WP-26 v1.0: section_type und block_id aus Item übernehmen
|
||||||
|
current_meta["section_type"] = item.get("section_type")
|
||||||
|
current_meta["block_id"] = item.get("block_id")
|
||||||
|
|
||||||
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
|
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
|
||||||
# (nur Überschrift, kein Inhalt)
|
# (nur Überschrift, kein Inhalt)
|
||||||
if item.get("is_empty", False):
|
if item.get("is_empty", False):
|
||||||
# Leere Sektion: Nur Überschrift als Chunk
|
# Leere Sektion: Nur Überschrift als Chunk
|
||||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
_emit(item_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
else:
|
else:
|
||||||
# Normale Sektion: Prüfe auf Token-Limit
|
# Normale Sektion: Prüfe auf Token-Limit
|
||||||
if estimate_tokens(item_text) > max_tokens:
|
if estimate_tokens(item_text) > max_tokens:
|
||||||
|
|
@ -113,16 +151,19 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
while sents:
|
while sents:
|
||||||
s = sents.pop(0); slen = estimate_tokens(s)
|
s = sents.pop(0); slen = estimate_tokens(s)
|
||||||
if take_len + slen > target and take_sents:
|
if take_len + slen > target and take_sents:
|
||||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
take_sents = [s]; take_len = slen
|
take_sents = [s]; take_len = slen
|
||||||
else:
|
else:
|
||||||
take_sents.append(s); take_len += slen
|
take_sents.append(s); take_len += slen
|
||||||
|
|
||||||
if take_sents:
|
if take_sents:
|
||||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
else:
|
else:
|
||||||
# Sektion passt: Direkt als Chunk
|
# Sektion passt: Direkt als Chunk
|
||||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
_emit(item_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
|
|
||||||
current_chunk_text = ""
|
current_chunk_text = ""
|
||||||
continue
|
continue
|
||||||
|
|
@ -137,7 +178,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
else:
|
else:
|
||||||
if current_chunk_text:
|
if current_chunk_text:
|
||||||
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
|
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
|
||||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
current_chunk_text = ""
|
current_chunk_text = ""
|
||||||
queue.insert(0, item)
|
queue.insert(0, item)
|
||||||
else:
|
else:
|
||||||
|
|
@ -152,7 +194,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
sents.insert(0, s); break
|
sents.insert(0, s); break
|
||||||
take_sents.append(s); take_len += slen
|
take_sents.append(s); take_len += slen
|
||||||
|
|
||||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
|
|
||||||
if sents:
|
if sents:
|
||||||
remainder = " ".join(sents)
|
remainder = " ".join(sents)
|
||||||
|
|
@ -160,15 +203,21 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
if header_prefix and not remainder.startswith(header_prefix):
|
if header_prefix and not remainder.startswith(header_prefix):
|
||||||
remainder = header_prefix + "\n\n" + remainder
|
remainder = header_prefix + "\n\n" + remainder
|
||||||
# Carry-Over: Rest wird vorne in die Queue geschoben
|
# Carry-Over: Rest wird vorne in die Queue geschoben
|
||||||
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})
|
# WP-26 v1.0: section_type und block_id weitergeben
|
||||||
|
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True,
|
||||||
|
"section_type": item.get("section_type"), "block_id": item.get("block_id")})
|
||||||
|
|
||||||
if current_chunk_text:
|
if current_chunk_text:
|
||||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"],
|
||||||
|
current_meta["section_type"], current_meta["block_id"])
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||||
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus."""
|
"""
|
||||||
|
Standard-Sliding-Window für flache Texte ohne Sektionsfokus.
|
||||||
|
WP-26 v1.0: Erweitert um section_type und block_id Weitergabe.
|
||||||
|
"""
|
||||||
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||||
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
||||||
|
|
||||||
|
|
@ -178,13 +227,31 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
if curr_tokens + b_tokens > max_tokens and buf:
|
if curr_tokens + b_tokens > max_tokens and buf:
|
||||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
win = _create_win(context_prefix, buf[0].section_title, txt)
|
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=curr_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
# WP-26 v1.0: Finde effektiven section_type und block_id
|
||||||
|
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
||||||
|
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
||||||
|
chunks.append(Chunk(
|
||||||
|
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||||
|
text=txt, window=win, token_count=curr_tokens,
|
||||||
|
section_title=buf[0].section_title, section_path=buf[0].section_path,
|
||||||
|
neighbors_prev=None, neighbors_next=None,
|
||||||
|
section_type=effective_section_type, block_id=effective_block_id
|
||||||
|
))
|
||||||
buf = []
|
buf = []
|
||||||
buf.append(b)
|
buf.append(b)
|
||||||
|
|
||||||
if buf:
|
if buf:
|
||||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
win = _create_win(context_prefix, buf[0].section_title, txt)
|
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
# WP-26 v1.0: Finde effektiven section_type und block_id
|
||||||
|
effective_section_type = next((b.section_type for b in buf if b.section_type), None)
|
||||||
|
effective_block_id = next((b.block_id for b in buf if b.block_id), None)
|
||||||
|
chunks.append(Chunk(
|
||||||
|
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||||
|
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||||
|
section_title=buf[0].section_title, section_path=buf[0].section_path,
|
||||||
|
neighbors_prev=None, neighbors_next=None,
|
||||||
|
section_type=effective_section_type, block_id=effective_block_id
|
||||||
|
))
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
@ -12,28 +12,85 @@ STATUS: Active
|
||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
import hashlib
|
import hashlib
|
||||||
from typing import Iterable, List, Optional, Set, Any, Tuple
|
from typing import Dict, Iterable, List, Optional, Set, Any, Tuple
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
except ImportError:
|
except ImportError:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
|
# WP-26 v1.0: Provenance-Literale auf valide EdgeDTO-Werte reduziert
|
||||||
|
# Legacy-Prioritäten für interne Verarbeitung (werden zu source_hint gemappt)
|
||||||
PROVENANCE_PRIORITY = {
|
PROVENANCE_PRIORITY = {
|
||||||
|
# Explizite Kanten (provenance: "explicit")
|
||||||
"explicit:wikilink": 1.00,
|
"explicit:wikilink": 1.00,
|
||||||
"inline:rel": 0.95,
|
"inline:rel": 0.95,
|
||||||
"callout:edge": 0.90,
|
"callout:edge": 0.90,
|
||||||
"explicit:callout": 0.90, # WP-24c v4.2.7: Callout-Kanten aus candidate_pool
|
"explicit:callout": 0.90,
|
||||||
"semantic_ai": 0.90, # Validierte KI-Kanten
|
|
||||||
"structure:belongs_to": 1.00,
|
|
||||||
"structure:order": 0.95, # next/prev
|
|
||||||
"explicit:note_scope": 1.00,
|
"explicit:note_scope": 1.00,
|
||||||
"explicit:note_zone": 1.00, # WP-24c v4.2.0: Note-Scope Zonen (höchste Priorität)
|
"explicit:note_zone": 1.00,
|
||||||
|
# Regel-basierte Kanten (provenance: "rule")
|
||||||
"derived:backlink": 0.90,
|
"derived:backlink": 0.90,
|
||||||
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
"edge_defaults": 0.70,
|
||||||
|
"schema_default": 0.85,
|
||||||
|
# Struktur-Kanten (provenance: "structure")
|
||||||
|
"structure:belongs_to": 1.00,
|
||||||
|
"structure:order": 0.95,
|
||||||
|
# KI-generierte Kanten (provenance: "smart")
|
||||||
|
"semantic_ai": 0.90,
|
||||||
|
"global_pool": 0.80,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# WP-26 v1.0: Mapping von internen Provenance-Werten zu EdgeDTO-konformen Literalen
|
||||||
|
PROVENANCE_TO_DTO: Dict[str, Tuple[str, Optional[str]]] = {
    # explicit
    "explicit:wikilink": ("explicit", "wikilink"),
    "explicit:callout": ("explicit", "callout"),
    "explicit:note_scope": ("explicit", "note_scope"),
    "explicit:note_zone": ("explicit", "note_zone"),
    "inline:rel": ("explicit", "inline_rel"),
    "callout:edge": ("explicit", "callout"),
    "explicit": ("explicit", None),
    # rule
    "derived:backlink": ("rule", "backlink"),
    "edge_defaults": ("rule", "edge_defaults"),
    "schema_default": ("rule", "schema_default"),
    "inferred:schema": ("rule", "schema_default"),
    "rule": ("rule", None),
    # structure
    "structure:belongs_to": ("structure", "belongs_to"),
    "structure:order": ("structure", "order"),
    "structure": ("structure", None),
    # smart
    "semantic_ai": ("smart", None),
    "global_pool": ("smart", "global_pool"),
    "smart": ("smart", None),
}


def normalize_provenance(internal_provenance: str) -> Tuple[str, Optional[str]]:
    """
    WP-26 v1.0: Normalize an internal provenance value to EdgeDTO literals.

    Resolution order:
      1. Exact lookup in PROVENANCE_TO_DTO.
      2. Prefix fallback for "explicit", "structure", "rule"/"derived" and
         "smart"; the text after the last ":" (if any) becomes the hint.
      3. Default ("explicit", None) for everything else, including
         non-string or empty input.

    Args:
        internal_provenance: Internal provenance string
            (e.g. "explicit:callout").

    Returns:
        Tuple (provenance, source_hint). provenance is always one of
        "explicit", "rule", "smart", "structure"; source_hint is None when
        no detail is available.

    NOTE(review): hints produced by the prefix fallback are taken verbatim
    from the input and are not checked against the source_hint literals that
    EdgeDTO accepts - confirm whether callers rely on unknown hints.
    """
    default = ("explicit", None)

    # Payloads may carry None or other non-string values; treat them like an
    # unknown provenance instead of failing on the string methods below.
    if not isinstance(internal_provenance, str) or not internal_provenance:
        return default

    mapped = PROVENANCE_TO_DTO.get(internal_provenance)
    if mapped is not None:
        return mapped

    # Detail is whatever follows the last ":"; an empty tail ("explicit:")
    # carries no information and is reported as None.
    _, separator, tail = internal_provenance.rpartition(":")
    hint = tail if separator and tail else None

    # Fallback: prefix matching, one entry per valid EdgeDTO provenance.
    prefix_rules = (
        (("explicit",), "explicit"),
        (("structure",), "structure"),
        (("rule", "derived"), "rule"),
        (("smart",), "smart"),
    )
    for prefixes, dto_value in prefix_rules:
        if internal_provenance.startswith(prefixes):
            return (dto_value, hint)

    # Default: explicit without source_hint
    return default
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -123,7 +180,15 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[
|
||||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||||
"""
|
"""
|
||||||
Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
|
Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
|
||||||
Wird von graph_derive_edges.py benötigt.
|
WP-26 v1.0: Erweitert um is_internal Flag und Provenance-Normalisierung.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
kind: Kantentyp (z.B. "derives", "caused_by")
|
||||||
|
scope: Granularität ("chunk" oder "note")
|
||||||
|
source_id: ID der Quelle (Chunk oder Note)
|
||||||
|
target_id: ID des Ziels (Chunk oder Note)
|
||||||
|
note_id: ID der Note (für Kontext)
|
||||||
|
extra: Zusätzliche Payload-Felder
|
||||||
"""
|
"""
|
||||||
pl = {
|
pl = {
|
||||||
"kind": kind,
|
"kind": kind,
|
||||||
|
|
@ -134,8 +199,24 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
|
||||||
"note_id": note_id,
|
"note_id": note_id,
|
||||||
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
|
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# WP-26 v1.0: is_internal Flag berechnen
|
||||||
|
# Intra-Note-Edge: Source und Target gehören zur gleichen Note
|
||||||
|
source_note = source_id.split("#")[0] if "#" in source_id else source_id
|
||||||
|
target_note = target_id.split("#")[0] if "#" in target_id else target_id
|
||||||
|
pl["is_internal"] = (source_note == target_note) or (source_note == note_id and target_note == note_id)
|
||||||
|
|
||||||
if extra:
|
if extra:
|
||||||
pl.update(extra)
|
pl.update(extra)
|
||||||
|
|
||||||
|
# WP-26 v1.0: Provenance normalisieren, falls vorhanden
|
||||||
|
if "provenance" in extra:
|
||||||
|
internal_prov = extra["provenance"]
|
||||||
|
dto_prov, source_hint = normalize_provenance(internal_prov)
|
||||||
|
pl["provenance"] = dto_prov
|
||||||
|
if source_hint:
|
||||||
|
pl["source_hint"] = source_hint
|
||||||
|
|
||||||
return pl
|
return pl
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@ FILE: app/core/ingestion/ingestion_chunk_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||||
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
||||||
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
|
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
|
||||||
VERSION: 2.4.4 (WP-24c v4.3.0)
|
WP-26 v1.0: Erweiterung um effective_type (section_type || note_type) und note_type-Feld.
|
||||||
|
VERSION: 2.5.0 (WP-26 v1.0)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -92,13 +93,34 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
|
||||||
# WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution
|
# WP-24c v4.3.0: candidate_pool muss erhalten bleiben für Chunk-Attribution
|
||||||
candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])
|
candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])
|
||||||
|
|
||||||
|
# WP-26 v1.0: Section-Type für typ-spezifische Sektionen
|
||||||
|
section_type = getattr(ch, "section_type", None) if not is_dict else ch.get("section_type")
|
||||||
|
# WP-26 v1.0: Block-ID für Intra-Note-Links
|
||||||
|
block_id = getattr(ch, "block_id", None) if not is_dict else ch.get("block_id")
|
||||||
|
|
||||||
|
# WP-26 v1.0: Effektiver Typ = section_type || note_type (FA-03)
|
||||||
|
effective_type = section_type if section_type else note_type
|
||||||
|
|
||||||
|
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ (FA-09b)
|
||||||
|
# Wenn section_type vorhanden, nutze dessen retriever_weight
|
||||||
|
effective_rw = rw
|
||||||
|
if section_type:
|
||||||
|
effective_rw = _resolve_val(section_type, reg, "retriever_weight", rw)
|
||||||
|
try:
|
||||||
|
effective_rw = float(effective_rw)
|
||||||
|
except:
|
||||||
|
effective_rw = rw
|
||||||
|
|
||||||
pl: Dict[str, Any] = {
|
pl: Dict[str, Any] = {
|
||||||
"note_id": nid or fm.get("id"),
|
"note_id": nid or fm.get("id"),
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
"title": title,
|
"title": title,
|
||||||
"index": int(index),
|
"index": int(index),
|
||||||
"ord": int(index) + 1,
|
"ord": int(index) + 1,
|
||||||
"type": note_type,
|
# WP-26 v1.0: type enthält den effektiven Typ (section_type || note_type)
|
||||||
|
"type": effective_type,
|
||||||
|
# WP-26 v1.0: note_type ist immer der ursprüngliche Note-Typ (für Filterung)
|
||||||
|
"note_type": note_type,
|
||||||
"tags": tags,
|
"tags": tags,
|
||||||
"text": text,
|
"text": text,
|
||||||
"window": window,
|
"window": window,
|
||||||
|
|
@ -107,9 +129,13 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
|
||||||
"section": section,
|
"section": section,
|
||||||
"path": note_path,
|
"path": note_path,
|
||||||
"source_path": kwargs.get("file_path") or note_path,
|
"source_path": kwargs.get("file_path") or note_path,
|
||||||
"retriever_weight": rw,
|
# WP-26 v1.0: retriever_weight basiert auf effektivem Typ
|
||||||
|
"retriever_weight": effective_rw,
|
||||||
"chunk_profile": cp,
|
"chunk_profile": cp,
|
||||||
"candidate_pool": candidate_pool # WP-24c v4.3.0: Kritisch für Chunk-Attribution
|
"candidate_pool": candidate_pool, # WP-24c v4.3.0: Kritisch für Chunk-Attribution
|
||||||
|
# WP-26 v1.0: Optionale Felder für Section-Type-Tracking
|
||||||
|
"section_type": section_type, # Expliziter Section-Type (oder None)
|
||||||
|
"block_id": block_id, # Block-ID für Intra-Note-Links (oder None)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern)
|
# Audit: Cleanup Pop (Vermeidung von redundanten Alias-Feldern)
|
||||||
|
|
|
||||||
|
|
@ -46,16 +46,18 @@ class EdgeDTO(BaseModel):
|
||||||
target: str
|
target: str
|
||||||
weight: float
|
weight: float
|
||||||
direction: Literal["out", "in", "undirected"] = "out"
|
direction: Literal["out", "in", "undirected"] = "out"
|
||||||
# WP-24c v4.5.3: Erweiterte Provenance-Werte für Chunk-Aware Edges
|
# WP-26 v1.0: Provenance auf valide Literale reduziert (EdgeDTO-Constraint)
|
||||||
# Unterstützt alle tatsächlich verwendeten Provenance-Typen im System
|
# Detail-Informationen werden über source_hint transportiert
|
||||||
provenance: Optional[Literal[
|
provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
|
||||||
"explicit", "rule", "smart", "structure",
|
# WP-26 v1.0: Neues Feld für Detail-Informationen zur Herkunft
|
||||||
"explicit:callout", "explicit:wikilink", "explicit:note_zone", "explicit:note_scope",
|
source_hint: Optional[Literal[
|
||||||
"inline:rel", "callout:edge", "semantic_ai", "structure:belongs_to", "structure:order",
|
"callout", "wikilink", "inline_rel", "schema_default", "note_scope",
|
||||||
"derived:backlink", "edge_defaults", "global_pool"
|
"note_zone", "belongs_to", "order", "backlink", "edge_defaults", "global_pool"
|
||||||
]] = "explicit"
|
]] = None
|
||||||
confidence: float = 1.0
|
confidence: float = 1.0
|
||||||
target_section: Optional[str] = None
|
target_section: Optional[str] = None
|
||||||
|
# WP-26 v1.0: Flag für Intra-Note-Edges
|
||||||
|
is_internal: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
# --- Request Models ---
|
# --- Request Models ---
|
||||||
|
|
|
||||||
284
docs/05_Development/05_WP26_Manual_Testing.md
Normal file
284
docs/05_Development/05_WP26_Manual_Testing.md
Normal file
|
|
@ -0,0 +1,284 @@
|
||||||
|
# WP-26 Manuelle Testszenarien
|
||||||
|
|
||||||
|
**Version:** 1.0
|
||||||
|
**Datum:** 25. Januar 2026
|
||||||
|
**Status:** Phase 1 Implementierung abgeschlossen
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Überblick
|
||||||
|
|
||||||
|
Dieses Dokument beschreibt die manuellen Testszenarien für WP-26 Phase 1: Section-Types und Intra-Note-Edges.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Voraussetzungen
|
||||||
|
|
||||||
|
1. **Python-Umgebung** mit allen Dependencies aus `requirements.txt`
|
||||||
|
2. **Qdrant-Instanz** erreichbar (lokal oder Docker)
|
||||||
|
3. **Vault mit Test-Note** (siehe Abschnitt 3)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Test-Note erstellen
|
||||||
|
|
||||||
|
Erstelle eine neue Markdown-Datei im Vault mit folgendem Inhalt:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
id: wp26-test-experience
|
||||||
|
title: WP-26 Test Experience
|
||||||
|
type: experience
|
||||||
|
tags: [test, wp26]
|
||||||
|
---
|
||||||
|
|
||||||
|
# WP-26 Test Experience
|
||||||
|
|
||||||
|
## Situation ^sit
|
||||||
|
> [!section] experience
|
||||||
|
|
||||||
|
Am 25. Januar 2026 testete ich das neue Section-Type Feature.
|
||||||
|
Dies ist der Experience-Teil der Note.
|
||||||
|
|
||||||
|
## Meine Reaktion ^react
|
||||||
|
> [!section] experience
|
||||||
|
|
||||||
|
> [!edge] followed_by
|
||||||
|
> [[#^sit]]
|
||||||
|
|
||||||
|
Ich war zunächst skeptisch, aber die Implementierung sah solide aus.
|
||||||
|
|
||||||
|
## Reflexion ^ref
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Diese Erfahrung zeigt mir, dass typ-spezifische Sektionen
|
||||||
|
die semantische Präzision des Retrievals verbessern können.
|
||||||
|
|
||||||
|
> [!abstract] Semantic Edges
|
||||||
|
>> [!edge] derives
|
||||||
|
>> [[#^sit]]
|
||||||
|
>> [[#^react]]
|
||||||
|
|
||||||
|
## Nächste Schritte ^next
|
||||||
|
> [!section] decision
|
||||||
|
|
||||||
|
Ich werde:
|
||||||
|
1. Die Tests ausführen
|
||||||
|
2. Die Ergebnisse dokumentieren
|
||||||
|
|
||||||
|
> [!edge] caused_by
|
||||||
|
> [[#^ref]]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Testszenarien
|
||||||
|
|
||||||
|
### 4.1 TS-01: Section-Type-Erkennung
|
||||||
|
|
||||||
|
**Ziel:** Prüfen, ob `[!section]`-Callouts korrekt erkannt werden.
|
||||||
|
|
||||||
|
**Schritte:**
|
||||||
|
|
||||||
|
1. Importiere die Test-Note via `scripts/import_markdown.py`
|
||||||
|
2. Prüfe die Chunks in Qdrant via API oder Debug-Skript
|
||||||
|
|
||||||
|
**Prüfkriterien:**
|
||||||
|
|
||||||
|
| Chunk | Erwarteter `type` | Erwarteter `note_type` | Erwarteter `section` |
|
||||||
|
|-------|-------------------|------------------------|----------------------|
|
||||||
|
| #c00 | experience | experience | Situation |
|
||||||
|
| #c01 | experience | experience | Meine Reaktion |
|
||||||
|
| #c02 | insight | experience | Reflexion |
|
||||||
|
| #c03 | decision | experience | Nächste Schritte |
|
||||||
|
|
||||||
|
**Prüf-Script:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# scripts/check_wp26_chunks.py
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
|
||||||
|
client = QdrantClient("http://localhost:6333")
|
||||||
|
note_id = "wp26-test-experience"
|
||||||
|
|
||||||
|
# Hole alle Chunks der Note
|
||||||
|
result = client.scroll(
|
||||||
|
collection_name="mindnet_chunks",
|
||||||
|
scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
|
||||||
|
with_payload=True,
|
||||||
|
limit=100
|
||||||
|
)
|
||||||
|
|
||||||
|
for point in result[0]:
|
||||||
|
p = point.payload
|
||||||
|
print(f"Chunk: {p.get('chunk_id')}")
|
||||||
|
print(f" type: {p.get('type')}")
|
||||||
|
print(f" note_type: {p.get('note_type')}")
|
||||||
|
print(f" section: {p.get('section')}")
|
||||||
|
print(f" section_type: {p.get('section_type')}")
|
||||||
|
print(f" block_id: {p.get('block_id')}")
|
||||||
|
print()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.2 TS-02: Block-ID-Erkennung
|
||||||
|
|
||||||
|
**Ziel:** Prüfen, ob Block-IDs (`^id`) aus Überschriften korrekt extrahiert werden.
|
||||||
|
|
||||||
|
**Prüfkriterien:**
|
||||||
|
|
||||||
|
| Chunk | Erwartete `block_id` |
|
||||||
|
|-------|---------------------|
|
||||||
|
| #c00 | sit |
|
||||||
|
| #c01 | react |
|
||||||
|
| #c02 | ref |
|
||||||
|
| #c03 | next |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.3 TS-03: is_internal Flag für Edges
|
||||||
|
|
||||||
|
**Ziel:** Prüfen, ob Intra-Note-Edges das `is_internal: true` Flag erhalten.
|
||||||
|
|
||||||
|
**Schritte:**
|
||||||
|
|
||||||
|
1. Importiere die Test-Note
|
||||||
|
2. Prüfe die Edges in Qdrant
|
||||||
|
|
||||||
|
**Prüfkriterien:**
|
||||||
|
|
||||||
|
| Edge | `is_internal` |
|
||||||
|
|------|---------------|
|
||||||
|
| #c01 → #c00 (followed_by) | `true` |
|
||||||
|
| #c02 → #c00 (derives) | `true` |
|
||||||
|
| #c02 → #c01 (derives) | `true` |
|
||||||
|
| #c03 → #c02 (caused_by) | `true` |
|
||||||
|
| Alle structure edges (next/prev) | `true` |
|
||||||
|
|
||||||
|
**Prüf-Script:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# scripts/check_wp26_edges.py
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
|
||||||
|
client = QdrantClient("http://localhost:6333")
|
||||||
|
note_id = "wp26-test-experience"
|
||||||
|
|
||||||
|
# Hole alle Edges der Note
|
||||||
|
result = client.scroll(
|
||||||
|
collection_name="mindnet_edges",
|
||||||
|
scroll_filter={"must": [{"key": "note_id", "match": {"value": note_id}}]},
|
||||||
|
with_payload=True,
|
||||||
|
limit=100
|
||||||
|
)
|
||||||
|
|
||||||
|
for point in result[0]:
|
||||||
|
p = point.payload
|
||||||
|
kind = p.get('kind', 'unknown')
|
||||||
|
source = p.get('source_id', '?')
|
||||||
|
target = p.get('target_id', '?')
|
||||||
|
is_internal = p.get('is_internal', 'MISSING')
|
||||||
|
provenance = p.get('provenance', '?')
|
||||||
|
source_hint = p.get('source_hint', '-')
|
||||||
|
|
||||||
|
print(f"{source} --[{kind}]--> {target}")
|
||||||
|
print(f" is_internal: {is_internal}")
|
||||||
|
print(f" provenance: {provenance}")
|
||||||
|
print(f" source_hint: {source_hint}")
|
||||||
|
print()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.4 TS-04: Provenance-Normalisierung
|
||||||
|
|
||||||
|
**Ziel:** Prüfen, ob Provenance-Werte korrekt normalisiert werden.
|
||||||
|
|
||||||
|
**Prüfkriterien:**
|
||||||
|
|
||||||
|
| Altes Provenance | Neues `provenance` | `source_hint` |
|
||||||
|
|------------------|-------------------|---------------|
|
||||||
|
| explicit:callout | explicit | callout |
|
||||||
|
| explicit:wikilink | explicit | wikilink |
|
||||||
|
| structure:belongs_to | structure | belongs_to |
|
||||||
|
| structure:order | structure | order |
|
||||||
|
| edge_defaults | rule | edge_defaults |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4.5 TS-05: Automatische Section-Erkennung
|
||||||
|
|
||||||
|
**Ziel:** Prüfen, ob neue Überschriften ohne `[!section]` automatisch neue Chunks erstellen.
|
||||||
|
|
||||||
|
**Test-Note:**
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
id: wp26-test-auto-section
|
||||||
|
type: experience
|
||||||
|
---
|
||||||
|
|
||||||
|
# Test Auto Section
|
||||||
|
|
||||||
|
## Section A ^a
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Content A (insight).
|
||||||
|
|
||||||
|
## Section B ^b
|
||||||
|
|
||||||
|
Content B (sollte experience sein - Fallback).
|
||||||
|
|
||||||
|
## Section C ^c
|
||||||
|
> [!section] decision
|
||||||
|
|
||||||
|
Content C (decision).
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prüfkriterien:**
|
||||||
|
|
||||||
|
| Chunk | `type` | Grund |
|
||||||
|
|-------|--------|-------|
|
||||||
|
| Section A | insight | Explizites `[!section]` |
|
||||||
|
| Section B | experience | Fallback auf `note_type` |
|
||||||
|
| Section C | decision | Explizites `[!section]` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Unit-Tests ausführen
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Im Projekt-Root
|
||||||
|
cd c:/Dev/cursor/mindnet
|
||||||
|
|
||||||
|
# Aktiviere virtuelle Umgebung (falls vorhanden)
|
||||||
|
# .venv\Scripts\activate
|
||||||
|
|
||||||
|
# Führe WP-26 Tests aus
|
||||||
|
python -m pytest tests/test_wp26_section_types.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
**Erwartetes Ergebnis:** Alle Tests grün.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Bekannte Einschränkungen
|
||||||
|
|
||||||
|
1. **Block-ID-Stability:** Obsidian aktualisiert Block-IDs nicht automatisch bei Umbenennung von Überschriften.
|
||||||
|
2. **Heading-Links:** Links wie `[[#Section Name]]` werden unterstützt, aber Block-References (`[[#^id]]`) werden bevorzugt.
|
||||||
|
3. **Nested Callouts:** Verschachtelte Callouts (`>> [!edge]`) werden korrekt verarbeitet.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Nächste Schritte (Phase 2)
|
||||||
|
|
||||||
|
Nach erfolgreicher Validierung von Phase 1:
|
||||||
|
|
||||||
|
1. **Retriever-Anpassung:** Path-Bonus für Intra-Note-Edges
|
||||||
|
2. **Graph-Exploration:** Navigation entlang `typical edges` aus `graph_schema.md`
|
||||||
|
3. **Schema-Validierung:** Agentic Validation gegen effektive Chunk-Typen
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Ende der Testdokumentation**
|
||||||
1470
docs/06_Roadmap/06_LH_Section_Types_Intra_Note_Edges.md
Normal file
1470
docs/06_Roadmap/06_LH_Section_Types_Intra_Note_Edges.md
Normal file
File diff suppressed because it is too large
Load Diff
265
tests/test_wp26_section_types.py
Normal file
265
tests/test_wp26_section_types.py
Normal file
|
|
@ -0,0 +1,265 @@
|
||||||
|
"""
|
||||||
|
FILE: tests/test_wp26_section_types.py
|
||||||
|
DESCRIPTION: Unit-Tests für WP-26 Phase 1: Section-Types und Intra-Note-Edges
|
||||||
|
VERSION: 1.0.0
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
from app.core.chunking.chunking_parser import parse_blocks
|
||||||
|
from app.core.chunking.chunking_models import RawBlock, Chunk
|
||||||
|
from app.core.graph.graph_utils import normalize_provenance, _edge
|
||||||
|
|
||||||
|
|
||||||
|
class TestSectionTypeRecognition:
|
||||||
|
"""UT-01: Parser – Section-Type-Erkennung"""
|
||||||
|
|
||||||
|
def test_section_type_recognition(self):
|
||||||
|
"""Testet, ob [!section]-Callouts korrekt erkannt werden."""
|
||||||
|
md = """
|
||||||
|
## Reflexion ^ref
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Content here about insights.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde den Paragraph-Block nach dem Section-Callout
|
||||||
|
paragraph_blocks = [b for b in blocks if b.kind == "paragraph"]
|
||||||
|
assert len(paragraph_blocks) >= 1
|
||||||
|
|
||||||
|
# Der Paragraph-Block sollte section_type "insight" haben
|
||||||
|
assert paragraph_blocks[0].section_type == "insight"
|
||||||
|
|
||||||
|
def test_section_type_with_block_id(self):
|
||||||
|
"""Testet, ob Block-IDs in Überschriften korrekt extrahiert werden."""
|
||||||
|
md = """
|
||||||
|
## Situation ^sit
|
||||||
|
> [!section] experience
|
||||||
|
|
||||||
|
Die Geschichte beginnt hier.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde den Heading-Block
|
||||||
|
heading_blocks = [b for b in blocks if b.kind == "heading"]
|
||||||
|
assert len(heading_blocks) >= 1
|
||||||
|
|
||||||
|
# Block-ID sollte "sit" sein
|
||||||
|
assert heading_blocks[0].block_id == "sit"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSectionTypeScope:
|
||||||
|
"""UT-02: Parser – Scope-Beendigung"""
|
||||||
|
|
||||||
|
def test_section_type_scope_ends_at_same_level_heading(self):
|
||||||
|
"""Section-Type endet bei nächster H2."""
|
||||||
|
md = """
|
||||||
|
## Section A
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Content A with insight.
|
||||||
|
|
||||||
|
## Section B
|
||||||
|
|
||||||
|
Content B without section callout.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde Paragraph-Blöcke
|
||||||
|
paragraphs = [b for b in blocks if b.kind == "paragraph"]
|
||||||
|
|
||||||
|
# Erster Paragraph hat section_type "insight"
|
||||||
|
assert paragraphs[0].section_type == "insight"
|
||||||
|
|
||||||
|
# Zweiter Paragraph hat section_type None (Reset)
|
||||||
|
assert paragraphs[1].section_type is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestProvenanceNormalization:
    """Unit tests for provenance normalization (WP-26 v1.0)."""

    def test_normalize_explicit_callout(self):
        """explicit:callout maps to provenance 'explicit' with hint 'callout'."""
        dto_value, detail = normalize_provenance("explicit:callout")
        assert (dto_value, detail) == ("explicit", "callout")

    def test_normalize_explicit_wikilink(self):
        """explicit:wikilink maps to provenance 'explicit' with hint 'wikilink'."""
        dto_value, detail = normalize_provenance("explicit:wikilink")
        assert (dto_value, detail) == ("explicit", "wikilink")

    def test_normalize_structure_belongs_to(self):
        """structure:belongs_to maps to 'structure' with hint 'belongs_to'."""
        dto_value, detail = normalize_provenance("structure:belongs_to")
        assert (dto_value, detail) == ("structure", "belongs_to")

    def test_normalize_schema_default(self):
        """inferred:schema maps to 'rule' with hint 'schema_default'."""
        dto_value, detail = normalize_provenance("inferred:schema")
        assert (dto_value, detail) == ("rule", "schema_default")

    def test_normalize_unknown_fallback(self):
        """An unknown provenance falls back to 'explicit' without a hint."""
        dto_value, detail = normalize_provenance("unknown_provenance")
        assert dto_value == "explicit"
        assert detail is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsInternalFlag:
    """UT-13: is_internal flag for intra-note edges."""

    def test_is_internal_true_for_same_note(self):
        """Edges between chunks of the same note carry is_internal=True."""
        payload = _edge("derives", "chunk", "note1#c01", "note1#c02", "note1")
        assert payload["is_internal"] is True

    def test_is_internal_false_for_different_notes(self):
        """Edges between chunks of different notes carry is_internal=False."""
        payload = _edge("references", "chunk", "note1#c01", "note2#c01", "note1")
        assert payload["is_internal"] is False

    def test_is_internal_true_for_note_to_chunk(self):
        """An edge between a chunk and its own note carries is_internal=True."""
        payload = _edge("belongs_to", "chunk", "note1#c01", "note1", "note1")
        assert payload["is_internal"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestEdgeProvenanceInPayload:
    """Tests for provenance normalization inside edge payloads."""

    def test_edge_provenance_normalized(self):
        """The provenance passed via extra is normalized in the edge payload."""
        extra_fields = {"provenance": "explicit:callout"}
        payload = _edge(
            "derives",
            "chunk",
            "note1#c01",
            "note1#c02",
            "note1",
            extra_fields,
        )

        assert payload["provenance"] == "explicit"
        assert payload["source_hint"] == "callout"
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutomaticSectionRecognition:
|
||||||
|
"""UT-09: Automatische Section-Erkennung bei neuen Überschriften"""
|
||||||
|
|
||||||
|
def test_automatic_section_recognition_at_same_heading_level(self):
|
||||||
|
"""Neue Überschriften auf gleicher Ebene starten automatisch neue Sections"""
|
||||||
|
md = """
|
||||||
|
## Situation ^sit
|
||||||
|
> [!section] experience
|
||||||
|
|
||||||
|
Content A.
|
||||||
|
|
||||||
|
## Reflexion ^ref
|
||||||
|
|
||||||
|
Content B.
|
||||||
|
|
||||||
|
## Learnings ^learn
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Content C.
|
||||||
|
|
||||||
|
## Ausblick ^out
|
||||||
|
|
||||||
|
Content D.
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Sammle alle Paragraph-Blöcke in Reihenfolge
|
||||||
|
paragraphs = [b for b in blocks if b.kind == "paragraph"]
|
||||||
|
|
||||||
|
assert len(paragraphs) == 4
|
||||||
|
|
||||||
|
# Chunk 1: Expliziter section_type "experience"
|
||||||
|
assert paragraphs[0].section_type == "experience"
|
||||||
|
|
||||||
|
# Chunk 2: Neue Section ohne Callout → None (Fallback auf note_type)
|
||||||
|
assert paragraphs[1].section_type is None
|
||||||
|
|
||||||
|
# Chunk 3: Expliziter section_type "insight"
|
||||||
|
assert paragraphs[2].section_type == "insight"
|
||||||
|
|
||||||
|
# Chunk 4: Neue Section ohne Callout → None (Fallback auf note_type)
|
||||||
|
assert paragraphs[3].section_type is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSeparateSectionCallout:
|
||||||
|
"""UT-10: Separates Section-Callout an beliebiger Stelle"""
|
||||||
|
|
||||||
|
def test_section_callout_separate_from_edge_callout(self):
|
||||||
|
"""Section-Callout kann separat von Edge-Callouts stehen"""
|
||||||
|
md = """
|
||||||
|
## Reflexion ^ref
|
||||||
|
|
||||||
|
Einleitender Text hier...
|
||||||
|
|
||||||
|
> [!section] insight
|
||||||
|
|
||||||
|
Weiterer normaler Inhalt...
|
||||||
|
|
||||||
|
> [!edge] derives
|
||||||
|
> [[#^sit]]
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde Paragraph-Blöcke nach dem Section-Callout
|
||||||
|
paragraphs = [b for b in blocks if b.kind == "paragraph"]
|
||||||
|
|
||||||
|
# Es sollten mindestens 2 Paragraphen geben
|
||||||
|
assert len(paragraphs) >= 2
|
||||||
|
|
||||||
|
# Der erste Paragraph hat noch keinen section_type (vor dem Callout)
|
||||||
|
# Der zweite Paragraph hat section_type "insight"
|
||||||
|
# Hinweis: Die genaue Zuordnung hängt von der Parser-Implementierung ab
|
||||||
|
section_types = [p.section_type for p in paragraphs]
|
||||||
|
assert "insight" in section_types
|
||||||
|
|
||||||
|
|
||||||
|
class TestNestedEdgeCallouts:
|
||||||
|
"""UT-08: Verschachtelte Edge-Callouts in Container"""
|
||||||
|
|
||||||
|
def test_nested_callouts_recognized(self):
|
||||||
|
"""Verschachtelte Callouts werden als Callout-Blöcke erkannt"""
|
||||||
|
md = """
|
||||||
|
> [!abstract] Semantic Edges
|
||||||
|
>> [!edge] derived_from
|
||||||
|
>> [[Target1#Section]]
|
||||||
|
>
|
||||||
|
>> [!edge] solves
|
||||||
|
>> [[Target2]]
|
||||||
|
"""
|
||||||
|
blocks, _ = parse_blocks(md)
|
||||||
|
|
||||||
|
# Finde Callout-Blöcke
|
||||||
|
callouts = [b for b in blocks if b.kind == "callout"]
|
||||||
|
|
||||||
|
# Es sollte mindestens ein Callout-Block erkannt werden
|
||||||
|
assert len(callouts) >= 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
||||||
Loading…
Reference in New Issue
Block a user