Enhance chunking system with WP-24c v4.2.6 and v4.2.7 updates: Introduce is_meta_content flag for callouts in RawBlock, ensuring they are chunked but later removed for clean context. Update parse_blocks and propagate_section_edges to handle callout edges with explicit provenance for chunk attribution. Implement clean-context logic to remove callout syntax post-processing, maintaining chunk integrity. Adjust get_chunk_config to prioritize frontmatter overrides for chunking profiles. Update documentation to reflect these changes.
This commit is contained in:
parent
4d43cc526e
commit
55b64c331a
|
|
@ -14,6 +14,7 @@ class RawBlock:
|
|||
section_path: str
|
||||
section_title: Optional[str]
|
||||
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
|
||||
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
|
|
|
|||
|
|
@ -4,10 +4,11 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
|||
Hält alle Überschriftenebenen (H1-H6) im Stream.
|
||||
Stellt die Funktion parse_edges_robust zur Verfügung.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
from typing import List, Tuple, Set
|
||||
from typing import List, Tuple, Set, Dict, Any
|
||||
from .chunking_models import RawBlock
|
||||
from .chunking_utils import extract_frontmatter_from_text
|
||||
|
||||
|
|
@ -25,6 +26,7 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
"""
|
||||
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
|
||||
"""
|
||||
blocks = []
|
||||
h1_title = "Dokument"
|
||||
|
|
@ -67,9 +69,61 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
lines = text_without_fm.split('\n')
|
||||
buffer = []
|
||||
|
||||
for line in lines:
|
||||
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
|
||||
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
|
||||
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
|
||||
|
||||
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
|
||||
processed_indices = set()
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if i in processed_indices:
|
||||
continue
|
||||
|
||||
stripped = line.strip()
|
||||
|
||||
# WP-24c v4.2.5: Callout-Erkennung (VOR Heading-Erkennung)
|
||||
# Prüfe, ob diese Zeile ein Callout startet
|
||||
callout_match = callout_pattern.match(line)
|
||||
if callout_match:
|
||||
# Vorherigen Text-Block abschließen
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
buffer = []
|
||||
|
||||
# Sammle alle Zeilen des Callout-Blocks
|
||||
callout_lines = [line]
|
||||
leading_gt_count = len(line) - len(line.lstrip('>'))
|
||||
processed_indices.add(i)
|
||||
|
||||
# Sammle alle Zeilen, die zum Callout gehören (gleiche oder höhere Einrückung)
|
||||
j = i + 1
|
||||
while j < len(lines):
|
||||
next_line = lines[j]
|
||||
if not next_line.strip().startswith('>'):
|
||||
break
|
||||
next_leading_gt = len(next_line) - len(next_line.lstrip('>'))
|
||||
if next_leading_gt < leading_gt_count:
|
||||
break
|
||||
callout_lines.append(next_line)
|
||||
processed_indices.add(j)
|
||||
j += 1
|
||||
|
||||
# WP-24c v4.2.6: Erstelle Callout-Block mit is_meta_content = True
|
||||
# Callouts werden gechunkt (für Chunk-Attribution), aber später entfernt (Clean-Context)
|
||||
callout_content = "\n".join(callout_lines)
|
||||
blocks.append(RawBlock(
|
||||
"callout", callout_content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
|
||||
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung
|
||||
))
|
||||
continue
|
||||
|
||||
# Heading-Erkennung (H1 bis H6)
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
|
||||
if heading_match:
|
||||
|
|
@ -148,15 +202,22 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
|
||||
return blocks, h1_title
|
||||
|
||||
def parse_edges_robust(text: str) -> Set[str]:
|
||||
"""Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts."""
|
||||
found_edges = set()
|
||||
def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts.
|
||||
WP-24c v4.2.7: Gibt Liste von Dicts zurück mit is_callout Flag für Chunk-Attribution.
|
||||
|
||||
Returns:
|
||||
List[Dict] mit keys: "edge" (str: "kind:target"), "is_callout" (bool)
|
||||
"""
|
||||
found_edges: List[Dict[str, any]] = []
|
||||
# 1. Wikilinks [[rel:kind|target]]
|
||||
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
||||
for kind, target in inlines:
|
||||
k = kind.strip().lower()
|
||||
t = target.strip()
|
||||
if k and t: found_edges.add(f"{k}:{t}")
|
||||
if k and t:
|
||||
found_edges.append({"edge": f"{k}:{t}", "is_callout": False})
|
||||
|
||||
# 2. Callout Edges > [!edge] kind
|
||||
lines = text.split('\n')
|
||||
|
|
@ -169,13 +230,15 @@ def parse_edges_robust(text: str) -> Set[str]:
|
|||
# Links in der gleichen Zeile des Callouts
|
||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||
for l in links:
|
||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||
if "rel:" not in l:
|
||||
found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
|
||||
continue
|
||||
# Links in Folgezeilen des Callouts
|
||||
if current_edge_type and stripped.startswith('>'):
|
||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||
for l in links:
|
||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||
if "rel:" not in l:
|
||||
found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
|
||||
elif not stripped.startswith('>'):
|
||||
current_edge_type = None
|
||||
return found_edges
|
||||
|
|
@ -7,6 +7,16 @@ DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
|
|||
- Stellt H1-Kontext-Fenster sicher.
|
||||
- Baut den Candidate-Pool für die WP-15b Ingestion auf.
|
||||
WP-24c v4.2.0: Konfigurierbare Header-Namen für LLM-Validierung.
|
||||
WP-24c v4.2.5: Wiederherstellung der Chunking-Präzision
|
||||
- Frontmatter-Override für chunking_profile
|
||||
- Callout-Exclusion aus Chunks
|
||||
- Strict-Mode ohne Carry-Over
|
||||
WP-24c v4.2.6: Finale Härtung - "Semantic First, Clean Second"
|
||||
- Callouts werden gechunkt (Chunk-Attribution), aber später entfernt (Clean-Context)
|
||||
- remove_callouts_from_text erst nach propagate_section_edges und Candidate Pool
|
||||
WP-24c v4.2.7: Wiederherstellung der Chunk-Attribution
|
||||
- Callout-Kanten erhalten explicit:callout Provenance im candidate_pool
|
||||
- graph_derive_edges.py erkennt diese und verhindert Note-Scope Duplikate
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
|
|
@ -25,16 +35,19 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
"""
|
||||
Hauptfunktion zur Zerlegung einer Note.
|
||||
Verbindet Strategien mit physikalischer Kontext-Anreicherung.
|
||||
WP-24c v4.2.5: Frontmatter-Override für chunking_profile wird berücksichtigt.
|
||||
"""
|
||||
# 1. Konfiguration & Parsing
|
||||
if config is None:
|
||||
config = get_chunk_config(note_type)
|
||||
|
||||
# 1. WP-24c v4.2.5: Frontmatter VOR Konfiguration extrahieren (für Override)
|
||||
fm, body_text = extract_frontmatter_from_text(md_text)
|
||||
|
||||
# 2. Konfiguration mit Frontmatter-Override
|
||||
if config is None:
|
||||
config = get_chunk_config(note_type, frontmatter=fm)
|
||||
|
||||
blocks, doc_title = parse_blocks(md_text)
|
||||
|
||||
# WP-24c v4.2.0: Filtere Blöcke aus Edge-Zonen (LLM-Validierung & Note-Scope)
|
||||
# Diese Bereiche sollen nicht als Chunks angelegt werden, sondern nur die Kanten extrahiert werden
|
||||
# WP-24c v4.2.6: Filtere NUR Edge-Zonen (LLM-Validierung & Note-Scope)
|
||||
# Callouts (is_meta_content=True) müssen durch, damit Chunk-Attribution erhalten bleibt
|
||||
blocks_for_chunking = [b for b in blocks if not getattr(b, 'exclude_from_chunking', False)]
|
||||
|
||||
# Vorbereitung des H1-Präfix für die Embedding-Fenster (Breadcrumbs)
|
||||
|
|
@ -42,6 +55,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
|
||||
# 2. Anwendung der Splitting-Strategie
|
||||
# Alle Strategien nutzen nun einheitlich context_prefix für die Window-Bildung.
|
||||
# WP-24c v4.2.6: Callouts sind in blocks_for_chunking enthalten (für Chunk-Attribution)
|
||||
if config.get("strategy") == "by_heading":
|
||||
chunks = await asyncio.to_thread(
|
||||
strategy_by_heading, blocks_for_chunking, config, note_id, context_prefix=h1_prefix
|
||||
|
|
@ -55,21 +69,27 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
return []
|
||||
|
||||
# 3. Physikalische Kontext-Anreicherung (Der Qualitäts-Fix)
|
||||
# WP-24c v4.2.6: Arbeite auf Original-Text inkl. Callouts (für korrekte Chunk-Attribution)
|
||||
# Schreibt Kanten aus Callouts/Inlines hart in den Text für Qdrant.
|
||||
chunks = propagate_section_edges(chunks)
|
||||
|
||||
# 4. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
|
||||
# 5. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
|
||||
# WP-24c v4.2.7: Markiere Callout-Kanten explizit für Chunk-Attribution
|
||||
# Zuerst die explizit im Text vorhandenen Kanten sammeln.
|
||||
for ch in chunks:
|
||||
# Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
|
||||
# ch.candidate_pool wird im Modell-Konstruktor als leere Liste initialisiert.
|
||||
for e_str in parse_edges_robust(ch.text):
|
||||
parts = e_str.split(':', 1)
|
||||
for edge_info in parse_edges_robust(ch.text):
|
||||
edge_str = edge_info["edge"]
|
||||
is_callout = edge_info.get("is_callout", False)
|
||||
parts = edge_str.split(':', 1)
|
||||
if len(parts) == 2:
|
||||
k, t = parts
|
||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"})
|
||||
# WP-24c v4.2.7: Callout-Kanten erhalten explicit:callout Provenance
|
||||
provenance = "explicit:callout" if is_callout else "explicit"
|
||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance})
|
||||
|
||||
# 5. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
|
||||
# 6. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
|
||||
# WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebene via .env
|
||||
# Sucht nach ALLEN Edge-Pool Blöcken im Original-Markdown (nicht nur am Ende).
|
||||
llm_validation_headers = os.getenv(
|
||||
|
|
@ -93,15 +113,16 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
|
||||
for pool_match in re.finditer(zone_pattern, body_text, re.DOTALL | re.IGNORECASE | re.MULTILINE):
|
||||
global_edges = parse_edges_robust(pool_match.group(1))
|
||||
for e_str in global_edges:
|
||||
parts = e_str.split(':', 1)
|
||||
for edge_info in global_edges:
|
||||
edge_str = edge_info["edge"]
|
||||
parts = edge_str.split(':', 1)
|
||||
if len(parts) == 2:
|
||||
k, t = parts
|
||||
# Diese Kanten werden als "global_pool" markiert für die spätere KI-Prüfung.
|
||||
for ch in chunks:
|
||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})
|
||||
|
||||
# 6. De-Duplikation des Pools & Linking
|
||||
# 7. De-Duplikation des Pools & Linking
|
||||
for ch in chunks:
|
||||
seen = set()
|
||||
unique = []
|
||||
|
|
@ -113,6 +134,54 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
unique.append(c)
|
||||
ch.candidate_pool = unique
|
||||
|
||||
# 8. WP-24c v4.2.6: Clean-Context - Entferne Callout-Syntax aus Chunk-Text
|
||||
# WICHTIG: Dies geschieht NACH propagate_section_edges und Candidate Pool Aufbau,
|
||||
# damit Chunk-Attribution erhalten bleibt und Kanten korrekt extrahiert werden.
|
||||
# Hinweis: Callouts können mehrzeilig sein (auch verschachtelt: >>)
|
||||
def remove_callouts_from_text(text: str) -> str:
    """Remove ``[!edge]`` / ``[!abstract]`` callout blocks from *text*.

    A callout block starts on a line matching ``> [!edge]`` or
    ``> [!abstract]`` (case-insensitive, optionally indented and/or nested
    via ``>>``) and extends over all directly following quote lines whose
    ``>`` depth is at least the depth of the start line.

    After the callout lines are dropped, runs of three or more line breaks
    (optionally containing whitespace-only lines) are collapsed to a single
    blank line.

    Args:
        text: Chunk text that may contain callout blocks.

    Returns:
        The text without callout blocks. Falsy input (``""`` / ``None``) is
        returned unchanged.
    """
    if not text:
        return text

    callout_start_pattern = re.compile(
        r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE
    )

    def _quote_depth(raw_line: str) -> int:
        # Count the contiguous '>' markers AFTER leading whitespace. The start
        # pattern and the continuation check both tolerate indentation, so the
        # depth must be measured the same way; counting on the raw line would
        # report depth 0 for every indented quote line.
        unindented = raw_line.lstrip()
        return len(unindented) - len(unindented.lstrip('>'))

    lines = text.split('\n')
    total = len(lines)
    kept_lines = []
    idx = 0

    while idx < total:
        line = lines[idx]

        if not callout_start_pattern.match(line):
            # Regular line: keep it.
            kept_lines.append(line)
            idx += 1
            continue

        # Callout start found: skip it together with all continuation lines.
        start_depth = _quote_depth(line)
        idx += 1
        while idx < total:
            candidate = lines[idx]
            # A non-quote line ends the callout block.
            if not candidate.lstrip().startswith('>'):
                break
            # A shallower quote belongs to an outer block, not to this callout.
            if _quote_depth(candidate) < start_depth:
                break
            idx += 1

    # Normalise blank lines left behind by the removal (max. one empty line).
    result = '\n'.join(kept_lines)
    return re.sub(r'\n\s*\n\s*\n+', '\n\n', result)
|
||||
|
||||
for ch in chunks:
|
||||
ch.text = remove_callouts_from_text(ch.text)
|
||||
if ch.window:
|
||||
ch.window = remove_callouts_from_text(ch.window)
|
||||
|
||||
# Verknüpfung der Nachbarschaften für Graph-Traversierung
|
||||
for i, ch in enumerate(chunks):
|
||||
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
||||
|
|
|
|||
|
|
@ -22,11 +22,13 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
|||
continue
|
||||
|
||||
# Nutzt den robusten Parser aus dem Package
|
||||
edges = parse_edges_robust(ch.text)
|
||||
if edges:
|
||||
# WP-24c v4.2.7: parse_edges_robust gibt jetzt Liste von Dicts zurück
|
||||
edge_infos = parse_edges_robust(ch.text)
|
||||
if edge_infos:
|
||||
if ch.section_path not in section_map:
|
||||
section_map[ch.section_path] = set()
|
||||
section_map[ch.section_path].update(edges)
|
||||
for edge_info in edge_infos:
|
||||
section_map[ch.section_path].add(edge_info["edge"])
|
||||
|
||||
# 2. Injizieren: Kanten in jeden Chunk der Sektion zurückschreiben (Broadcasting)
|
||||
for ch in chunks:
|
||||
|
|
@ -37,7 +39,9 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
|||
|
||||
# Vorhandene Kanten (Typ:Ziel) in DIESEM Chunk ermitteln,
|
||||
# um Dopplungen (z.B. durch Callouts) zu vermeiden.
|
||||
existing_edges = parse_edges_robust(ch.text)
|
||||
# WP-24c v4.2.7: parse_edges_robust gibt jetzt Liste von Dicts zurück
|
||||
existing_edge_infos = parse_edges_robust(ch.text)
|
||||
existing_edges = {ei["edge"] for ei in existing_edge_infos}
|
||||
|
||||
injections = []
|
||||
# Sortierung für deterministische Ergebnisse
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
|||
- Keine redundante Kanten-Injektion.
|
||||
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
||||
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
||||
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
||||
"""
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock, Chunk
|
||||
|
|
@ -83,23 +84,46 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
|||
current_meta["title"] = item["meta"].section_title
|
||||
current_meta["path"] = item["meta"].section_path
|
||||
|
||||
# FALL A: HARD SPLIT MODUS
|
||||
# FALL A: HARD SPLIT MODUS (WP-24c v4.2.5: Strict-Mode ohne Carry-Over)
|
||||
if is_hard_split_mode:
|
||||
# Leere Überschriften (z.B. H1 direkt vor H2) verbleiben am nächsten Chunk
|
||||
if item.get("is_empty", False) and queue:
|
||||
current_chunk_text = (current_chunk_text + "\n\n" + item_text).strip()
|
||||
continue
|
||||
|
||||
combined = (current_chunk_text + "\n\n" + item_text).strip()
|
||||
# Wenn durch Verschmelzung das Limit gesprengt würde, vorher flashen
|
||||
if estimate_tokens(combined) > max_tokens and current_chunk_text:
|
||||
# WP-24c v4.2.5: Bei strict_heading_split: true wird nach JEDER Sektion geflasht
|
||||
# Kein Carry-Over erlaubt, auch nicht für leere Überschriften
|
||||
if current_chunk_text:
|
||||
# Flashe vorherigen Chunk
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||
current_chunk_text = item_text
|
||||
else:
|
||||
current_chunk_text = combined
|
||||
current_chunk_text = ""
|
||||
|
||||
# Neue Sektion: Initialisiere Meta
|
||||
current_meta["title"] = item["meta"].section_title
|
||||
current_meta["path"] = item["meta"].section_path
|
||||
|
||||
# WP-24c v4.2.5: Auch leere Sektionen werden als separater Chunk erstellt
|
||||
# (nur Überschrift, kein Inhalt)
|
||||
if item.get("is_empty", False):
|
||||
# Leere Sektion: Nur Überschrift als Chunk
|
||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
||||
else:
|
||||
# Normale Sektion: Prüfe auf Token-Limit
|
||||
if estimate_tokens(item_text) > max_tokens:
|
||||
# Sektion zu groß: Smart Zerlegung (aber trotzdem in separaten Chunks)
|
||||
sents = split_sentences(item_text)
|
||||
header_prefix = item["meta"].text if item["meta"].kind == "heading" else ""
|
||||
|
||||
take_sents = []; take_len = 0
|
||||
while sents:
|
||||
s = sents.pop(0); slen = estimate_tokens(s)
|
||||
if take_len + slen > target and take_sents:
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||
take_sents = [s]; take_len = slen
|
||||
else:
|
||||
take_sents.append(s); take_len += slen
|
||||
|
||||
if take_sents:
|
||||
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||
else:
|
||||
# Sektion passt: Direkt als Chunk
|
||||
_emit(item_text, current_meta["title"], current_meta["path"])
|
||||
|
||||
# Im Hard-Split wird nach jeder Sektion geflasht
|
||||
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||
current_chunk_text = ""
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -27,12 +27,31 @@ def load_yaml_config() -> Dict[str, Any]:
|
|||
return data
|
||||
except Exception: return {}
|
||||
|
||||
def get_chunk_config(note_type: str) -> Dict[str, Any]:
|
||||
"""Lädt die Chunking-Strategie basierend auf dem Note-Type."""
|
||||
def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Lädt die Chunking-Strategie basierend auf dem Note-Type.
|
||||
WP-24c v4.2.5: Frontmatter-Override für chunking_profile hat höchste Priorität.
|
||||
|
||||
Args:
|
||||
note_type: Der Typ der Note (z.B. "decision", "experience")
|
||||
frontmatter: Optionales Frontmatter-Dict mit chunking_profile Override
|
||||
|
||||
Returns:
|
||||
Dict mit Chunking-Konfiguration
|
||||
"""
|
||||
full_config = load_yaml_config()
|
||||
profiles = full_config.get("chunking_profiles", {})
|
||||
type_def = full_config.get("types", {}).get(note_type.lower(), {})
|
||||
profile_name = type_def.get("chunking_profile") or full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
|
||||
|
||||
# WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults
|
||||
profile_name = None
|
||||
if frontmatter and "chunking_profile" in frontmatter:
|
||||
profile_name = frontmatter.get("chunking_profile")
|
||||
if not profile_name:
|
||||
profile_name = type_def.get("chunking_profile")
|
||||
if not profile_name:
|
||||
profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")
|
||||
|
||||
config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
|
||||
if "overlap" in config and isinstance(config["overlap"], list):
|
||||
config["overlap"] = tuple(config["overlap"])
|
||||
|
|
|
|||
|
|
@ -209,6 +209,7 @@ def build_edges_for_note(
|
|||
"""
|
||||
Erzeugt und aggregiert alle Kanten für eine Note.
|
||||
WP-24c v4.2.0: Unterstützt Note-Scope Extraktions-Zonen.
|
||||
WP-24c v4.2.7: Chunk-Attribution für Callouts über candidate_pool mit explicit:callout Provenance.
|
||||
|
||||
Args:
|
||||
note_id: ID der Note
|
||||
|
|
@ -313,11 +314,17 @@ def build_edges_for_note(
|
|||
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||
|
||||
# B. Candidate Pool (WP-15b Validierte KI-Kanten)
|
||||
# WP-24c v4.2.7: Sammle Callout-Keys für Chunk-Attribution
|
||||
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
||||
for cand in pool:
|
||||
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
||||
t, sec = parse_link_target(raw_t, note_id)
|
||||
if t:
|
||||
# WP-24c v4.2.7: Wenn Provenance explicit:callout, füge zu all_chunk_callout_keys hinzu
|
||||
# Dadurch weiß die globale Extraktion, dass diese Kante bereits auf Chunk-Ebene versorgt ist
|
||||
if p == "explicit:callout":
|
||||
all_chunk_callout_keys.add((k, t, sec))
|
||||
|
||||
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||
payload = {
|
||||
"chunk_id": cid,
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ PROVENANCE_PRIORITY = {
|
|||
"explicit:wikilink": 1.00,
|
||||
"inline:rel": 0.95,
|
||||
"callout:edge": 0.90,
|
||||
"explicit:callout": 0.90, # WP-24c v4.2.7: Callout-Kanten aus candidate_pool
|
||||
"semantic_ai": 0.90, # Validierte KI-Kanten
|
||||
"structure:belongs_to": 1.00,
|
||||
"structure:order": 0.95, # next/prev
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user