WP4d #16
|
|
@ -1,9 +1,7 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_propagation.py
|
FILE: app/core/chunking/chunking_propagation.py
|
||||||
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
||||||
Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her.
|
Fix v3.3.5: Erkennt Wikilink-Targets, um Dopplungen zu verhindern.
|
||||||
VERSION: 3.3.1
|
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Set
|
from typing import List, Dict, Set
|
||||||
from .chunking_models import Chunk
|
from .chunking_models import Chunk
|
||||||
|
|
@ -12,7 +10,7 @@ from .chunking_parser import parse_edges_robust
|
||||||
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
||||||
Dies ist essenziell für die Vektorisierung der Beziehungen.
|
Verhindert Dopplungen, wenn Kanten bereits via [!edge] Callout vorhanden sind.
|
||||||
"""
|
"""
|
||||||
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
||||||
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
||||||
|
|
@ -39,18 +37,21 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
injections = []
|
injections = []
|
||||||
for e_str in edges_to_add:
|
for e_str in edges_to_add:
|
||||||
kind, target = e_str.split(':', 1)
|
kind, target = e_str.split(':', 1)
|
||||||
# Nur injizieren, wenn die Kante nicht bereits im Text steht
|
|
||||||
token = f"[[rel:{kind}|{target}]]"
|
# DER FIX: Wir prüfen, ob das Ziel (target) bereits im Text vorkommt.
|
||||||
if token not in ch.text:
|
# Wir suchen nach [[target]] (Callout-Stil) oder |target]] (Rel-Stil).
|
||||||
injections.append(token)
|
if f"[[{target}]]" in ch.text or f"|{target}]]" in ch.text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
injections.append(f"[[rel:{kind}|{target}]]")
|
||||||
|
|
||||||
if injections:
|
if injections:
|
||||||
# Physische Anreicherung (Der v3.1.0 Qualitäts-Fix)
|
# Physische Anreicherung
|
||||||
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
||||||
block = "\n\n\n" + " ".join(injections)
|
block = "\n\n\n" + " ".join(injections)
|
||||||
ch.text += block
|
ch.text += block
|
||||||
|
|
||||||
# ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht!
|
# Auch ins Window schreiben, da Qdrant hier sucht!
|
||||||
if ch.window:
|
if ch.window:
|
||||||
ch.window += block
|
ch.window += block
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||||
FEATURES:
|
FEATURES:
|
||||||
- Multi-Hash (body/full) für flexible Change Detection.
|
- Multi-Hash (body/full) für flexible Change Detection.
|
||||||
- Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
- Fix v2.4.5: Präzise Hash-Logik für Profil-Änderungen.
|
||||||
VERSION: 2.4.4
|
- Integration der zentralen Registry (WP-14).
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Any, Dict, Tuple, Optional
|
from typing import Any, Dict, Tuple, Optional
|
||||||
|
|
@ -45,14 +44,22 @@ def _compute_hash(content: str) -> str:
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||||
"""Generiert den Hash-Input-String basierend auf Body oder Metadaten."""
|
"""
|
||||||
body = str(n.get("body") or "")
|
Generiert den Hash-Input-String basierend auf Body oder Metadaten.
|
||||||
|
Fix: Inkludiert nun alle entscheidungsrelevanten Profil-Parameter.
|
||||||
|
"""
|
||||||
|
body = str(n.get("body") or "").strip()
|
||||||
if mode == "body": return body
|
if mode == "body": return body
|
||||||
if mode == "full":
|
if mode == "full":
|
||||||
fm = n.get("frontmatter") or {}
|
fm = n.get("frontmatter") or {}
|
||||||
meta_parts = []
|
meta_parts = []
|
||||||
# Sortierte Liste für deterministische Hashes
|
# Wir inkludieren alle Felder, die das Chunking oder Retrieval beeinflussen
|
||||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
keys = [
|
||||||
|
"title", "type", "status", "tags",
|
||||||
|
"chunking_profile", "chunk_profile",
|
||||||
|
"retriever_weight", "split_level", "strict_heading_split"
|
||||||
|
]
|
||||||
|
for k in sorted(keys):
|
||||||
val = fm.get(k)
|
val = fm.get(k)
|
||||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||||
return f"{'|'.join(meta_parts)}||{body}"
|
return f"{'|'.join(meta_parts)}||{body}"
|
||||||
|
|
@ -79,11 +86,11 @@ def _cfg_defaults(reg: dict) -> dict:
|
||||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
||||||
WP-14: Nutzt nun die zentrale Registry für alle Fallbacks.
|
WP-14: Nutzt die zentrale Registry für alle Fallbacks.
|
||||||
"""
|
"""
|
||||||
n = _as_dict(note)
|
n = _as_dict(note)
|
||||||
|
|
||||||
# Nutzt übergebene Registry oder lädt sie global
|
# Registry & Context Settings
|
||||||
reg = kwargs.get("types_cfg") or load_type_registry()
|
reg = kwargs.get("types_cfg") or load_type_registry()
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
hash_source = kwargs.get("hash_source", "parsed")
|
||||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||||
|
|
@ -96,7 +103,6 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
ingest_cfg = reg.get("ingestion_settings", {})
|
ingest_cfg = reg.get("ingestion_settings", {})
|
||||||
|
|
||||||
# --- retriever_weight Audit ---
|
# --- retriever_weight Audit ---
|
||||||
# Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var
|
|
||||||
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
||||||
retriever_weight = fm.get("retriever_weight")
|
retriever_weight = fm.get("retriever_weight")
|
||||||
if retriever_weight is None:
|
if retriever_weight is None:
|
||||||
|
|
@ -107,14 +113,13 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
retriever_weight = default_rw
|
retriever_weight = default_rw
|
||||||
|
|
||||||
# --- chunk_profile Audit ---
|
# --- chunk_profile Audit ---
|
||||||
# Nutzt nun primär die ingestion_settings aus der Registry
|
|
||||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
||||||
|
|
||||||
# --- edge_defaults ---
|
# --- edge_defaults Audit ---
|
||||||
edge_defaults = fm.get("edge_defaults")
|
edge_defaults = fm.get("edge_defaults")
|
||||||
if edge_defaults is None:
|
if edge_defaults is None:
|
||||||
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
||||||
|
|
@ -138,21 +143,23 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- MULTI-HASH ---
|
# --- MULTI-HASH ---
|
||||||
# Generiert Hashes für Change Detection
|
|
||||||
for mode in ["body", "full"]:
|
for mode in ["body", "full"]:
|
||||||
content = _get_hash_source_content(n, mode)
|
content = _get_hash_source_content(n, mode)
|
||||||
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
||||||
|
|
||||||
# Metadaten Anreicherung
|
# Metadaten Anreicherung (Tags, Aliases, Zeitstempel)
|
||||||
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||||
if tags: payload["tags"] = _ensure_list(tags)
|
if tags: payload["tags"] = _ensure_list(tags)
|
||||||
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
|
|
||||||
|
aliases = fm.get("aliases")
|
||||||
|
if aliases: payload["aliases"] = _ensure_list(aliases)
|
||||||
|
|
||||||
for k in ("created", "modified", "date"):
|
for k in ("created", "modified", "date"):
|
||||||
v = fm.get(k) or n.get(k)
|
v = fm.get(k) or n.get(k)
|
||||||
if v: payload[k] = str(v)
|
if v: payload[k] = str(v)
|
||||||
|
|
||||||
if n.get("body"): payload["fulltext"] = str(n["body"])
|
if n.get("body"):
|
||||||
|
payload["fulltext"] = str(n["body"])
|
||||||
|
|
||||||
# Final JSON Validation Audit
|
# Final JSON Validation Audit
|
||||||
json.loads(json.dumps(payload, ensure_ascii=False))
|
json.loads(json.dumps(payload, ensure_ascii=False))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user