Fix v3.3.5: Prevent duplicate wikilink targets in chunk text by checking whether a target is already referenced before injecting section edges. Update comments for clarity and keep the code structure consistent.

This commit is contained in:
Lars 2025-12-30 08:22:17 +01:00
parent 65d697b7be
commit 33dff04d47
2 changed files with 34 additions and 26 deletions

View File

@@ -1,9 +1,7 @@
""" """
FILE: app/core/chunking/chunking_propagation.py FILE: app/core/chunking/chunking_propagation.py
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment). DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her. Fix v3.3.5: Erkennt Wikilink-Targets, um Dopplungen zu verhindern.
VERSION: 3.3.1
STATUS: Active
""" """
from typing import List, Dict, Set from typing import List, Dict, Set
from .chunking_models import Chunk from .chunking_models import Chunk
@@ -12,7 +10,7 @@ from .chunking_parser import parse_edges_robust
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
""" """
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window. Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
Dies ist essenziell für die Vektorisierung der Beziehungen. Verhindert Dopplungen, wenn Kanten bereits via [!edge] Callout vorhanden sind.
""" """
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren # 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target) section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
@@ -39,18 +37,21 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
injections = [] injections = []
for e_str in edges_to_add: for e_str in edges_to_add:
kind, target = e_str.split(':', 1) kind, target = e_str.split(':', 1)
# Nur injizieren, wenn die Kante nicht bereits im Text steht
token = f"[[rel:{kind}|{target}]]" # DER FIX: Wir prüfen, ob das Ziel (target) bereits im Text vorkommt.
if token not in ch.text: # Wir suchen nach [[target]] (Callout-Stil) oder |target]] (Rel-Stil).
injections.append(token) if f"[[{target}]]" in ch.text or f"|{target}]]" in ch.text:
continue
injections.append(f"[[rel:{kind}|{target}]]")
if injections: if injections:
# Physische Anreicherung (Der v3.1.0 Qualitäts-Fix) # Physische Anreicherung
# Triple-Newline für saubere Trennung im Embedding-Fenster # Triple-Newline für saubere Trennung im Embedding-Fenster
block = "\n\n\n" + " ".join(injections) block = "\n\n\n" + " ".join(injections)
ch.text += block ch.text += block
# ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht! # Auch ins Window schreiben, da Qdrant hier sucht!
if ch.window: if ch.window:
ch.window += block ch.window += block
else: else:

View File

@@ -3,9 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
FEATURES: FEATURES:
- Multi-Hash (body/full) für flexible Change Detection. - Multi-Hash (body/full) für flexible Change Detection.
- Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults. - Fix v2.4.5: Präzise Hash-Logik für Profil-Änderungen.
VERSION: 2.4.4 - Integration der zentralen Registry (WP-14).
STATUS: Active
""" """
from __future__ import annotations from __future__ import annotations
from typing import Any, Dict, Tuple, Optional from typing import Any, Dict, Tuple, Optional
@@ -45,14 +44,22 @@ def _compute_hash(content: str) -> str:
return hashlib.sha256(content.encode("utf-8")).hexdigest() return hashlib.sha256(content.encode("utf-8")).hexdigest()
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
"""Generiert den Hash-Input-String basierend auf Body oder Metadaten.""" """
body = str(n.get("body") or "") Generiert den Hash-Input-String basierend auf Body oder Metadaten.
Fix: Inkludiert nun alle entscheidungsrelevanten Profil-Parameter.
"""
body = str(n.get("body") or "").strip()
if mode == "body": return body if mode == "body": return body
if mode == "full": if mode == "full":
fm = n.get("frontmatter") or {} fm = n.get("frontmatter") or {}
meta_parts = [] meta_parts = []
# Sortierte Liste für deterministische Hashes # Wir inkludieren alle Felder, die das Chunking oder Retrieval beeinflussen
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): keys = [
"title", "type", "status", "tags",
"chunking_profile", "chunk_profile",
"retriever_weight", "split_level", "strict_heading_split"
]
for k in sorted(keys):
val = fm.get(k) val = fm.get(k)
if val is not None: meta_parts.append(f"{k}:{val}") if val is not None: meta_parts.append(f"{k}:{val}")
return f"{'|'.join(meta_parts)}||{body}" return f"{'|'.join(meta_parts)}||{body}"
@@ -79,11 +86,11 @@ def _cfg_defaults(reg: dict) -> dict:
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
""" """
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung. Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
WP-14: Nutzt nun die zentrale Registry für alle Fallbacks. WP-14: Nutzt die zentrale Registry für alle Fallbacks.
""" """
n = _as_dict(note) n = _as_dict(note)
# Nutzt übergebene Registry oder lädt sie global # Registry & Context Settings
reg = kwargs.get("types_cfg") or load_type_registry() reg = kwargs.get("types_cfg") or load_type_registry()
hash_source = kwargs.get("hash_source", "parsed") hash_source = kwargs.get("hash_source", "parsed")
hash_normalize = kwargs.get("hash_normalize", "canonical") hash_normalize = kwargs.get("hash_normalize", "canonical")
@@ -96,7 +103,6 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
ingest_cfg = reg.get("ingestion_settings", {}) ingest_cfg = reg.get("ingestion_settings", {})
# --- retriever_weight Audit --- # --- retriever_weight Audit ---
# Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)) default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
retriever_weight = fm.get("retriever_weight") retriever_weight = fm.get("retriever_weight")
if retriever_weight is None: if retriever_weight is None:
@@ -107,14 +113,13 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
retriever_weight = default_rw retriever_weight = default_rw
# --- chunk_profile Audit --- # --- chunk_profile Audit ---
# Nutzt nun primär die ingestion_settings aus der Registry
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
if chunk_profile is None: if chunk_profile is None:
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile") chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
if chunk_profile is None: if chunk_profile is None:
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard")) chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
# --- edge_defaults --- # --- edge_defaults Audit ---
edge_defaults = fm.get("edge_defaults") edge_defaults = fm.get("edge_defaults")
if edge_defaults is None: if edge_defaults is None:
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", [])) edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
@@ -138,21 +143,23 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
} }
# --- MULTI-HASH --- # --- MULTI-HASH ---
# Generiert Hashes für Change Detection
for mode in ["body", "full"]: for mode in ["body", "full"]:
content = _get_hash_source_content(n, mode) content = _get_hash_source_content(n, mode)
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content) payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
# Metadaten Anreicherung # Metadaten Anreicherung (Tags, Aliases, Zeitstempel)
tags = fm.get("tags") or fm.get("keywords") or n.get("tags") tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
if tags: payload["tags"] = _ensure_list(tags) if tags: payload["tags"] = _ensure_list(tags)
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
aliases = fm.get("aliases")
if aliases: payload["aliases"] = _ensure_list(aliases)
for k in ("created", "modified", "date"): for k in ("created", "modified", "date"):
v = fm.get(k) or n.get(k) v = fm.get(k) or n.get(k)
if v: payload[k] = str(v) if v: payload[k] = str(v)
if n.get("body"): payload["fulltext"] = str(n["body"]) if n.get("body"):
payload["fulltext"] = str(n["body"])
# Final JSON Validation Audit # Final JSON Validation Audit
json.loads(json.dumps(payload, ensure_ascii=False)) json.loads(json.dumps(payload, ensure_ascii=False))