From 33dff04d4707c3e8d6f00b44e6f57783206c54fc Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 30 Dec 2025 08:22:17 +0100 Subject: [PATCH] Fix v3.3.5: Prevent duplicate Wikilink targets in text by checking for existing references before injecting section edges. Update comments for clarity and maintain consistency in the code structure. --- app/core/chunking/chunking_propagation.py | 21 ++++++----- app/core/ingestion/ingestion_note_payload.py | 39 ++++++++++++-------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/app/core/chunking/chunking_propagation.py b/app/core/chunking/chunking_propagation.py index 099d075..af68442 100644 --- a/app/core/chunking/chunking_propagation.py +++ b/app/core/chunking/chunking_propagation.py @@ -1,9 +1,7 @@ """ FILE: app/core/chunking/chunking_propagation.py DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment). - Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her. -VERSION: 3.3.1 -STATUS: Active + Fix v3.3.5: Erkennt Wikilink-Targets, um Dopplungen zu verhindern. """ from typing import List, Dict, Set from .chunking_models import Chunk @@ -12,7 +10,7 @@ from .chunking_parser import parse_edges_robust def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: """ Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window. - Dies ist essenziell für die Vektorisierung der Beziehungen. + Verhindert Dopplungen, wenn Kanten bereits via [!edge] Callout vorhanden sind. """ # 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren section_map: Dict[str, Set[str]] = {} # path -> set(kind:target) @@ -39,18 +37,21 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]: injections = [] for e_str in edges_to_add: kind, target = e_str.split(':', 1) - # Nur injizieren, wenn die Kante nicht bereits im Text steht - token = f"[[rel:{kind}|{target}]]" - if token not in ch.text: - injections.append(token) + + # DER FIX: Wir prüfen, ob das Ziel (target) bereits im Text vorkommt. + # Wir suchen nach [[target]] (Callout-Stil) oder |target]] (Rel-Stil). + if f"[[{target}]]" in ch.text or f"|{target}]]" in ch.text: + continue + + injections.append(f"[[rel:{kind}|{target}]]") if injections: - # Physische Anreicherung (Der v3.1.0 Qualitäts-Fix) + # Physische Anreicherung # Triple-Newline für saubere Trennung im Embedding-Fenster block = "\n\n\n" + " ".join(injections) ch.text += block - # ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht! + # Auch ins Window schreiben, da Qdrant hier sucht! if ch.window: ch.window += block else: diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py index d41410b..3df4d4a 100644 --- a/app/core/ingestion/ingestion_note_payload.py +++ b/app/core/ingestion/ingestion_note_payload.py @@ -3,9 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. FEATURES: - Multi-Hash (body/full) für flexible Change Detection. - - Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults. -VERSION: 2.4.4 -STATUS: Active + - Fix v2.4.5: Präzise Hash-Logik für Profil-Änderungen. + - Integration der zentralen Registry (WP-14). """ from __future__ import annotations from typing import Any, Dict, Tuple, Optional @@ -45,14 +44,22 @@ def _compute_hash(content: str) -> str: return hashlib.sha256(content.encode("utf-8")).hexdigest() def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: - """Generiert den Hash-Input-String basierend auf Body oder Metadaten.""" - body = str(n.get("body") or "") + """ + Generiert den Hash-Input-String basierend auf Body oder Metadaten. + Fix: Inkludiert nun alle entscheidungsrelevanten Profil-Parameter. + """ + body = str(n.get("body") or "").strip() if mode == "body": return body if mode == "full": fm = n.get("frontmatter") or {} meta_parts = [] - # Sortierte Liste für deterministische Hashes - for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): + # Wir inkludieren alle Felder, die das Chunking oder Retrieval beeinflussen + keys = [ + "title", "type", "status", "tags", + "chunking_profile", "chunk_profile", + "retriever_weight", "split_level", "strict_heading_split" + ] + for k in sorted(keys): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") return f"{'|'.join(meta_parts)}||{body}" @@ -79,11 +86,11 @@ def _cfg_defaults(reg: dict) -> dict: def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: """ Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung. - WP-14: Nutzt nun die zentrale Registry für alle Fallbacks. + WP-14: Nutzt die zentrale Registry für alle Fallbacks. """ n = _as_dict(note) - # Nutzt übergebene Registry oder lädt sie global + # Registry & Context Settings reg = kwargs.get("types_cfg") or load_type_registry() hash_source = kwargs.get("hash_source", "parsed") hash_normalize = kwargs.get("hash_normalize", "canonical") @@ -96,7 +103,6 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: ingest_cfg = reg.get("ingestion_settings", {}) # --- retriever_weight Audit --- - # Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)) retriever_weight = fm.get("retriever_weight") if retriever_weight is None: @@ -107,14 +113,13 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: retriever_weight = default_rw # --- chunk_profile Audit --- - # Nutzt nun primär die ingestion_settings aus der Registry chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") if chunk_profile is None: chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile") if chunk_profile is None: chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard")) - # --- edge_defaults --- + # --- edge_defaults Audit --- edge_defaults = fm.get("edge_defaults") if edge_defaults is None: edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", [])) @@ -138,21 +143,23 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: } # --- MULTI-HASH --- - # Generiert Hashes für Change Detection for mode in ["body", "full"]: content = _get_hash_source_content(n, mode) payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content) - # Metadaten Anreicherung + # Metadaten Anreicherung (Tags, Aliases, Zeitstempel) tags = fm.get("tags") or fm.get("keywords") or n.get("tags") if tags: payload["tags"] = _ensure_list(tags) - if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) + + aliases = fm.get("aliases") + if aliases: payload["aliases"] = _ensure_list(aliases) for k in ("created", "modified", "date"): v = fm.get(k) or n.get(k) if v: payload[k] = str(v) - if n.get("body"): payload["fulltext"] = str(n["body"]) + if n.get("body"): + payload["fulltext"] = str(n["body"]) # Final JSON Validation Audit json.loads(json.dumps(payload, ensure_ascii=False))