Fix v3.3.5: Prevent duplicate Wikilink targets in the text by checking for existing references before injecting section edges. Update comments for clarity and keep the code structure consistent.
This commit is contained in:
parent
65d697b7be
commit
33dff04d47
|
|
@ -1,9 +1,7 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_propagation.py
|
FILE: app/core/chunking/chunking_propagation.py
|
||||||
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
||||||
Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her.
|
Fix v3.3.5: Erkennt Wikilink-Targets, um Dopplungen zu verhindern.
|
||||||
VERSION: 3.3.1
|
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Set
|
from typing import List, Dict, Set
|
||||||
from .chunking_models import Chunk
|
from .chunking_models import Chunk
|
||||||
|
|
@ -12,7 +10,7 @@ from .chunking_parser import parse_edges_robust
|
||||||
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
||||||
Dies ist essenziell für die Vektorisierung der Beziehungen.
|
Verhindert Dopplungen, wenn Kanten bereits via [!edge] Callout vorhanden sind.
|
||||||
"""
|
"""
|
||||||
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
||||||
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
||||||
|
|
@ -39,18 +37,21 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
injections = []
|
injections = []
|
||||||
for e_str in edges_to_add:
|
for e_str in edges_to_add:
|
||||||
kind, target = e_str.split(':', 1)
|
kind, target = e_str.split(':', 1)
|
||||||
# Nur injizieren, wenn die Kante nicht bereits im Text steht
|
|
||||||
token = f"[[rel:{kind}|{target}]]"
|
# DER FIX: Wir prüfen, ob das Ziel (target) bereits im Text vorkommt.
|
||||||
if token not in ch.text:
|
# Wir suchen nach [[target]] (Callout-Stil) oder |target]] (Rel-Stil).
|
||||||
injections.append(token)
|
if f"[[{target}]]" in ch.text or f"|{target}]]" in ch.text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
injections.append(f"[[rel:{kind}|{target}]]")
|
||||||
|
|
||||||
if injections:
|
if injections:
|
||||||
# Physische Anreicherung (Der v3.1.0 Qualitäts-Fix)
|
# Physische Anreicherung
|
||||||
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
||||||
block = "\n\n\n" + " ".join(injections)
|
block = "\n\n\n" + " ".join(injections)
|
||||||
ch.text += block
|
ch.text += block
|
||||||
|
|
||||||
# ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht!
|
# Auch ins Window schreiben, da Qdrant hier sucht!
|
||||||
if ch.window:
|
if ch.window:
|
||||||
ch.window += block
|
ch.window += block
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||||
FEATURES:
|
FEATURES:
|
||||||
- Multi-Hash (body/full) für flexible Change Detection.
|
- Multi-Hash (body/full) für flexible Change Detection.
|
||||||
- Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
- Fix v2.4.5: Präzise Hash-Logik für Profil-Änderungen.
|
||||||
VERSION: 2.4.4
|
- Integration der zentralen Registry (WP-14).
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Any, Dict, Tuple, Optional
|
from typing import Any, Dict, Tuple, Optional
|
||||||
|
|
@ -45,14 +44,22 @@ def _compute_hash(content: str) -> str:
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||||
"""Generiert den Hash-Input-String basierend auf Body oder Metadaten."""
|
"""
|
||||||
body = str(n.get("body") or "")
|
Generiert den Hash-Input-String basierend auf Body oder Metadaten.
|
||||||
|
Fix: Inkludiert nun alle entscheidungsrelevanten Profil-Parameter.
|
||||||
|
"""
|
||||||
|
body = str(n.get("body") or "").strip()
|
||||||
if mode == "body": return body
|
if mode == "body": return body
|
||||||
if mode == "full":
|
if mode == "full":
|
||||||
fm = n.get("frontmatter") or {}
|
fm = n.get("frontmatter") or {}
|
||||||
meta_parts = []
|
meta_parts = []
|
||||||
# Sortierte Liste für deterministische Hashes
|
# Wir inkludieren alle Felder, die das Chunking oder Retrieval beeinflussen
|
||||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
keys = [
|
||||||
|
"title", "type", "status", "tags",
|
||||||
|
"chunking_profile", "chunk_profile",
|
||||||
|
"retriever_weight", "split_level", "strict_heading_split"
|
||||||
|
]
|
||||||
|
for k in sorted(keys):
|
||||||
val = fm.get(k)
|
val = fm.get(k)
|
||||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||||
return f"{'|'.join(meta_parts)}||{body}"
|
return f"{'|'.join(meta_parts)}||{body}"
|
||||||
|
|
@ -79,11 +86,11 @@ def _cfg_defaults(reg: dict) -> dict:
|
||||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
||||||
WP-14: Nutzt nun die zentrale Registry für alle Fallbacks.
|
WP-14: Nutzt die zentrale Registry für alle Fallbacks.
|
||||||
"""
|
"""
|
||||||
n = _as_dict(note)
|
n = _as_dict(note)
|
||||||
|
|
||||||
# Nutzt übergebene Registry oder lädt sie global
|
# Registry & Context Settings
|
||||||
reg = kwargs.get("types_cfg") or load_type_registry()
|
reg = kwargs.get("types_cfg") or load_type_registry()
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
hash_source = kwargs.get("hash_source", "parsed")
|
||||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||||
|
|
@ -96,7 +103,6 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
ingest_cfg = reg.get("ingestion_settings", {})
|
ingest_cfg = reg.get("ingestion_settings", {})
|
||||||
|
|
||||||
# --- retriever_weight Audit ---
|
# --- retriever_weight Audit ---
|
||||||
# Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var
|
|
||||||
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
||||||
retriever_weight = fm.get("retriever_weight")
|
retriever_weight = fm.get("retriever_weight")
|
||||||
if retriever_weight is None:
|
if retriever_weight is None:
|
||||||
|
|
@ -107,14 +113,13 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
retriever_weight = default_rw
|
retriever_weight = default_rw
|
||||||
|
|
||||||
# --- chunk_profile Audit ---
|
# --- chunk_profile Audit ---
|
||||||
# Nutzt nun primär die ingestion_settings aus der Registry
|
|
||||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
||||||
|
|
||||||
# --- edge_defaults ---
|
# --- edge_defaults Audit ---
|
||||||
edge_defaults = fm.get("edge_defaults")
|
edge_defaults = fm.get("edge_defaults")
|
||||||
if edge_defaults is None:
|
if edge_defaults is None:
|
||||||
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
||||||
|
|
@ -138,21 +143,23 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- MULTI-HASH ---
|
# --- MULTI-HASH ---
|
||||||
# Generiert Hashes für Change Detection
|
|
||||||
for mode in ["body", "full"]:
|
for mode in ["body", "full"]:
|
||||||
content = _get_hash_source_content(n, mode)
|
content = _get_hash_source_content(n, mode)
|
||||||
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
||||||
|
|
||||||
# Metadaten Anreicherung
|
# Metadaten Anreicherung (Tags, Aliases, Zeitstempel)
|
||||||
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||||
if tags: payload["tags"] = _ensure_list(tags)
|
if tags: payload["tags"] = _ensure_list(tags)
|
||||||
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
|
|
||||||
|
aliases = fm.get("aliases")
|
||||||
|
if aliases: payload["aliases"] = _ensure_list(aliases)
|
||||||
|
|
||||||
for k in ("created", "modified", "date"):
|
for k in ("created", "modified", "date"):
|
||||||
v = fm.get(k) or n.get(k)
|
v = fm.get(k) or n.get(k)
|
||||||
if v: payload[k] = str(v)
|
if v: payload[k] = str(v)
|
||||||
|
|
||||||
if n.get("body"): payload["fulltext"] = str(n["body"])
|
if n.get("body"):
|
||||||
|
payload["fulltext"] = str(n["body"])
|
||||||
|
|
||||||
# Final JSON Validation Audit
|
# Final JSON Validation Audit
|
||||||
json.loads(json.dumps(payload, ensure_ascii=False))
|
json.loads(json.dumps(payload, ensure_ascii=False))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user