diff --git a/app/core/ingestion/ingestion_note_payload.py b/app/core/ingestion/ingestion_note_payload.py index 045efdd..504c743 100644 --- a/app/core/ingestion/ingestion_note_payload.py +++ b/app/core/ingestion/ingestion_note_payload.py @@ -1,8 +1,11 @@ """ FILE: app/core/ingestion/ingestion_note_payload.py DESCRIPTION: Baut das JSON-Objekt für mindnet_notes. -FEATURES: Multi-Hash (body/full), Config-Fix für chunking_profile. -VERSION: 2.4.0 +FEATURES: + - Multi-Hash (body/full) für flexible Change Detection. + - Fix v2.4.2: edge_defaults Logik wiederhergestellt (DoD-Korrektur). +VERSION: 2.4.2 +STATUS: Active """ from __future__ import annotations from typing import Any, Dict, Tuple, Optional @@ -12,7 +15,12 @@ import pathlib import hashlib import yaml +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + def _as_dict(x) -> Dict[str, Any]: + """Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen.""" if isinstance(x, dict): return dict(x) out: Dict[str, Any] = {} for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"): @@ -23,29 +31,53 @@ def _as_dict(x) -> Dict[str, Any]: return out def _ensure_list(x) -> list: + """Sichert, dass das Ergebnis eine Liste von Strings ist.""" if x is None: return [] if isinstance(x, list): return [str(i) for i in x] if isinstance(x, (set, tuple)): return [str(i) for i in x] return [str(x)] def _compute_hash(content: str) -> str: + """Berechnet einen SHA-256 Hash.""" if not content: return "" return hashlib.sha256(content.encode("utf-8")).hexdigest() def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: + """Stellt den zu hashenden Content deterministisch zusammen.""" body = str(n.get("body") or "") if mode == "body": return body if mode == "full": fm = n.get("frontmatter") or {} meta_parts = [] + # Steuernde Metadaten für Change Detection for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") - return f" {'|'.join(meta_parts)}||{body}" + return f"{'|'.join(meta_parts)}||{body}" return body +def _cfg_for_type(note_type: str, reg: dict) -> dict: + """Holt die typ-spezifische Konfiguration.""" + if not isinstance(reg, dict): return {} + types = reg.get("types") if isinstance(reg.get("types"), dict) else reg + return types.get(note_type, {}) if isinstance(types, dict) else {} + +def _cfg_defaults(reg: dict) -> dict: + """Holt die globalen Default-Werte aus der Registry.""" + if not isinstance(reg, dict): return {} + for key in ("defaults", "default", "global"): + v = reg.get(key) + if isinstance(v, dict): return v + return {} + +# --------------------------------------------------------------------------- +# Haupt-API +# --------------------------------------------------------------------------- + def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: - """Baut das Note-Payload inklusive Multi-Hash.""" + """ + Baut das Note-Payload inklusive Multi-Hash und edge_defaults. + """ n = _as_dict(note) reg = kwargs.get("types_cfg") or {} hash_source = kwargs.get("hash_source", "parsed") @@ -54,24 +86,48 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: fm = n.get("frontmatter") or {} note_type = str(fm.get("type") or n.get("type") or "concept") - # Weights & Profiles - retriever_weight = fm.get("retriever_weight", 1.0) - chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard" + cfg_type = _cfg_for_type(note_type, reg) + cfg_def = _cfg_defaults(reg) + + # --- retriever_weight --- + retriever_weight = fm.get("retriever_weight") + if retriever_weight is None: + retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", 1.0)) + try: retriever_weight = float(retriever_weight) + except: retriever_weight = 1.0 + + # --- chunk_profile --- + chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") + if chunk_profile is None: + chunk_profile = cfg_type.get("chunking_profile", cfg_def.get("chunking_profile", "sliding_standard")) + + # --- edge_defaults (WIEDERHERGESTELLT) --- + edge_defaults = fm.get("edge_defaults") + if edge_defaults is None: + edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", [])) + edge_defaults = _ensure_list(edge_defaults) + + # --- Basis-Metadaten --- + note_id = n.get("note_id") or n.get("id") or fm.get("id") + title = n.get("title") or fm.get("title") or "" payload: Dict[str, Any] = { - "note_id": n.get("note_id") or n.get("id") or fm.get("id"), - "title": n.get("title") or fm.get("title") or "", + "note_id": note_id, + "title": title, "type": note_type, - "path": str(n.get("path") or kwargs.get("path") or ""), - "retriever_weight": float(retriever_weight), + "path": str(n.get("path") or kwargs.get("file_path") or ""), + "retriever_weight": retriever_weight, "chunk_profile": chunk_profile, + "edge_defaults": edge_defaults, # Feld jetzt wieder enthalten "hashes": {} } + # --- MULTI-HASH --- for mode in ["body", "full"]: key = f"{mode}:{hash_source}:{hash_normalize}" payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode)) + # Metadaten-Felder if fm.get("tags") or n.get("tags"): payload["tags"] = _ensure_list(fm.get("tags") or n.get("tags")) if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases")) for k in ("created", "modified", "date"):