diff --git a/app/core/ingestion.py b/app/core/ingestion.py index ab2e46a..9b96478 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -1,7 +1,7 @@ """ FILE: app/core/ingestion.py -DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, prüft Hashes (Change Detection), zerlegt in Chunks und schreibt in Qdrant. -VERSION: 2.5.3 (Fix: Hash-Mode Full for Metadata Detection) +DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, steuert Change-Detection (via ENV) und schreibt in Qdrant. +VERSION: 2.6.0 (Feat: Flexible Change Detection & Full Logic Restoration) STATUS: Active DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client EXTERNAL_CONFIG: config/types.yaml @@ -56,9 +56,11 @@ def resolve_note_type(requested: Optional[str], reg: dict) -> str: def effective_chunk_profile(note_type: str, reg: dict) -> str: t_cfg = reg.get("types", {}).get(note_type, {}) - if t_cfg and t_cfg.get("chunk_profile"): - return t_cfg.get("chunk_profile") - return reg.get("defaults", {}).get("chunk_profile", "default") + # FIX: Konsistenz mit note_payload.py - Prüfe erst den korrekten Key + if t_cfg: + if t_cfg.get("chunking_profile"): return t_cfg.get("chunking_profile") + if t_cfg.get("chunk_profile"): return t_cfg.get("chunk_profile") # Legacy + return reg.get("defaults", {}).get("chunking_profile", "sliding_standard") def effective_retriever_weight(note_type: str, reg: dict) -> float: t_cfg = reg.get("types", {}).get(note_type, {}) @@ -79,6 +81,9 @@ class IngestionService: self.registry = load_type_registry() self.embedder = EmbeddingsClient() + # ACTIVE HASH MODE aus ENV lesen (Default: full) + self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full") + try: ensure_collections(self.client, self.prefix, self.dim) ensure_payload_indexes(self.client, self.prefix) @@ -93,8 +98,8 @@ class IngestionService: apply: bool = False, purge_before: bool = False, note_scope_refs: bool = False, - # FIX: Default auf "full", damit Metadata-Änderungen erkannt werden - hash_mode: str = "full", + # Hash-Mode wird nicht mehr übergeben, sondern via ENV gesteuert. + # Source und Normalize bleiben konfigurierbar. hash_source: str = "parsed", hash_normalize: str = "canonical" ) -> Dict[str, Any]: @@ -132,11 +137,11 @@ class IngestionService: fm["retriever_weight"] = float(weight) # 3. Build Note Payload + # Ruft make_note_payload auf, welches JETZT alle Hash-Varianten berechnet. try: note_pl = make_note_payload( parsed, vault_root=vault_root, - hash_mode=hash_mode, hash_normalize=hash_normalize, hash_source=hash_source, file_path=file_path @@ -150,25 +155,31 @@ class IngestionService: logger.error(f"Payload build failed: {e}") return {**result, "error": f"Payload build failed: {str(e)}"} - # 4. Change Detection (Updated Logic) + # 4. Change Detection (Updated Logic with ENV Strategy) old_payload = None if not force_replace: old_payload = self._fetch_note_payload(note_id) has_old = old_payload is not None - key_current = f"{hash_mode}:{hash_source}:{hash_normalize}" - # Robustere Abfrage: Falls 'hashes' im Payload fehlt, None zurückgeben + # Wir bauen den Key basierend auf der ENV-Einstellung + check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" + old_hashes = (old_payload or {}).get("hashes") + + # Fallback Logik für alte Daten if isinstance(old_hashes, dict): - old_hash = old_hashes.get(key_current) + old_hash = old_hashes.get(check_key) + elif isinstance(old_hashes, str): + # Sehr alte Legacy Daten hatten Hash direkt als String (meist Body) + # Wenn wir im Body-Modus sind, ist das okay, sonst Force Update + old_hash = old_hashes if self.active_hash_mode == "body" else None else: - # Fallback für Legacy Payloads ohne Hash-Dict old_hash = None - new_hash = note_pl.get("hashes", {}).get(key_current) + new_hash = note_pl.get("hashes", {}).get(check_key) - # Wenn wir keinen alten Hash haben (z.B. neues Hash-Schema "full"), erzwingen wir Update + # Vergleich hash_changed = (old_hash != new_hash) chunks_missing, edges_missing = self._artifacts_missing(note_id) diff --git a/app/core/note_payload.py b/app/core/note_payload.py index 8d57735..957a97e 100644 --- a/app/core/note_payload.py +++ b/app/core/note_payload.py @@ -1,7 +1,10 @@ """ FILE: app/core/note_payload.py -DESCRIPTION: Baut das JSON-Objekt für 'mindnet_notes'. Inkludiert Hash-Berechnung für Change Detection. -VERSION: 2.2.0 (Fix: Missing Hash Calculation) +DESCRIPTION: Baut das JSON-Objekt. +FEATURES: + 1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection. + 2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile'). +VERSION: 2.3.0 STATUS: Active DEPENDENCIES: yaml, os, json, pathlib, hashlib EXTERNAL_CONFIG: config/types.yaml @@ -77,7 +80,7 @@ def _ensure_list(x) -> list: return [str(i) for i in x] return [str(x)] -# --- NEW: Hash Logic --- +# --- Hash Logic --- def _compute_hash(content: str) -> str: """Berechnet einen SHA-256 Hash für den gegebenen String.""" if not content: @@ -87,8 +90,6 @@ def _compute_hash(content: str) -> str: def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: """ Stellt den String zusammen, der gehasht werden soll. - mode="body": Nur der Textinhalt. - mode="full": Textinhalt + relevante Metadaten (Titel, Typ, Tags, Status). """ body = str(n.get("body") or "") @@ -96,13 +97,12 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: return body if mode == "full": - # Wir nehmen die wichtigsten strukturellen Metadaten dazu. - # Wichtig: Sortierte Keys für deterministisches Verhalten! fm = n.get("frontmatter") or {} - - # Extrahieren relevanter Meta-Felder, die Änderungen auslösen sollen + # Wichtig: Sortierte Keys für deterministisches Verhalten! + # Wir nehmen alle steuernden Metadaten auf meta_parts = [] - for k in sorted(["title", "type", "status", "tags", "chunk_profile"]): + # Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden + for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): val = fm.get(k) if val is not None: meta_parts.append(f"{k}:{val}") @@ -157,14 +157,13 @@ def _cfg_defaults(reg: dict) -> dict: def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: """ Baut das Note-Payload für mindnet_notes auf. - Inkludiert Hash-Berechnung. + Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups. """ n = _as_dict(note) path_arg, types_cfg_explicit = _pick_args(*args, **kwargs) reg = _load_types_config(types_cfg_explicit) - # Hash Config extrahieren (Defaults wie in ingestion.py) - hash_mode = kwargs.get("hash_mode", "full") + # Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide') hash_source = kwargs.get("hash_source", "parsed") hash_normalize = kwargs.get("hash_normalize", "canonical") @@ -188,18 +187,21 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: except Exception: retriever_weight = default_rw - # --- chunk_profile --- - chunk_profile = fm.get("chunk_profile") + # --- chunk_profile (FIXED LOGIC) --- + # 1. Frontmatter Override (beide Schreibweisen erlaubt) + chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") + + # 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml) if chunk_profile is None: - chunk_profile = cfg_type.get( - "chunk_profile", - cfg_def.get( - "chunk_profile", - os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"), - ), - ) - if not isinstance(chunk_profile, str): - chunk_profile = "medium" + chunk_profile = cfg_type.get("chunking_profile") + + # 3. Default Config (Fallback auf sliding_standard statt medium) + if chunk_profile is None: + chunk_profile = cfg_def.get("chunking_profile", "sliding_standard") + + # 4. Safety Fallback + if not isinstance(chunk_profile, str) or not chunk_profile: + chunk_profile = "sliding_standard" # --- edge_defaults --- edge_defaults = fm.get("edge_defaults") @@ -225,20 +227,20 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: "retriever_weight": retriever_weight, "chunk_profile": chunk_profile, "edge_defaults": edge_defaults, + "hashes": {} # Init Hash Dict } - # --- FIX: Hash Calculation --- - # Wir berechnen den Hash basierend auf dem Modus - content_to_hash = _get_hash_source_content(n, hash_mode) - computed_hash = _compute_hash(content_to_hash) + # --- MULTI-HASH CALCULATION (Strategy Decoupling) --- + # Wir berechnen immer BEIDE Strategien und speichern sie. + # ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird. + modes_to_calc = ["body", "full"] - # Der Key muss exakt dem entsprechen, was ingestion.py erwartet: - # key_current = f"{hash_mode}:{hash_source}:{hash_normalize}" - hash_key = f"{hash_mode}:{hash_source}:{hash_normalize}" - - payload["hashes"] = { - hash_key: computed_hash - } + for mode in modes_to_calc: + content_to_hash = _get_hash_source_content(n, mode) + computed_hash = _compute_hash(content_to_hash) + # Key Schema: mode:source:normalize (z.B. "full:parsed:canonical") + key = f"{mode}:{hash_source}:{hash_normalize}" + payload["hashes"][key] = computed_hash # Tags / Keywords tags = fm.get("tags") or fm.get("keywords") or n.get("tags")