2025-12-16 18:55:45 +01:00
2 changed files with 63 additions and 50 deletions
--- a/app/core/ingestion.py
+++ b/app/core/ingestion.py
@@ -1,7 +1,7 @@
 """
 FILE: app/core/ingestion.py
-DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, prüft Hashes (Change Detection), zerlegt in Chunks und schreibt in Qdrant.
-VERSION: 2.5.3 (Fix: Hash-Mode Full for Metadata Detection)
+DESCRIPTION: Haupt-Ingestion-Logik. Liest Markdown, steuert Change-Detection (via ENV) und schreibt in Qdrant.
+VERSION: 2.6.0 (Feat: Flexible Change Detection & Full Logic Restoration)
 STATUS: Active
 DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client
 EXTERNAL_CONFIG: config/types.yaml
@@ -56,9 +56,11 @@ def resolve_note_type(requested: Optional[str], reg: dict) -> str:

 def effective_chunk_profile(note_type: str, reg: dict) -> str:
    t_cfg = reg.get("types", {}).get(note_type, {})
-    if t_cfg and t_cfg.get("chunk_profile"):
-        return t_cfg.get("chunk_profile")
-    return reg.get("defaults", {}).get("chunk_profile", "default")
+    # FIX: Konsistenz mit note_payload.py - Prüfe erst den korrekten Key
+    if t_cfg:
+        if t_cfg.get("chunking_profile"): return t_cfg.get("chunking_profile")
+        if t_cfg.get("chunk_profile"): return t_cfg.get("chunk_profile") # Legacy
+    return reg.get("defaults", {}).get("chunking_profile", "sliding_standard")

 def effective_retriever_weight(note_type: str, reg: dict) -> float:
    t_cfg = reg.get("types", {}).get(note_type, {})
@@ -79,6 +81,9 @@ class IngestionService:
        self.registry = load_type_registry()
        self.embedder = EmbeddingsClient()
        
+        # ACTIVE HASH MODE aus ENV lesen (Default: full)
+        self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
+        
        try:
            ensure_collections(self.client, self.prefix, self.dim)
            ensure_payload_indexes(self.client, self.prefix)
@@ -93,8 +98,8 @@ class IngestionService:
        apply: bool = False,
        purge_before: bool = False,
        note_scope_refs: bool = False,
-        # FIX: Default auf "full", damit Metadata-Änderungen erkannt werden
-        hash_mode: str = "full", 
+        # Hash-Mode wird nicht mehr übergeben, sondern via ENV gesteuert.
+        # Source und Normalize bleiben konfigurierbar.
        hash_source: str = "parsed",
        hash_normalize: str = "canonical"
    ) -> Dict[str, Any]:
@@ -132,11 +137,11 @@ class IngestionService:
        fm["retriever_weight"] = float(weight)

        # 3. Build Note Payload
+        # Ruft make_note_payload auf, welches JETZT alle Hash-Varianten berechnet.
        try:
            note_pl = make_note_payload(
                parsed,
                vault_root=vault_root,
-                hash_mode=hash_mode,
                hash_normalize=hash_normalize,
                hash_source=hash_source,
                file_path=file_path
@@ -150,25 +155,31 @@ class IngestionService:
             logger.error(f"Payload build failed: {e}")
             return {**result, "error": f"Payload build failed: {str(e)}"}

-        # 4. Change Detection (Updated Logic)
+        # 4. Change Detection (Updated Logic with ENV Strategy)
        old_payload = None
        if not force_replace:
            old_payload = self._fetch_note_payload(note_id)
        
        has_old = old_payload is not None
-        key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
        
-        # Robustere Abfrage: Falls 'hashes' im Payload fehlt, None zurückgeben
+        # Wir bauen den Key basierend auf der ENV-Einstellung
+        check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
+        
        old_hashes = (old_payload or {}).get("hashes")
+        
+        # Fallback Logik für alte Daten
        if isinstance(old_hashes, dict):
-            old_hash = old_hashes.get(key_current)
+            old_hash = old_hashes.get(check_key)
+        elif isinstance(old_hashes, str):
+            # Sehr alte Legacy Daten hatten Hash direkt als String (meist Body)
+            # Wenn wir im Body-Modus sind, ist das okay, sonst Force Update
+            old_hash = old_hashes if self.active_hash_mode == "body" else None
        else:
-            # Fallback für Legacy Payloads ohne Hash-Dict
            old_hash = None

-        new_hash = note_pl.get("hashes", {}).get(key_current)
+        new_hash = note_pl.get("hashes", {}).get(check_key)
        
-        # Wenn wir keinen alten Hash haben (z.B. neues Hash-Schema "full"), erzwingen wir Update
+        # Vergleich
        hash_changed = (old_hash != new_hash)
        
        chunks_missing, edges_missing = self._artifacts_missing(note_id)
--- a/app/core/note_payload.py
+++ b/app/core/note_payload.py
@@ -1,7 +1,10 @@
 """
 FILE: app/core/note_payload.py
-DESCRIPTION: Baut das JSON-Objekt für 'mindnet_notes'. Inkludiert Hash-Berechnung für Change Detection.
-VERSION: 2.2.0 (Fix: Missing Hash Calculation)
+DESCRIPTION: Baut das JSON-Objekt. 
+FEATURES:
+  1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
+  2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
+VERSION: 2.3.0
 STATUS: Active
 DEPENDENCIES: yaml, os, json, pathlib, hashlib
 EXTERNAL_CONFIG: config/types.yaml
@@ -77,7 +80,7 @@ def _ensure_list(x) -> list:
        return [str(i) for i in x]
    return [str(x)]

-# --- NEW: Hash Logic ---
+# --- Hash Logic ---
 def _compute_hash(content: str) -> str:
    """Berechnet einen SHA-256 Hash für den gegebenen String."""
    if not content:
@@ -87,8 +90,6 @@ def _compute_hash(content: str) -> str:
 def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
    """
    Stellt den String zusammen, der gehasht werden soll.
-    mode="body": Nur der Textinhalt.
-    mode="full": Textinhalt + relevante Metadaten (Titel, Typ, Tags, Status).
    """
    body = str(n.get("body") or "")
    
@@ -96,13 +97,12 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
        return body
    
    if mode == "full":
-        # Wir nehmen die wichtigsten strukturellen Metadaten dazu.
-        # Wichtig: Sortierte Keys für deterministisches Verhalten!
        fm = n.get("frontmatter") or {}
-        
-        # Extrahieren relevanter Meta-Felder, die Änderungen auslösen sollen
+        # Wichtig: Sortierte Keys für deterministisches Verhalten!
+        # Wir nehmen alle steuernden Metadaten auf
        meta_parts = []
-        for k in sorted(["title", "type", "status", "tags", "chunk_profile"]):
+        # Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
+        for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
            val = fm.get(k)
            if val is not None:
                meta_parts.append(f"{k}:{val}")
@@ -157,14 +157,13 @@ def _cfg_defaults(reg: dict) -> dict:
 def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """
    Baut das Note-Payload für mindnet_notes auf.
-    Inkludiert Hash-Berechnung.
+    Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups.
    """
    n = _as_dict(note)
    path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
    reg = _load_types_config(types_cfg_explicit)
    
-    # Hash Config extrahieren (Defaults wie in ingestion.py)
-    hash_mode = kwargs.get("hash_mode", "full") 
+    # Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide')
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

@@ -188,18 +187,21 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    except Exception:
        retriever_weight = default_rw

-    # --- chunk_profile ---
-    chunk_profile = fm.get("chunk_profile")
+    # --- chunk_profile (FIXED LOGIC) ---
+    # 1. Frontmatter Override (beide Schreibweisen erlaubt)
+    chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
+    
+    # 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml)
    if chunk_profile is None:
-        chunk_profile = cfg_type.get(
-            "chunk_profile",
-            cfg_def.get(
-                "chunk_profile",
-                os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
-            ),
-        )
-    if not isinstance(chunk_profile, str):
-        chunk_profile = "medium"
+        chunk_profile = cfg_type.get("chunking_profile")
+
+    # 3. Default Config (Fallback auf sliding_standard statt medium)
+    if chunk_profile is None:
+        chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")
+
+    # 4. Safety Fallback
+    if not isinstance(chunk_profile, str) or not chunk_profile:
+        chunk_profile = "sliding_standard"

    # --- edge_defaults ---
    edge_defaults = fm.get("edge_defaults")
@@ -225,20 +227,20 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "edge_defaults": edge_defaults,
+        "hashes": {} # Init Hash Dict
    }
    
-    # --- FIX: Hash Calculation ---
-    # Wir berechnen den Hash basierend auf dem Modus
-    content_to_hash = _get_hash_source_content(n, hash_mode)
-    computed_hash = _compute_hash(content_to_hash)
+    # --- MULTI-HASH CALCULATION (Strategy Decoupling) ---
+    # Wir berechnen immer BEIDE Strategien und speichern sie.
+    # ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird.
+    modes_to_calc = ["body", "full"]
    
-    # Der Key muss exakt dem entsprechen, was ingestion.py erwartet:
-    # key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
-    hash_key = f"{hash_mode}:{hash_source}:{hash_normalize}"
-    
-    payload["hashes"] = {
-        hash_key: computed_hash
-    }
+    for mode in modes_to_calc:
+        content_to_hash = _get_hash_source_content(n, mode)
+        computed_hash = _compute_hash(content_to_hash)
+        # Key Schema: mode:source:normalize (z.B. "full:parsed:canonical")
+        key = f"{mode}:{hash_source}:{hash_normalize}"
+        payload["hashes"][key] = computed_hash

    # Tags / Keywords
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")