diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index d3c40f4..9cd2d78 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -248,12 +248,16 @@ class IngestionService: if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)): return {**result, "status": "skipped", "reason": "ignored_folder"} - parsed = read_markdown(file_path) + # WP-24c v4.5.9: Path-Normalization für konsistente Hash-Prüfung + # Normalisiere file_path zu absolutem Pfad für konsistente Verarbeitung + normalized_file_path = os.path.abspath(file_path) if not os.path.isabs(file_path) else file_path + + parsed = read_markdown(normalized_file_path) if not parsed: return {**result, "error": "Empty file"} fm = normalize_frontmatter(parsed.frontmatter) validate_required_frontmatter(fm) - note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry) + note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=normalized_file_path, types_cfg=self.registry) note_id = note_pl.get("note_id") if not note_id: @@ -261,22 +265,36 @@ class IngestionService: logger.info(f"📄 Bearbeite: '{note_id}'") - # Change Detection (WP-24c v4.2.4: Hash-basierte Inhaltsprüfung) + # WP-24c v4.5.9: Strikte Change Detection (Hash-basierte Inhaltsprüfung) + # Prüft Hash VOR der Verarbeitung, um redundante Ingestion zu vermeiden old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id) c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id) content_changed = True + hash_match = False if old_payload and not force_replace: # Nutzt die über MINDNET_CHANGE_DETECTION_MODE gesteuerte Genauigkeit # Mapping: 'full' -> 'full:parsed:canonical', 'body' -> 'body:parsed:canonical' h_key = f"{self.active_hash_mode or 'full'}:parsed:canonical" new_h = note_pl.get("hashes", {}).get(h_key) old_h = old_payload.get("hashes", {}).get(h_key) - if new_h and old_h and new_h == old_h: - content_changed = False + + if new_h and old_h: + hash_match = (new_h == old_h) + if hash_match: + content_changed = False + logger.debug(f"🔍 [CHANGE-DETECTION] Hash identisch für '{note_id}': {h_key} = {new_h[:16]}...") + else: + logger.debug(f"🔍 [CHANGE-DETECTION] Hash geändert für '{note_id}': alt={old_h[:16]}..., neu={new_h[:16]}...") + else: + # WP-24c v4.5.9: Wenn Hash fehlt, als geändert behandeln (Sicherheit) + logger.debug(f"🔍 [CHANGE-DETECTION] Hash fehlt für '{note_id}': new_h={bool(new_h)}, old_h={bool(old_h)}") - if not (force_replace or content_changed or not old_payload or c_miss or e_miss): - return {**result, "status": "unchanged", "note_id": note_id} + # WP-24c v4.5.9: Strikte Logik - überspringe komplett wenn Hash identisch UND keine Artefakte fehlen + # Dies verhindert redundante Embedding-Generierung und Chunk-Verarbeitung + if not force_replace and hash_match and old_payload and not c_miss and not e_miss: + logger.info(f"⏭️ [SKIP] '{note_id}' unverändert (Hash identisch, alle Artefakte vorhanden)") + return {**result, "status": "unchanged", "note_id": note_id, "reason": "hash_identical"} if not apply: return {**result, "status": "dry-run", "changed": True, "note_id": note_id}