Enhance ingestion_processor.py with path normalization and strict change detection

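Normalize file paths to absolute paths before parsing and note-payload construction, so that hash checks are always computed against a consistent path. Make change detection stricter: a note is skipped as unchanged only when force_replace is not set, a stored payload exists, the old and new hashes are both present and identical, and no artifacts are missing; a missing hash on either side is treated as a content change for safety. Skipping unchanged notes up front avoids redundant embedding generation and chunk processing, which improves the efficiency of the ingestion workflow.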
This commit is contained in:
Lars 2026-01-12 07:53:03 +01:00
parent 742792770c
commit 78fbc9b31b

View File

@ -248,12 +248,16 @@ class IngestionService:
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
return {**result, "status": "skipped", "reason": "ignored_folder"}
parsed = read_markdown(file_path)
# WP-24c v4.5.9: Path-Normalization für konsistente Hash-Prüfung
# Normalisiere file_path zu absolutem Pfad für konsistente Verarbeitung
normalized_file_path = os.path.abspath(file_path) if not os.path.isabs(file_path) else file_path
parsed = read_markdown(normalized_file_path)
if not parsed: return {**result, "error": "Empty file"}
fm = normalize_frontmatter(parsed.frontmatter)
validate_required_frontmatter(fm)
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=normalized_file_path, types_cfg=self.registry)
note_id = note_pl.get("note_id")
if not note_id:
@ -261,22 +265,36 @@ class IngestionService:
logger.info(f"📄 Bearbeite: '{note_id}'")
# Change Detection (WP-24c v4.2.4: Hash-basierte Inhaltsprüfung)
# WP-24c v4.5.9: Strikte Change Detection (Hash-basierte Inhaltsprüfung)
# Prüft Hash VOR der Verarbeitung, um redundante Ingestion zu vermeiden
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
content_changed = True
hash_match = False
if old_payload and not force_replace:
# Nutzt die über MINDNET_CHANGE_DETECTION_MODE gesteuerte Genauigkeit
# Mapping: 'full' -> 'full:parsed:canonical', 'body' -> 'body:parsed:canonical'
h_key = f"{self.active_hash_mode or 'full'}:parsed:canonical"
new_h = note_pl.get("hashes", {}).get(h_key)
old_h = old_payload.get("hashes", {}).get(h_key)
if new_h and old_h and new_h == old_h:
content_changed = False
if new_h and old_h:
hash_match = (new_h == old_h)
if hash_match:
content_changed = False
logger.debug(f"🔍 [CHANGE-DETECTION] Hash identisch für '{note_id}': {h_key} = {new_h[:16]}...")
else:
logger.debug(f"🔍 [CHANGE-DETECTION] Hash geändert für '{note_id}': alt={old_h[:16]}..., neu={new_h[:16]}...")
else:
# WP-24c v4.5.9: Wenn Hash fehlt, als geändert behandeln (Sicherheit)
logger.debug(f"🔍 [CHANGE-DETECTION] Hash fehlt für '{note_id}': new_h={bool(new_h)}, old_h={bool(old_h)}")
if not (force_replace or content_changed or not old_payload or c_miss or e_miss):
return {**result, "status": "unchanged", "note_id": note_id}
# WP-24c v4.5.9: Strikte Logik - überspringe komplett wenn Hash identisch UND keine Artefakte fehlen
# Dies verhindert redundante Embedding-Generierung und Chunk-Verarbeitung
if not force_replace and hash_match and old_payload and not c_miss and not e_miss:
logger.info(f"⏭️ [SKIP] '{note_id}' unverändert (Hash identisch, alle Artefakte vorhanden)")
return {**result, "status": "unchanged", "note_id": note_id, "reason": "hash_identical"}
if not apply:
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}