Enhance ingestion_processor.py with path normalization and strict change detection
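Implement path normalization by converting relative file paths to absolute paths, so that hash checks are performed consistently. Make change detection stricter: a note is skipped as unchanged only when its stored hash and its newly computed hash are identical and no artifacts are missing; if either hash is missing, the note is treated as changed for safety. Skipping unchanged notes before processing avoids redundant embedding generation and chunk processing, improving the efficiency of the ingestion workflow.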
This commit is contained in:
parent
742792770c
commit
78fbc9b31b
|
|
@ -248,12 +248,16 @@ class IngestionService:
|
|||
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
||||
|
||||
parsed = read_markdown(file_path)
|
||||
# WP-24c v4.5.9: Path-Normalization für konsistente Hash-Prüfung
|
||||
# Normalisiere file_path zu absolutem Pfad für konsistente Verarbeitung
|
||||
normalized_file_path = os.path.abspath(file_path) if not os.path.isabs(file_path) else file_path
|
||||
|
||||
parsed = read_markdown(normalized_file_path)
|
||||
if not parsed: return {**result, "error": "Empty file"}
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
|
||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
|
||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=normalized_file_path, types_cfg=self.registry)
|
||||
note_id = note_pl.get("note_id")
|
||||
|
||||
if not note_id:
|
||||
|
|
@ -261,22 +265,36 @@ class IngestionService:
|
|||
|
||||
logger.info(f"📄 Bearbeite: '{note_id}'")
|
||||
|
||||
# Change Detection (WP-24c v4.2.4: Hash-basierte Inhaltsprüfung)
|
||||
# WP-24c v4.5.9: Strikte Change Detection (Hash-basierte Inhaltsprüfung)
|
||||
# Prüft Hash VOR der Verarbeitung, um redundante Ingestion zu vermeiden
|
||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||
|
||||
content_changed = True
|
||||
hash_match = False
|
||||
if old_payload and not force_replace:
|
||||
# Nutzt die über MINDNET_CHANGE_DETECTION_MODE gesteuerte Genauigkeit
|
||||
# Mapping: 'full' -> 'full:parsed:canonical', 'body' -> 'body:parsed:canonical'
|
||||
h_key = f"{self.active_hash_mode or 'full'}:parsed:canonical"
|
||||
new_h = note_pl.get("hashes", {}).get(h_key)
|
||||
old_h = old_payload.get("hashes", {}).get(h_key)
|
||||
if new_h and old_h and new_h == old_h:
|
||||
content_changed = False
|
||||
|
||||
if not (force_replace or content_changed or not old_payload or c_miss or e_miss):
|
||||
return {**result, "status": "unchanged", "note_id": note_id}
|
||||
if new_h and old_h:
|
||||
hash_match = (new_h == old_h)
|
||||
if hash_match:
|
||||
content_changed = False
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] Hash identisch für '{note_id}': {h_key} = {new_h[:16]}...")
|
||||
else:
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] Hash geändert für '{note_id}': alt={old_h[:16]}..., neu={new_h[:16]}...")
|
||||
else:
|
||||
# WP-24c v4.5.9: Wenn Hash fehlt, als geändert behandeln (Sicherheit)
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] Hash fehlt für '{note_id}': new_h={bool(new_h)}, old_h={bool(old_h)}")
|
||||
|
||||
# WP-24c v4.5.9: Strikte Logik - überspringe komplett wenn Hash identisch UND keine Artefakte fehlen
|
||||
# Dies verhindert redundante Embedding-Generierung und Chunk-Verarbeitung
|
||||
if not force_replace and hash_match and old_payload and not c_miss and not e_miss:
|
||||
logger.info(f"⏭️ [SKIP] '{note_id}' unverändert (Hash identisch, alle Artefakte vorhanden)")
|
||||
return {**result, "status": "unchanged", "note_id": note_id, "reason": "hash_identical"}
|
||||
|
||||
if not apply:
|
||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user