scripts/import_markdown.py aktualisiert

2025-09-09 16:31:00 +02:00 · 2025-09-09 16:31:00 +02:00 · 43a8c80508
commit 43a8c80508
parent 9362064ad1
1 changed files with 53 additions and 16 deletions
--- a/scripts/import_markdown.py
+++ b/scripts/import_markdown.py
@@ -2,19 +2,24 @@
 # -*- coding: utf-8 -*-
 """
 Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
-Version: 3.4.1
+Version: 3.5.0
 Datum: 2025-09-09

 Kurzbeschreibung
 ----------------
 - Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges **idempotent**.
- Change-Detection (nur **Inhalte**, keine FS-Zeitstempel) konfigurierbar:
+- **Änderungserkennung** (nur Inhalte, keine FS-Zeitstempel) konfigurierbar:
  * ``--hash-mode``: body | frontmatter | body+frontmatter | full (Alias)
    - Env: ``MINDNET_HASH_MODE`` **oder** ``MINDNET_HASH_COMPARE`` (Body/Frontmatter/Full)
  * ``--hash-normalize``: canonical | none (Default: canonical)
  * ``--hash-source``: parsed (Default) | raw
-    - "raw" hasht den **ungeparsten** Body aus der Datei (Frontmatter vorher entfernt).
- Optionales Diff-Logging: ``--debug-hash-diff`` zeigt bei Bedarf einen kompakten Diff.
+    - "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt).
+  * **NEU**: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
+    - Zusätzlich zum Hash wird der *parsed* ``fulltext`` direkt verglichen.
+    - Erkennt Änderungen auch dann, wenn eine Normalisierung Unterschiede glättet.
+  * **NEU**: Signaturabgleich:
+    - Falls sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
+      gilt die Note als **geändert** (Einmal-Update, um neue Signatur zu persistieren).

 ENV / Qdrant
 ------------
@@ -22,15 +27,22 @@ ENV / Qdrant
 - COLLECTION_PREFIX (Default: mindnet)
 - VECTOR_DIM (Default: 384)
 - MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
+- MINDNET_COMPARE_TEXT: true|false (Default: false)

-Aufruf
------
+Aufruf-Beispiele
+----------------
+    # Standard (nur Body, kanonisiert, parsed-Quelle)
    python3 -m scripts.import_markdown --vault ./vault
-    python3 -m scripts.import_markdown --vault ./vault --apply
-    python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
-    MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
-"""

+    # Sensibel (jede Kleinigkeit): raw-Quelle + keine Normalisierung
+    python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
+
+    # Full-Vergleich (Body+Frontmatter)
+    MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
+
+    # Zusätzlich Body-Text direkt vergleichen (maximale Sicherheit)
+    python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
+"""
 from __future__ import annotations
 import argparse
 import difflib
@@ -143,11 +155,14 @@ def main() -> None:
                    help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
    ap.add_argument("--debug-hash-diff", action="store_true",
                    help="Zeigt bei Bedarf einen kurzen Diff zwischen altem und neuem Body")
+    ap.add_argument("--compare-text", action="store_true",
+                    help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)")
    args = ap.parse_args()

    note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
-    note_scope_refs = args.note_sCope_refs if hasattr(args, "note_sCope_refs") else args.note_scope_refs  # defensive
-    note_scope_refs = note_scope_refs or note_scope_refs_env
+    note_scope_refs = args.note_scope_refs or note_scope_refs_env
+
+    compare_text = args.compare_text or (os.environ.get("MINDNET_COMPARE_TEXT", "false").strip().lower() == "true")

    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
@@ -200,18 +215,36 @@ def main() -> None:
        # Change-Detection (nur Inhalte, keine FS-Timestamps)
        old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
        old_hash = None if not old_payload else old_payload.get("hash_fulltext")
+        old_sig = (old_payload or {}).get("hash_signature")
        new_hash = note_pl.get("hash_fulltext")
-        changed = args.force_replace or (old_hash != new_hash)
+        new_sig = note_pl.get("hash_signature")
+
+        # 1) Signaturwechsel → als geändert behandeln (Einmal-Update)
+        sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
+
+        # 2) Hash-Vergleich
+        hash_changed = (old_hash != new_hash)
+
+        # 3) Optional: Parsed-Text direkt vergleichen (zusätzlich)
+        text_changed = False
+        if compare_text:
+            old_text = (old_payload or {}).get("fulltext") or ""
+            new_text = note_pl.get("fulltext") or ""
+            text_changed = (old_text != new_text)
+
+        changed = args.force_replace or sig_changed or hash_changed or text_changed

        # Optionales Debugging: kompakten Diff anzeigen
        if args.debug_hash_diff:
            old_text = (old_payload or {}).get("fulltext") or ""
            new_text = note_pl.get("fulltext") or ""
-            # Wenn Hash gleich, aber Text verschieden → Hinweis auf Normalisierung/Quelle
-            if (old_hash == new_hash) and old_text != new_text:
+            # Hinweis, wenn Hash gleich aber Text verschieden (oder Signaturwechsel)
+            if (not hash_changed) and (old_text != new_text or sig_changed):
                print(json.dumps({
-                    "debug": "hash_equal_but_text_differs",
+                    "debug": "hash_equal_but_text_or_signature_differs",
                    "note_id": note_id,
+                    "old_sig": old_sig,
+                    "new_sig": new_sig,
                    "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
                    "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
                    "hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
@@ -257,6 +290,10 @@ def main() -> None:
            "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
            "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
            "hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
+            "hash_signature": note_pl.get("hash_signature"),
+            "sig_changed": sig_changed,
+            "hash_changed": hash_changed,
+            "text_changed": text_changed,
        }
        print(json.dumps(summary, ensure_ascii=False))