scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
9ef2e8d397
commit
f5e6fcc097
|
|
@@ -2,7 +2,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
|
||||
Version: 3.5.0
|
||||
Version: 3.5.1
|
||||
Datum: 2025-09-09
|
||||
|
||||
Kurzbeschreibung
|
||||
|
|
@@ -14,12 +14,11 @@ Kurzbeschreibung
|
|||
* ``--hash-normalize``: canonical | none (Default: canonical)
|
||||
* ``--hash-source``: parsed (Default) | raw
|
||||
- "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt).
|
||||
* **NEU**: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
|
||||
- Zusätzlich zum Hash wird der *parsed* ``fulltext`` direkt verglichen.
|
||||
- Erkennt Änderungen auch dann, wenn eine Normalisierung Unterschiede glättet.
|
||||
* **NEU**: Signaturabgleich:
|
||||
- Falls sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
|
||||
gilt die Note als **geändert** (Einmal-Update, um neue Signatur zu persistieren).
|
||||
* Optional: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
|
||||
- parsed ``fulltext`` zusätzlich direkt vergleichen (falls Normalisierung Unterschiede glättet).
|
||||
* Signaturabgleich:
|
||||
- Wenn sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
|
||||
gilt die Note als **geändert** (Einmal-Update, um die neue Signatur zu persistieren).
|
||||
|
||||
ENV / Qdrant
|
||||
------------
|
||||
|
|
@@ -40,7 +39,7 @@ Aufruf-Beispiele
|
|||
# Full-Vergleich (Body+Frontmatter)
|
||||
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
|
||||
|
||||
# Zusätzlich Body-Text direkt vergleichen (maximale Sicherheit)
|
||||
# Zusätzlich Body-Text direkt vergleichen (max. Sicherheit)
|
||||
python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
|
@@ -177,8 +176,21 @@ def main() -> None:
|
|||
|
||||
processed = 0
|
||||
for path in files:
|
||||
# ----------------- robustes Parsing -----------------
|
||||
try:
|
||||
parsed = read_markdown(path)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "error": f"read_markdown failed: {e}"}))
|
||||
continue
|
||||
if parsed is None:
|
||||
print(json.dumps({"path": path, "error": "read_markdown returned None"}))
|
||||
continue
|
||||
|
||||
try:
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "error": f"normalize_frontmatter failed: {e}"}))
|
||||
continue
|
||||
|
||||
try:
|
||||
validate_required_frontmatter(fm)
|
||||
|
|
@@ -191,7 +203,8 @@ def main() -> None:
|
|||
|
||||
processed += 1
|
||||
|
||||
# Note-Payload (inkl. Hash-Steuerung & Quelle)
|
||||
# -------------- Note-Payload (defensiv) --------------
|
||||
try:
|
||||
note_pl = make_note_payload(
|
||||
parsed,
|
||||
vault_root=root,
|
||||
|
|
@@ -200,32 +213,40 @@ def main() -> None:
|
|||
hash_source=args.hash_source,
|
||||
file_path=path,
|
||||
)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": fm.get("id"), "error": f"make_note_payload failed: {e}"}))
|
||||
continue
|
||||
|
||||
if "fulltext" not in (note_pl or {}):
|
||||
note_pl["fulltext"] = parsed.body or ""
|
||||
if not isinstance(note_pl, dict):
|
||||
print(json.dumps({"path": path, "note_id": fm.get("id"), "error": "make_note_payload returned non-dict"}))
|
||||
continue
|
||||
|
||||
# fulltext sicherstellen + Pfad normalisieren
|
||||
if not note_pl.get("fulltext"):
|
||||
note_pl["fulltext"] = getattr(parsed, "body", "") or ""
|
||||
if note_pl.get("path"):
|
||||
note_pl["path"] = _normalize_rel_path(
|
||||
os.path.join(root, note_pl["path"]) if not os.path.isabs(note_pl["path"]) else note_pl["path"], root
|
||||
)
|
||||
else:
|
||||
note_pl["path"] = _normalize_rel_path(parsed.path, root)
|
||||
p_path = getattr(parsed, "path", None) or path
|
||||
note_pl["path"] = _normalize_rel_path(p_path, root)
|
||||
|
||||
note_id = note_pl["note_id"]
|
||||
note_id = note_pl.get("note_id") or fm.get("id")
|
||||
if not note_id:
|
||||
print(json.dumps({"path": path, "error": "Missing note_id after payload build"}))
|
||||
continue
|
||||
|
||||
# Change-Detection (nur Inhalte, keine FS-Timestamps)
|
||||
# -------------- Change-Detection --------------
|
||||
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
|
||||
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
|
||||
old_sig = (old_payload or {}).get("hash_signature")
|
||||
new_hash = note_pl.get("hash_fulltext")
|
||||
new_sig = note_pl.get("hash_signature")
|
||||
|
||||
# 1) Signaturwechsel → als geändert behandeln (Einmal-Update)
|
||||
sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
|
||||
|
||||
# 2) Hash-Vergleich
|
||||
hash_changed = (old_hash != new_hash)
|
||||
|
||||
# 3) Optional: Parsed-Text direkt vergleichen (zusätzlich)
|
||||
text_changed = False
|
||||
if compare_text:
|
||||
old_text = (old_payload or {}).get("fulltext") or ""
|
||||
|
|
@@ -234,40 +255,29 @@ def main() -> None:
|
|||
|
||||
changed = args.force_replace or sig_changed or hash_changed or text_changed
|
||||
|
||||
# Optionales Debugging: kompakten Diff anzeigen
|
||||
if args.debug_hash_diff:
|
||||
old_text = (old_payload or {}).get("fulltext") or ""
|
||||
new_text = note_pl.get("fulltext") or ""
|
||||
# Hinweis, wenn Hash gleich aber Text verschieden (oder Signaturwechsel)
|
||||
if (not hash_changed) and (old_text != new_text or sig_changed):
|
||||
print(json.dumps({
|
||||
"debug": "hash_equal_but_text_or_signature_differs",
|
||||
"note_id": note_id,
|
||||
"old_sig": old_sig,
|
||||
"new_sig": new_sig,
|
||||
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
||||
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
||||
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
||||
}, ensure_ascii=False))
|
||||
if old_text and new_text:
|
||||
ud = list(difflib.unified_diff(
|
||||
old_text.splitlines(), new_text.splitlines(),
|
||||
fromfile="qdrant_fulltext(old)", tofile="vault_body(new)",
|
||||
n=3
|
||||
))
|
||||
if ud:
|
||||
preview = "\n".join(ud[:50])
|
||||
print(json.dumps({"note_id": note_id, "diff_preview": preview}, ensure_ascii=False))
|
||||
# -------------- Chunks / Embeddings / Edges --------------
|
||||
try:
|
||||
chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept"))
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "error": f"assemble_chunks failed: {e}"}))
|
||||
continue
|
||||
|
||||
# Chunks + Embeddings (Nullvektor-Fallback)
|
||||
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
||||
try:
|
||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "error": f"make_chunk_payloads failed: {e}"}))
|
||||
continue
|
||||
|
||||
if embed_texts:
|
||||
try:
|
||||
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"}))
|
||||
vecs = [[0.0] * cfg.dim for _ in chunks]
|
||||
else:
|
||||
vecs = [[0.0] * cfg.dim for _ in chunks]
|
||||
|
||||
# Edges (neues Schema, mit note_id als Owner)
|
||||
try:
|
||||
note_refs = note_pl.get("references") or []
|
||||
edges = build_edges_for_note(
|
||||
note_id,
|
||||
|
|
@@ -275,8 +285,11 @@ def main() -> None:
|
|||
note_refs,
|
||||
include_note_scope_refs=note_scope_refs,
|
||||
)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"}))
|
||||
continue
|
||||
|
||||
# Zusammenfassung pro Datei
|
||||
# -------------- Zusammenfassung / Ausgabe --------------
|
||||
summary = {
|
||||
"note_id": note_id,
|
||||
"title": fm.get("title"),
|
||||
|
|
@@ -297,21 +310,25 @@ def main() -> None:
|
|||
}
|
||||
print(json.dumps(summary, ensure_ascii=False))
|
||||
|
||||
# -------------- Upserts --------------
|
||||
if not args.apply:
|
||||
continue
|
||||
|
||||
try:
|
||||
if changed and args.purge_before_upsert:
|
||||
purge_note_artifacts(client, cfg.prefix, note_id)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
|
||||
|
||||
# Upserts
|
||||
try:
|
||||
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
|
||||
upsert_batch(client, notes_name, note_pts)
|
||||
|
||||
chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
|
||||
upsert_batch(client, chunks_name, chunk_pts)
|
||||
|
||||
edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
|
||||
upsert_batch(client, edges_name, edge_pts)
|
||||
except Exception as e:
|
||||
print(json.dumps({"path": path, "note_id": note_id, "error": f"upsert failed: {e}"}))
|
||||
|
||||
print(f"Done. Processed notes: {processed}")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user