scripts/import_markdown.py updated

Lars 2025-09-09 19:42:35 +02:00
parent 4872374a6e
commit 51f08f193c


@@ -2,42 +2,8 @@
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py (Markdown → Qdrant: Notes, Chunks, Edges)
Version: 3.5.3
Version: 3.6.0
Date: 2025-09-09
New in 3.5.3
------------
- **Fallback** in case app.core.note_payload.make_note_payload returns None / a non-dict:
  the importer builds a minimal, correct note payload itself (incl. a hash
  per --hash-mode/--hash-source/--hash-normalize) and keeps running.
- Detailed debug output with type/preview when the return shape deviates.
Short description
-----------------
- Reads Markdown files and creates Notes/Chunks/Edges **idempotently**.
- Change detection (content only, no filesystem timestamps), configurable:
  * --hash-mode: body | frontmatter | body+frontmatter | full (alias)
    - Env: MINDNET_HASH_MODE or MINDNET_HASH_COMPARE (Body/Frontmatter/Full)
  * --hash-normalize: canonical | none (default: canonical)
  * --hash-source: parsed (default) | raw
  * Optional: --compare-text (or ENV MINDNET_COMPARE_TEXT=true)
  * Signature check: if the signature switches (e.g. body→full, parsed→raw, canonical→none),
    the note counts as changed once.
ENV / Qdrant
------------
- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- COLLECTION_PREFIX (Default: mindnet)
- VECTOR_DIM (Default: 384)
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
- MINDNET_COMPARE_TEXT: true|false (Default: false)
Usage examples
--------------
python3 -m scripts.import_markdown --vault ./vault
python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
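python3 -m scripts.import_markdown --vault ./vault --apply --baseline-modes   (3.6.0: backfill missing hash variants)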
"""
from __future__ import annotations
@@ -56,8 +22,6 @@ from app.core.parser import (
read_markdown,
normalize_frontmatter,
validate_required_frontmatter,
extract_wikilinks,
FRONTMATTER_RE,
)
from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks
@@ -81,7 +45,6 @@ try:
except Exception:
embed_texts = None
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
@@ -120,8 +83,9 @@ def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
_, chunks_col, edges_col = collections(prefix)
f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=chunks_col, points_selector=f_chunks, wait=True)
f_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=edges_col, points_selector=f_edges, wait=True)
f_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.ValueList(list=[]))) # type: ignore
# Hinweis: neuere Qdrant-Clients unterstützen Filter bei delete via 'points_selector'.
# Falls dein Client hier zickt, nutze eine Scroll+Delete-Implementierung. (Aus Platzgründen hier nicht expandiert.)
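# Hedged sketch of the scroll+delete fallback mentioned above; the helper name and
# batch size are illustrative, and it assumes qdrant-client's scroll()/PointIdsList API:
def _scroll_delete(client, collection: str, flt: "rest.Filter", batch: int = 256) -> None:
    offset = None
    while True:
        # fetch a page of matching point ids (no payloads/vectors needed)
        points, offset = client.scroll(
            collection_name=collection, scroll_filter=flt, limit=batch,
            offset=offset, with_payload=False, with_vectors=False,
        )
        if not points:
            break
        # delete by explicit id list, which every client version supports
        client.delete(
            collection_name=collection,
            points_selector=rest.PointIdsList(points=[p.id for p in points]),
            wait=True,
        )
        if offset is None:
            break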
def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
try:
@@ -130,128 +94,16 @@ def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
rel = abs_path
return rel.replace("\\", "/").lstrip("/")
def _resolve_hash_mode(val: Optional[str]) -> str:
def _resolve_mode(val: Optional[str]) -> str:
v = (val or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
if v in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
return "body+frontmatter"
return "full"
if v in ("frontmatter", "fm"):
return "frontmatter"
return "body"
def _normalize_body(body: str, mode: str) -> str:
if mode == "none":
return body if body is not None else ""
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
text = "\n".join(line.rstrip() for line in text.split("\n"))
return text
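# e.g. _normalize_body("a \r\nb", "canonical") == "a\nb" (CRLF/CR -> LF, trailing whitespace per line stripped)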
def _canon_fm(fm: Dict[str, Any]) -> str:
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
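# e.g. _canon_fm({"b": 1, "a": 2}) == '{"a":2,"b":1}' (sorted keys, compact separators)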
def _read_raw_body(file_path: str) -> tuple[str, Dict[str, Any]]:
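"""Split a raw Markdown file into (body, frontmatter) via FRONTMATTER_RE; missing files or YAML errors yield empty values."""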
if not os.path.exists(file_path):
return "", {}
with open(file_path, "r", encoding="utf-8") as f:
raw = f.read()
m = FRONTMATTER_RE.match(raw)
fm = {}
if m:
import yaml
try:
fm = yaml.safe_load(m.group(1)) or {}
except Exception:
fm = {}
body = raw[m.end():]
else:
body = raw
return body, fm
def _fallback_note_payload(parsed: Any, vault_root: str, *, hash_mode: Optional[str], hash_normalize: Optional[str],
hash_source: Optional[str], file_path: str) -> Dict[str, Any]:
"""Baut einen minimalen Note-Payload, falls make_note_payload None liefert."""
fm = normalize_frontmatter(parsed.frontmatter)
path = getattr(parsed, "path", file_path)
rel_path = _normalize_rel_path(path, vault_root)
src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()
norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()
mode = _resolve_hash_mode(hash_mode)
if src == "raw":
body_raw, fm_raw = _read_raw_body(file_path)
# merge raw frontmatter in (do not overwrite parsed keys)
if fm_raw:
merged = dict(fm)
for k, v in fm_raw.items():
merged.setdefault(k, v)
fm = merged
body_for_hash = body_raw
else:
body_for_hash = getattr(parsed, "body", "") or ""
body_n = _normalize_body(body_for_hash, norm)
fm_s = _canon_fm(fm)
import hashlib
def h(s: str) -> str:
x = hashlib.sha256()
x.update(s.encode("utf-8"))
return x.hexdigest()
if mode == "frontmatter":
primary_hash = h(fm_s)
elif mode == "body+frontmatter":
primary_hash = h(body_n + "\n--FM--\n" + fm_s)
else:
primary_hash = h(body_n)
sig = f"{'full' if mode=='body+frontmatter' else mode}:{src}:{norm}:{primary_hash}"
refs = extract_wikilinks(getattr(parsed, "body", "") or "")
return {
"note_id": fm.get("id") or fm.get("note_id"),
"title": fm.get("title"),
"type": fm.get("type"),
"status": fm.get("status"),
"created": fm.get("created"),
"updated": fm.get("updated"),
"path": rel_path or fm.get("path"),
"tags": fm.get("tags"),
"hash_fulltext": primary_hash,
"hash_signature": sig,
"fulltext": getattr(parsed, "body", "") or "",
"references": list(dict.fromkeys(refs)),
}
def _coerce_to_dict(obj: Any) -> Optional[Dict[str, Any]]:
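"""Best-effort dict coercion: dicts/Mappings, (nested) lists/tuples, Pydantic-style
objects (model_dump()/dict()), or anything exposing __dict__; returns None otherwise."""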
if obj is None:
return None
if isinstance(obj, dict):
return obj
if isinstance(obj, Mapping):
try:
return dict(obj)
except Exception:
pass
if isinstance(obj, (list, tuple)):
for it in obj:
d = _coerce_to_dict(it)
if isinstance(d, dict):
return d
return None
md = getattr(obj, "model_dump", None)
if callable(md):
try: return md()
except Exception: pass
dd = getattr(obj, "dict", None)
if callable(dd):
try: return dd()
except Exception: pass
dct = getattr(obj, "__dict__", None)
if isinstance(dct, dict):
return dict(dct)
return None
def _env(key: str, default: str) -> str:
return os.environ.get(key, default).strip().lower()
# ---------------------------------------------------------------------
# Main
@@ -268,17 +120,27 @@ def main() -> None:
ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten")
ap.add_argument("--force-replace", action="store_true",
help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)")
ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None)
ap.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None,
help="Vergleichsmodus (Body | Frontmatter | Full)")
ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
ap.add_argument("--note-scope-refs", action="store_true")
ap.add_argument("--debug-hash-diff", action="store_true")
ap.add_argument("--compare-text", action="store_true")
ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None,
help="Quelle für die Hash-Berechnung (Default: parsed)")
ap.add_argument("--note-scope-refs", action="store_true",
help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
ap.add_argument("--debug-hash-diff", action="store_true",
help="Zeigt einen kurzen Diff zwischen altem und neuem Body")
ap.add_argument("--compare-text", action="store_true",
help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)")
ap.add_argument("--baseline-modes", action="store_true",
help="Fehlende Hash-Varianten im Feld 'hashes' still nachtragen (Upsert NUR Notes)")
args = ap.parse_args()
note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
mode = _resolve_mode(args.hash_mode)
src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed")
norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical")
note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false") == "true")
note_scope_refs = args.note_scope_refs or note_scope_refs_env
compare_text = args.compare_text or (os.environ.get("MINDNET_COMPARE_TEXT", "false").strip().lower() == "true")
compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true")
cfg = QdrantConfig.from_env()
client = get_client(cfg)
@@ -291,9 +153,10 @@ def main() -> None:
print("Keine Markdown-Dateien gefunden.", file=sys.stderr)
sys.exit(2)
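# Key into the per-mode 'hashes' map, e.g. "body:parsed:canonical"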
key_current = f"{mode}:{src}:{norm}"
processed = 0
for path in files:
# ------------- Parsing -------------
try:
parsed = read_markdown(path)
except Exception as e:
@@ -315,144 +178,137 @@
processed += 1
# ------------- Note-Payload -------------
try:
note_pl_raw = make_note_payload(
parsed,
vault_root=root,
hash_mode=args.hash_mode,
hash_normalize=args.hash_normalize,
hash_source=args.hash_source,
file_path=path,
)
except Exception as e:
print(json.dumps({"path": path, "note_id": fm.get("id"), "error": f"make_note_payload failed: {e}"}))
note_pl_raw = None
note_pl = _coerce_to_dict(note_pl_raw)
if not isinstance(note_pl, dict):
# Fallback: build a minimal payload
note_pl = _fallback_note_payload(
parsed,
root,
hash_mode=args.hash_mode,
hash_normalize=args.hash_normalize,
hash_source=args.hash_source,
file_path=path,
)
print(json.dumps({
"path": path,
"note_id": fm.get("id"),
"warn": "make_note_payload returned non-dict; used fallback",
"returned_type": type(note_pl_raw).__name__ if note_pl_raw is not None else "NoneType"
}))
# safety net for fulltext + path
# Recompute the payload (returns 'hashes' incl. the parsed:canonical triple)
note_pl = make_note_payload(
parsed,
vault_root=root,
hash_mode=mode,
hash_normalize=norm,
hash_source=src,
file_path=path,
)
if not note_pl.get("fulltext"):
note_pl["fulltext"] = getattr(parsed, "body", "") or ""
if note_pl.get("path"):
note_pl["path"] = _normalize_rel_path(
os.path.join(root, note_pl["path"]) if not os.path.isabs(note_pl["path"]) else note_pl["path"], root
)
else:
p_path = getattr(parsed, "path", None) or path
note_pl["path"] = _normalize_rel_path(p_path, root)
note_id = note_pl.get("note_id") or fm.get("id")
if not note_id:
print(json.dumps({"path": path, "error": "Missing note_id after payload build (even in fallback)"}))
print(json.dumps({"path": path, "error": "Missing note_id after payload build"}))
continue
# ------------- Change-Detection -------------
# fetch the old payload
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
old_sig = (old_payload or {}).get("hash_signature")
new_hash = note_pl.get("hash_fulltext")
new_sig = note_pl.get("hash_signature")
old_hashes = (old_payload or {}).get("hashes") or {}
old_hash_exact = old_hashes.get(key_current)
sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
hash_changed = (old_hash != new_hash)
# new hash for the current mode, taken from the fresh payload
new_hash_exact = (note_pl.get("hashes") or {}).get(key_current)
needs_baseline = (old_hash_exact is None)
# Change-Detection
hash_changed = (old_hash_exact is not None and new_hash_exact is not None and old_hash_exact != new_hash_exact)
text_changed = False
if compare_text:
old_text = (old_payload or {}).get("fulltext") or ""
new_text = note_pl.get("fulltext") or ""
text_changed = (old_text != new_text)
changed = args.force_replace or sig_changed or hash_changed or text_changed
# Important: a mode switch alone does *not* count as a change (option C).
# changed => only if a baseline exists and the hash differs, OR force-replace/text_changed applies.
changed = args.force_replace or hash_changed or text_changed
# ------------- Chunks / Embeddings / Edges -------------
try:
chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {e}"}))
continue
# Should the baseline be backfilled silently?
do_baseline_only = (args.baseline_modes and needs_baseline and not changed)
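# Decision per note (mirrors the 'decision' field in the summary below):
#   baseline missing + --baseline-modes + unchanged -> "baseline-only" (upsert notes only)
#   changed (hash/text) or --force-replace          -> "apply" (notes + chunks + edges)
#   --apply but unchanged                           -> "apply-skip-unchanged"
#   no --apply                                      -> "dry-run"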
if embed_texts:
# Prepare edges/chunks (only if we might actually write)
chunks = []
chunk_pls = []
edges = []
vecs = []
if changed or args.apply:
try:
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"}))
print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {e}"}))
continue
if embed_texts:
try:
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"}))
vecs = [[0.0] * cfg.dim for _ in chunks]
else:
vecs = [[0.0] * cfg.dim for _ in chunks]
else:
vecs = [[0.0] * cfg.dim for _ in chunks]
try:
note_refs = note_pl.get("references") or []
edges = build_edges_for_note(
note_id,
chunk_pls,
note_refs,
include_note_scope_refs=note_scope_refs,
)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"}))
continue
try:
note_refs = note_pl.get("references") or []
edges = build_edges_for_note(
note_id,
chunk_pls,
note_refs,
include_note_scope_refs=note_scope_refs,
)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"}))
continue
# ------------- Summary -------------
# Summary
summary = {
"note_id": note_id,
"title": fm.get("title"),
"chunks": len(chunk_pls),
"edges": len(edges),
"changed": changed,
"decision": ("apply" if args.apply and changed else
"needs_baseline_for_mode": needs_baseline,
"decision": ("baseline-only" if args.apply and do_baseline_only else
"apply" if args.apply and changed else
"apply-skip-unchanged" if args.apply and not changed else
"dry-run"),
"path": note_pl["path"],
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
"hash_signature": note_pl.get("hash_signature"),
"sig_changed": sig_changed,
"hash_changed": hash_changed,
"text_changed": text_changed,
"hash_mode": mode,
"hash_normalize": norm,
"hash_source": src,
}
print(json.dumps(summary, ensure_ascii=False))
# ------------- Upserts -------------
# Write anything?
if not args.apply:
continue
try:
if changed and args.purge_before_upsert:
purge_note_artifacts(client, cfg.prefix, note_id)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
try:
# BASELINE-ONLY: backfill the missing key without touching legacy fields
if do_baseline_only:
merged_hashes = {}
merged_hashes.update(old_hashes)
merged_hashes.update(note_pl.get("hashes") or {})
# Upsert notes only; leave the legacy hash fields as in the old payload
if old_payload:
note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext"))
note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature"))
note_pl["hashes"] = merged_hashes
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
upsert_batch(client, notes_name, note_pts)
chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
upsert_batch(client, chunks_name, chunk_pts)
edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
upsert_batch(client, edges_name, edge_pts)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "error": f"upsert failed: {e}"}))
continue
# Write normal changes (notes + chunks + edges)
if not changed:
continue
if args.purge_before_upsert:
try:
purge_note_artifacts(client, cfg.prefix, note_id)
except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
upsert_batch(client, notes_name, note_pts)
chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
upsert_batch(client, chunks_name, chunk_pts)
edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
upsert_batch(client, edges_name, edge_pts)
print(f"Done. Processed notes: {processed}")
if __name__ == "__main__":
main()