From f5e6fcc097ffd300f20576020c86b722738b4cbd Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 9 Sep 2025 16:36:27 +0200 Subject: [PATCH] scripts/import_markdown.py aktualisiert --- scripts/import_markdown.py | 169 ++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 76 deletions(-) diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 149afc1..1d870d9 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges) -Version: 3.5.0 +Version: 3.5.1 Datum: 2025-09-09 Kurzbeschreibung @@ -14,12 +14,11 @@ Kurzbeschreibung * ``--hash-normalize``: canonical | none (Default: canonical) * ``--hash-source``: parsed (Default) | raw - "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt). - * **NEU**: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``) - - Zusätzlich zum Hash wird der *parsed* ``fulltext`` direkt verglichen. - - Erkennt Änderungen auch dann, wenn eine Normalisierung Unterschiede glättet. - * **NEU**: Signaturabgleich: - - Falls sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet, - gilt die Note als **geändert** (Einmal-Update, um neue Signatur zu persistieren). + * Optional: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``) + - parsed ``fulltext`` zusätzlich direkt vergleichen (falls Normalisierung Unterschiede glättet). + * Signaturabgleich: + - Wenn sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet, + gilt die Note als **geändert** (Einmal-Update, um die neue Signatur zu persistieren). ENV / Qdrant ------------ @@ -40,7 +39,7 @@ Aufruf-Beispiele # Full-Vergleich (Body+Frontmatter) MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply - # Zusätzlich Body-Text direkt vergleichen (maximale Sicherheit) + # Zusätzlich Body-Text direkt vergleichen (max. Sicherheit) python3 -m scripts.import_markdown --vault ./vault --apply --compare-text """ from __future__ import annotations @@ -177,8 +176,21 @@ def main() -> None: processed = 0 for path in files: - parsed = read_markdown(path) - fm = normalize_frontmatter(parsed.frontmatter) + # ----------------- robustes Parsing ----------------- + try: + parsed = read_markdown(path) + except Exception as e: + print(json.dumps({"path": path, "error": f"read_markdown failed: {e}"})) + continue + if parsed is None: + print(json.dumps({"path": path, "error": "read_markdown returned None"})) + continue + + try: + fm = normalize_frontmatter(parsed.frontmatter) + except Exception as e: + print(json.dumps({"path": path, "error": f"normalize_frontmatter failed: {e}"})) + continue try: validate_required_frontmatter(fm) @@ -191,41 +203,50 @@ def main() -> None: processed += 1 - # Note-Payload (inkl. Hash-Steuerung & Quelle) - note_pl = make_note_payload( - parsed, - vault_root=root, - hash_mode=args.hash_mode, - hash_normalize=args.hash_normalize, - hash_source=args.hash_source, - file_path=path, - ) + # -------------- Note-Payload (defensiv) -------------- + try: + note_pl = make_note_payload( + parsed, + vault_root=root, + hash_mode=args.hash_mode, + hash_normalize=args.hash_normalize, + hash_source=args.hash_source, + file_path=path, + ) + except Exception as e: + print(json.dumps({"path": path, "note_id": fm.get("id"), "error": f"make_note_payload failed: {e}"})) + continue - if "fulltext" not in (note_pl or {}): - note_pl["fulltext"] = parsed.body or "" + if not isinstance(note_pl, dict): + print(json.dumps({"path": path, "note_id": fm.get("id"), "error": "make_note_payload returned non-dict"})) + continue + + # fulltext sicherstellen + Pfad normalisieren + if not note_pl.get("fulltext"): + note_pl["fulltext"] = getattr(parsed, "body", "") or "" if note_pl.get("path"): note_pl["path"] = _normalize_rel_path( os.path.join(root, note_pl["path"]) if not os.path.isabs(note_pl["path"]) else note_pl["path"], root ) else: - note_pl["path"] = _normalize_rel_path(parsed.path, root) + p_path = getattr(parsed, "path", None) or path + note_pl["path"] = _normalize_rel_path(p_path, root) - note_id = note_pl["note_id"] + note_id = note_pl.get("note_id") or fm.get("id") + if not note_id: + print(json.dumps({"path": path, "error": "Missing note_id after payload build"})) + continue - # Change-Detection (nur Inhalte, keine FS-Timestamps) + # -------------- Change-Detection -------------- old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id) old_hash = None if not old_payload else old_payload.get("hash_fulltext") old_sig = (old_payload or {}).get("hash_signature") new_hash = note_pl.get("hash_fulltext") new_sig = note_pl.get("hash_signature") - # 1) Signaturwechsel → als geändert behandeln (Einmal-Update) sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3]) - - # 2) Hash-Vergleich hash_changed = (old_hash != new_hash) - # 3) Optional: Parsed-Text direkt vergleichen (zusätzlich) text_changed = False if compare_text: old_text = (old_payload or {}).get("fulltext") or "" @@ -234,49 +255,41 @@ def main() -> None: changed = args.force_replace or sig_changed or hash_changed or text_changed - # Optionales Debugging: kompakten Diff anzeigen - if args.debug_hash_diff: - old_text = (old_payload or {}).get("fulltext") or "" - new_text = note_pl.get("fulltext") or "" - # Hinweis, wenn Hash gleich aber Text verschieden (oder Signaturwechsel) - if (not hash_changed) and (old_text != new_text or sig_changed): - print(json.dumps({ - "debug": "hash_equal_but_text_or_signature_differs", - "note_id": note_id, - "old_sig": old_sig, - "new_sig": new_sig, - "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"), - "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"), - "hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"), - }, ensure_ascii=False)) - if old_text and new_text: - ud = list(difflib.unified_diff( - old_text.splitlines(), new_text.splitlines(), - fromfile="qdrant_fulltext(old)", tofile="vault_body(new)", - n=3 - )) - if ud: - preview = "\n".join(ud[:50]) - print(json.dumps({"note_id": note_id, "diff_preview": preview}, ensure_ascii=False)) + # -------------- Chunks / Embeddings / Edges -------------- + try: + chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept")) + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "error": f"assemble_chunks failed: {e}"})) + continue + + try: + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "error": f"make_chunk_payloads failed: {e}"})) + continue - # Chunks + Embeddings (Nullvektor-Fallback) - chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) - chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) if embed_texts: - vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined] + try: + vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined] + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"})) + vecs = [[0.0] * cfg.dim for _ in chunks] else: vecs = [[0.0] * cfg.dim for _ in chunks] - # Edges (neues Schema, mit note_id als Owner) - note_refs = note_pl.get("references") or [] - edges = build_edges_for_note( - note_id, - chunk_pls, - note_refs, - include_note_scope_refs=note_scope_refs, - ) + try: + note_refs = note_pl.get("references") or [] + edges = build_edges_for_note( + note_id, + chunk_pls, + note_refs, + include_note_scope_refs=note_scope_refs, + ) + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"})) + continue - # Zusammenfassung pro Datei + # -------------- Zusammenfassung / Ausgabe -------------- summary = { "note_id": note_id, "title": fm.get("title"), @@ -297,21 +310,25 @@ def main() -> None: } print(json.dumps(summary, ensure_ascii=False)) + # -------------- Upserts -------------- if not args.apply: continue - if changed and args.purge_before_upsert: - purge_note_artifacts(client, cfg.prefix, note_id) + try: + if changed and args.purge_before_upsert: + purge_note_artifacts(client, cfg.prefix, note_id) + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"})) - # Upserts - notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) - upsert_batch(client, notes_name, note_pts) - - chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs) - upsert_batch(client, chunks_name, chunk_pts) - - edges_name, edge_pts = points_for_edges(cfg.prefix, edges) - upsert_batch(client, edges_name, edge_pts) + try: + notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) + upsert_batch(client, notes_name, note_pts) + chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs) + upsert_batch(client, chunks_name, chunk_pts) + edges_name, edge_pts = points_for_edges(cfg.prefix, edges) + upsert_batch(client, edges_name, edge_pts) + except Exception as e: + print(json.dumps({"path": path, "note_id": note_id, "error": f"upsert failed: {e}"})) print(f"Done. Processed notes: {processed}")