diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index a151810..97b68a3 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -2,29 +2,27 @@ # -*- coding: utf-8 -*- """ Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges) -Version: 3.5.2 +Version: 3.5.3 Datum: 2025-09-09 +Neu in 3.5.3 +------------ +- **Fallback**, falls app.core.note_payload.make_note_payload(None) / non-dict zurückgibt: + Der Importer baut einen minimalen, korrekten Note-Payload selbst (inkl. Hash + gemäß --hash-mode/--hash-source/--hash-normalize) und läuft weiter. +- Detaillierte Debug-Ausgaben mit Typ/Preview, falls die Rückgabeform abweicht. + Kurzbeschreibung ---------------- - Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges **idempotent**. -- **Änderungserkennung** (nur Inhalte, keine FS-Zeitstempel) konfigurierbar: - * ``--hash-mode``: body | frontmatter | body+frontmatter | full (Alias) - - Env: ``MINDNET_HASH_MODE`` **oder** ``MINDNET_HASH_COMPARE`` (Body/Frontmatter/Full) - * ``--hash-normalize``: canonical | none (Default: canonical) - * ``--hash-source``: parsed (Default) | raw - - "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt). - * Optional: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``) - - parsed ``fulltext`` zusätzlich direkt vergleichen (falls Normalisierung Unterschiede glättet). - * Signaturabgleich: - - Wenn sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet, - gilt die Note als **geändert** (Einmal-Update, um die neue Signatur zu persistieren). - -Robustheit ----------- -- Rückgaben aus ``make_note_payload`` werden **koerziert** (Tuple, Mapping, Pydantic v1/v2, Objekt) → ``dict``. -- Bei Nicht-Erfolg präzise Debug-Ausgabe (Typname + kurzer Preview). -- Defensive Fehlerbehandlung in allen Schritten (Parsing, Chunks, Edges, Upserts). +- Änderungserkennung (nur Inhalte, keine FS-Zeitstempel) konfigurierbar: + * --hash-mode: body | frontmatter | body+frontmatter | full (Alias) + - Env: MINDNET_HASH_MODE oder MINDNET_HASH_COMPARE (Body/Frontmatter/Full) + * --hash-normalize: canonical | none (Default: canonical) + * --hash-source: parsed (Default) | raw + * Optional: --compare-text (oder ENV MINDNET_COMPARE_TEXT=true) + * Signaturabgleich: Wechselt die Signatur (z. B. body→full, parsed→raw, canonical→none), + gilt die Note einmalig als geändert. ENV / Qdrant ------------ @@ -36,16 +34,9 @@ ENV / Qdrant Aufruf-Beispiele ---------------- - # Standard (nur Body, kanonisiert, parsed-Quelle) python3 -m scripts.import_markdown --vault ./vault - - # Sensibel (jede Kleinigkeit): raw-Quelle + keine Normalisierung python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none - - # Full-Vergleich (Body+Frontmatter) MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply - - # Zusätzlich Body-Text direkt vergleichen (max. Sicherheit) python3 -m scripts.import_markdown --vault ./vault --apply --compare-text """ from __future__ import annotations @@ -65,6 +56,8 @@ from app.core.parser import ( read_markdown, normalize_frontmatter, validate_required_frontmatter, + extract_wikilinks, + FRONTMATTER_RE, ) from app.core.note_payload import make_note_payload from app.core.chunker import assemble_chunks @@ -137,17 +130,100 @@ def _normalize_rel_path(abs_path: str, vault_root: str) -> str: rel = abs_path return rel.replace("\\", "/").lstrip("/") +def _resolve_hash_mode(val: Optional[str]) -> str: + v = (val or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower() + if v in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"): + return "body+frontmatter" + if v in ("frontmatter", "fm"): + return "frontmatter" + return "body" + +def _normalize_body(body: str, mode: str) -> str: + if mode == "none": + return body if body is not None else "" + text = (body or "").replace("\r\n", "\n").replace("\r", "\n") + text = "\n".join(line.rstrip() for line in text.split("\n")) + return text + +def _canon_fm(fm: Dict[str, Any]) -> str: + return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True) + +def _read_raw_body(file_path: str) -> tuple[str, Dict[str, Any]]: + if not os.path.exists(file_path): + return "", {} + with open(file_path, "r", encoding="utf-8") as f: + raw = f.read() + m = FRONTMATTER_RE.match(raw) + fm = {} + if m: + import yaml + try: + fm = yaml.safe_load(m.group(1)) or {} + except Exception: + fm = {} + body = raw[m.end():] + else: + body = raw + return body, fm + +def _fallback_note_payload(parsed: Any, vault_root: str, *, hash_mode: Optional[str], hash_normalize: Optional[str], + hash_source: Optional[str], file_path: str) -> Dict[str, Any]: + """Baut einen minimalen Note-Payload, falls make_note_payload None liefert.""" + fm = normalize_frontmatter(parsed.frontmatter) + path = getattr(parsed, "path", file_path) + rel_path = _normalize_rel_path(path, vault_root) + src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower() + norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower() + mode = _resolve_hash_mode(hash_mode) + + if src == "raw": + body_raw, fm_raw = _read_raw_body(file_path) + # Roh-FM ergänzen (nicht überschreiben) + if fm_raw: + merged = dict(fm) + for k, v in fm_raw.items(): + merged.setdefault(k, v) + fm = merged + body_for_hash = body_raw + else: + body_for_hash = getattr(parsed, "body", "") or "" + + body_n = _normalize_body(body_for_hash, norm) + fm_s = _canon_fm(fm) + + import hashlib + def h(s: str) -> str: + x = hashlib.sha256() + x.update(s.encode("utf-8")) + return x.hexdigest() + + if mode == "frontmatter": + primary_hash = h(fm_s) + elif mode == "body+frontmatter": + primary_hash = h(body_n + "\n--FM--\n" + fm_s) + else: + primary_hash = h(body_n) + + sig = f"{'full' if mode=='body+frontmatter' else mode}:{src}:{norm}:{primary_hash}" + + refs = extract_wikilinks(getattr(parsed, "body", "") or "") + + return { + "note_id": fm.get("id") or fm.get("note_id"), + "title": fm.get("title"), + "type": fm.get("type"), + "status": fm.get("status"), + "created": fm.get("created"), + "updated": fm.get("updated"), + "path": rel_path or fm.get("path"), + "tags": fm.get("tags"), + "hash_fulltext": primary_hash, + "hash_signature": sig, + "fulltext": getattr(parsed, "body", "") or "", + "references": list(dict.fromkeys(refs)), + } + def _coerce_to_dict(obj: Any) -> Optional[Dict[str, Any]]: - """ - Versucht, verschiedenartige Rückgaben (Mapping, Tuple, Pydantic, Objekt) in ein dict zu konvertieren. - - dict → dict - - Mapping → dict(obj) - - (dict, ...) oder [dict, ...] → erster dict-ähnlicher Eintrag - - Pydantic v2: .model_dump() - - Pydantic v1: .dict() - - Objekt mit __dict__ → dict(__dict__) - - sonst: None - """ if obj is None: return None if isinstance(obj, dict): @@ -163,21 +239,14 @@ def _coerce_to_dict(obj: Any) -> Optional[Dict[str, Any]]: if isinstance(d, dict): return d return None - # Pydantic v2 md = getattr(obj, "model_dump", None) if callable(md): - try: - return md() - except Exception: - pass - # Pydantic v1 + try: return md() + except Exception: pass dd = getattr(obj, "dict", None) if callable(dd): - try: - return dd() - except Exception: - pass - # generischer Fallback + try: return dd() + except Exception: pass dct = getattr(obj, "__dict__", None) if isinstance(dct, dict): return dict(dct) @@ -199,22 +268,16 @@ def main() -> None: ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten") ap.add_argument("--force-replace", action="store_true", help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)") - ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None, - help="Vergleichsmodus: Body | Frontmatter | body+frontmatter (Alias: full)") + ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None) ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None) - ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None, - help="Quelle für die Hash-Berechnung (Default: parsed)") - ap.add_argument("--note-scope-refs", action="store_true", - help="(Optional) erzeugt zusätzlich references:note (Default: aus)") - ap.add_argument("--debug-hash-diff", action="store_true", - help="Zeigt bei Bedarf einen kurzen Diff zwischen altem und neuem Body") - ap.add_argument("--compare-text", action="store_true", - help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)") + ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None) + ap.add_argument("--note-scope-refs", action="store_true") + ap.add_argument("--debug-hash-diff", action="store_true") + ap.add_argument("--compare-text", action="store_true") args = ap.parse_args() note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true") note_scope_refs = args.note_scope_refs or note_scope_refs_env - compare_text = args.compare_text or (os.environ.get("MINDNET_COMPARE_TEXT", "false").strip().lower() == "true") cfg = QdrantConfig.from_env() @@ -230,7 +293,7 @@ def main() -> None: processed = 0 for path in files: - # ----------------- robustes Parsing ----------------- + # ------------- Parsing ------------- try: parsed = read_markdown(path) except Exception as e: @@ -242,11 +305,6 @@ def main() -> None: try: fm = normalize_frontmatter(parsed.frontmatter) - except Exception as e: - print(json.dumps({"path": path, "error": f"normalize_frontmatter failed: {e}"})) - continue - - try: validate_required_frontmatter(fm) except Exception as e: print(json.dumps({"path": path, "error": f"Frontmatter invalid: {e}"})) @@ -257,7 +315,7 @@ def main() -> None: processed += 1 - # -------------- Note-Payload (defensiv + Koerzierung) -------------- + # ------------- Note-Payload ------------- try: note_pl_raw = make_note_payload( parsed, @@ -269,23 +327,27 @@ def main() -> None: ) except Exception as e: print(json.dumps({"path": path, "note_id": fm.get("id"), "error": f"make_note_payload failed: {e}"})) - continue + note_pl_raw = None note_pl = _coerce_to_dict(note_pl_raw) if not isinstance(note_pl, dict): - preview = repr(note_pl_raw) - if len(preview) > 240: - preview = preview[:240] + "…" + # Fallback: baue minimalen Payload + note_pl = _fallback_note_payload( + parsed, + root, + hash_mode=args.hash_mode, + hash_normalize=args.hash_normalize, + hash_source=args.hash_source, + file_path=path, + ) print(json.dumps({ "path": path, "note_id": fm.get("id"), - "error": "make_note_payload returned non-dict", - "returned_type": type(note_pl_raw).__name__, - "preview": preview + "warn": "make_note_payload returned non-dict; used fallback", + "returned_type": type(note_pl_raw).__name__ if note_pl_raw is not None else "NoneType" })) - continue - # fulltext sicherstellen + Pfad normalisieren + # fulltext + path Sicherheit if not note_pl.get("fulltext"): note_pl["fulltext"] = getattr(parsed, "body", "") or "" if note_pl.get("path"): @@ -298,10 +360,10 @@ def main() -> None: note_id = note_pl.get("note_id") or fm.get("id") if not note_id: - print(json.dumps({"path": path, "error": "Missing note_id after payload build"})) + print(json.dumps({"path": path, "error": "Missing note_id after payload build (even in fallback)"})) continue - # -------------- Change-Detection -------------- + # ------------- Change-Detection ------------- old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id) old_hash = None if not old_payload else old_payload.get("hash_fulltext") old_sig = (old_payload or {}).get("hash_signature") @@ -319,17 +381,12 @@ def main() -> None: changed = args.force_replace or sig_changed or hash_changed or text_changed - # -------------- Chunks / Embeddings / Edges -------------- + # ------------- Chunks / Embeddings / Edges ------------- try: chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept")) - except Exception as e: - print(json.dumps({"path": path, "note_id": note_id, "error": f"assemble_chunks failed: {e}"})) - continue - - try: chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) except Exception as e: - print(json.dumps({"path": path, "note_id": note_id, "error": f"make_chunk_payloads failed: {e}"})) + print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {e}"})) continue if embed_texts: @@ -353,7 +410,7 @@ def main() -> None: print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"})) continue - # -------------- Zusammenfassung / Ausgabe -------------- + # ------------- Summary ------------- summary = { "note_id": note_id, "title": fm.get("title"), @@ -374,7 +431,7 @@ def main() -> None: } print(json.dumps(summary, ensure_ascii=False)) - # -------------- Upserts -------------- + # ------------- Upserts ------------- if not args.apply: continue