#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Script: scripts/import_markdown.py Version: 3.9.3 Date: 2025-11-08 Zweck ----- Importer für Obsidian-Markdown-Notizen in Qdrant: - Einlesen (Frontmatter/Body) - Chunking (unterstützt alte und neue Chunk-Pipelines) - Edges ableiten (kompatibel zu alten derive_edges-Signaturen) - Hash-Detektion (ENV-gesteuert) - Upsert Notes/Chunks/Edges (inkl. Notes-Vector, falls Collection Vektor verlangt) Kompatibilität -------------- - Parser mit/ohne `body_full` - `make_chunk_payloads(parsed, note_pl, chunks)` ODER ältere Signaturen - `build_edges_for_note(parsed, chunks)` ODER neue Signaturen (optional mit note_scope_refs) - Qdrant-Collections mit/ohne Vektorerwartung; Notes erhalten notfalls einen Zero-Vector - Prefix-Auflösung: CLI --prefix > COLLECTION_PREFIX > MINDNET_PREFIX > "mindnet" ENV (Hash-Steuerung) -------------------- MINDNET_HASH_COMPARE : Body | Frontmatter | Full (default: Body) MINDNET_HASH_SOURCE : parsed | raw (default: parsed) MINDNET_HASH_NORMALIZE : canonical | none (default: canonical) Weitere ENV ----------- MINDNET_NOTE_VECTOR_D : Dimension des Note-Vektors (default: aus QdrantConfig oder 384) """ from __future__ import annotations import argparse import inspect import json import os import sys from pathlib import Path from typing import Dict, List, Optional, Tuple # Parser & Payloads from app.core.parser import read_markdown from app.core.note_payload import make_note_payload from app.core.chunk_payload import make_chunk_payloads from app.core.derive_edges import build_edges_for_note # Qdrant Helper from app.core.qdrant import ( QdrantConfig, get_client, ensure_collections, count_points, ) from app.core.qdrant_points import ( upsert_notes, upsert_chunks, upsert_edges, delete_by_note, ) # Optionales Chunk-Assembly (neuere Pipeline) try: from app.core.chunker import assemble_chunks # bevorzugt except Exception: # pragma: no cover assemble_chunks = None # ---------------------------- # Utilities # ---------------------------- def _env(key: str, default: str = "") -> str: v = os.environ.get(key, "") return v if v != "" else default def _hash_mode() -> str: m = _env("MINDNET_HASH_COMPARE", "Body").lower() return m if m in ("body", "frontmatter", "full") else "body" def _hash_source() -> str: s = _env("MINDNET_HASH_SOURCE", "parsed").lower() return s if s in ("parsed", "raw") else "parsed" def _hash_normalize() -> str: n = _env("MINDNET_HASH_NORMALIZE", "canonical").lower() return n if n in ("canonical", "none") else "canonical" def _safe_text(parsed) -> str: """Bevorzugt parsed.body_full, sonst parsed.body.""" return getattr(parsed, "body_full", None) or getattr(parsed, "body", "") or "" def _load_prefix(arg_prefix: Optional[str]) -> str: if arg_prefix and arg_prefix.strip(): return arg_prefix.strip() env_prefix = os.environ.get("COLLECTION_PREFIX") or os.environ.get("MINDNET_PREFIX") return (env_prefix or "mindnet").strip() def _print(obj): sys.stdout.write(json.dumps(obj, ensure_ascii=False) + "\n") sys.stdout.flush() def _iter_md(vault: Path) -> List[Path]: out: List[Path] = [] for p in sorted(vault.rglob("*.md")): if p.is_file(): out.append(p) return out def _note_vector_dim(cfg: QdrantConfig) -> int: # Bevorzugt Konfig, sonst ENV, sonst 384 # Viele Setups nutzen 384 (MiniLM 384d) dim = getattr(cfg, "notes_vector_dim", None) if isinstance(dim, int) and dim > 0: return dim env_dim = _env("MINDNET_NOTE_VECTOR_D", "") if env_dim.isdigit(): try: d = int(env_dim) if d > 0: return d except Exception: pass return 384 def _ensure_note_vector(note_pl: Dict, cfg: QdrantConfig) -> None: # Falls die Notes-Collection einen dichten Vektor verlangt, muss `vector` gesetzt sein. # Wir setzen einen Zero-Vector mit korrekter Dimension. if "vector" not in note_pl or note_pl["vector"] is None: d = _note_vector_dim(cfg) note_pl["vector"] = [0.0] * d # ---------------------------- # Signatur-kompatible Aufrufe # ---------------------------- def _call_make_chunk_payloads(parsed, note_pl, raw_chunks: Optional[List[Dict]] = None) -> List[Dict]: """ Ruft make_chunk_payloads mit der passenden Signatur auf. Historisch gab es Varianten: A) make_chunk_payloads(parsed, note_pl, chunks) B) make_chunk_payloads(parsed, note_pl) C) make_chunk_payloads(note_pl, chunks) Wir erkennen das zur Laufzeit. """ sig = inspect.signature(make_chunk_payloads) params = list(sig.parameters.keys()) # Versuche die plausibelste moderne Variante zuerst try_order = [] if params[:3] == ["parsed", "note_pl", "chunks"]: try_order = [("parsed_note_chunks",)] elif params[:2] == ["parsed", "note_pl"]: try_order = [("parsed_note",)] elif params[:2] == ["note_pl", "chunks"]: try_order = [("note_chunks",)] else: # generischer Fallback: wir probieren die drei Muster try_order = [("parsed_note_chunks",), ("parsed_note",), ("note_chunks",)] last_err = None for variant in try_order: try: if variant == ("parsed_note_chunks",): if raw_chunks is None: # wenn Signatur die Chunks erwartet, aber keine vorhanden sind, baue konservativ 1 Chunk raw_chunks = [{ "chunk_id": f"{note_pl.get('note_id', 'note')}#1", "text": _safe_text(parsed), "window": _safe_text(parsed), "order": 1, "path": note_pl.get("path", ""), }] return make_chunk_payloads(parsed, note_pl, raw_chunks) # type: ignore elif variant == ("parsed_note",): return make_chunk_payloads(parsed, note_pl) # type: ignore elif variant == ("note_chunks",): if raw_chunks is None: raw_chunks = [{ "chunk_id": f"{note_pl.get('note_id', 'note')}#1", "text": _safe_text(parsed), "window": _safe_text(parsed), "order": 1, "path": note_pl.get("path", ""), }] return make_chunk_payloads(note_pl, raw_chunks) # type: ignore except Exception as e: last_err = e raise RuntimeError(f"make_chunk_payloads invocation failed: {last_err}") def _call_build_edges_for_note(parsed, chunk_payloads: List[Dict], note_scope_refs: bool) -> List[Dict]: """ Ruft build_edges_for_note mit kompatibler Signatur auf. Historisch: A) build_edges_for_note(parsed, chunks) B) build_edges_for_note(parsed, chunks, note_scope_refs=True/False) """ sig = inspect.signature(build_edges_for_note) params = list(sig.parameters.keys()) try: if "note_scope_refs" in params: return build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs) # type: ignore else: return build_edges_for_note(parsed, chunk_payloads) # type: ignore except TypeError: # strenger Fallback: ohne Zusatzparameter return build_edges_for_note(parsed, chunk_payloads) # type: ignore # ---------------------------- # Hauptverarbeitung # ---------------------------- def process_file( path: Path, cfg: QdrantConfig, note_scope_refs: bool, apply: bool, purge_before_upsert: bool, ) -> Tuple[Optional[dict], List[dict], List[dict]]: try: parsed = read_markdown(str(path)) except Exception as e: _print({"path": str(path), "error": f"read_markdown failed: {e.__class__.__name__}: {e}"}) return None, [], [] # Note-Payload try: note_pl = make_note_payload(parsed, vault_root=str(path.parent.parent)) if not isinstance(note_pl, dict): note_pl = { "note_id": parsed.frontmatter.get("id") or path.stem, "title": parsed.frontmatter.get("title") or path.stem, "status": parsed.frontmatter.get("status", "unknown"), "path": str(path).replace("\\", "/"), "tags": parsed.frontmatter.get("tags", []), } note_pl["fulltext"] = _safe_text(parsed) note_pl["hash_signature"] = f"{_hash_mode()}:{_hash_source()}:{_hash_normalize()}" # Notes-Vector sicherstellen (Zero-Vector, wenn Collection ihn verlangt) _ensure_note_vector(note_pl, cfg) except Exception as e: _print({"path": str(path), "error": f"make_note_payload failed: {e}"}) return None, [], [] # Roh-Chunks (falls assemble_chunks verfügbar) raw_chunks: Optional[List[Dict]] = None if assemble_chunks is not None: try: raw_chunks = assemble_chunks( note_pl.get("note_id", path.stem), _safe_text(parsed), parsed.frontmatter.get("type", "concept"), ) except Exception as e: _print({"path": str(path), "note_id": note_pl.get("note_id"), "warn": f"assemble_chunks failed: {e}"}) raw_chunks = None # Chunk-Payloads try: chunk_payloads = _call_make_chunk_payloads(parsed, note_pl, raw_chunks) if not isinstance(chunk_payloads, list): chunk_payloads = [] except Exception as e: _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"make_chunk_payloads failed: {e}"}) chunk_payloads = [] # Edges try: edges = _call_build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs) except Exception as e: _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"build_edges_for_note failed: {e}"}) edges = [] return note_pl, chunk_payloads, edges def main(): ap = argparse.ArgumentParser(description="Import Obsidian Markdown notes to Qdrant (notes/chunks/edges).") ap.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Wurzel).") ap.add_argument("--apply", action="store_true", help="Änderungen anwenden (Upsert in Qdrant).") ap.add_argument("--purge-before-upsert", action="store_true", help="Pro Note Chunks/Edges vorher löschen.") ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope Referenzen (falls unterstützt).") ap.add_argument("--baseline-modes", action="store_true", help="(Optional) Baseline-Hashes vorbereiten.") ap.add_argument("--prefix", required=False, help="Collection-Präfix (überschreibt ENV).") args = ap.parse_args() vault = Path(args.vault).resolve() if not vault.exists(): ap.error(f"Vault nicht gefunden: {vault}") prefix = _load_prefix(args.prefix) cfg = QdrantConfig.from_env(prefix=prefix) client = get_client(cfg) ensure_collections(client, cfg) files = _iter_md(vault) if not files: _print({"summary": "done", "processed": 0, "prefix": cfg.prefix}) return if args.baseline_modes: _print({"action": "baseline", "modes": ["body", "frontmatter", "full"], "source": _hash_source(), "norm": _hash_normalize()}) processed = 0 for p in files: note_pl, chunk_payloads, edges = process_file( p, cfg, note_scope_refs=args.note_scope_refs, apply=args.apply, purge_before_upsert=args.purge_before_upsert, ) if not note_pl: continue info = { "note_id": note_pl.get("note_id"), "title": note_pl.get("title"), "chunks": len(chunk_payloads), "edges": len(edges), "changed": True, # Detail-Hashing passiert innerhalb der Payload-Builder "decision": "apply" if args.apply else "dry-run", "path": str(p.relative_to(vault)).replace("\\", "/"), "hash_mode": _hash_mode(), "hash_normalize": _hash_normalize(), "hash_source": _hash_source(), "prefix": cfg.prefix, } if args.apply: if args.purge_before_upsert: try: delete_by_note(client, cfg, note_pl.get("note_id", "")) except Exception as e: _print({"note_id": note_pl.get("note_id"), "warn": f"delete_by_note failed: {e}"}) try: upsert_notes(client, cfg, [note_pl]) except Exception as e: _print({"note_id": note_pl.get("note_id"), "error": f"upsert_notes failed: {e}"}) if chunk_payloads: try: upsert_chunks(client, cfg, chunk_payloads) except Exception as e: _print({"note_id": note_pl.get("note_id"), "error": f"upsert_chunks failed: {e}"}) if edges: try: upsert_edges(client, cfg, edges) except Exception as e: _print({"note_id": note_pl.get("note_id"), "error": f"upsert_edges failed: {e}"}) _print(info) processed += 1 _print({ "summary": "done", "processed": processed, "prefix": cfg.prefix, "collections": {"notes": cfg.notes, "chunks": cfg.chunks, "edges": cfg.edges}, "counts": count_points(client, cfg), }) if __name__ == "__main__": main()