#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py
Version: 0.6.0 (2025-09-06)
Author: mindnet / architecture, data imports & sync

Summary
-------
Imports Markdown notes from an Obsidian-like vault into Qdrant:
- Validates frontmatter / note payload.
- Chunking + embeddings.
- Derives edges directly at import time from [[wikilinks]]:
    - 'references'    (note -> note)
    - 'references_at' (chunk -> note)
    - 'backlink'      (note <- note), only for note->note edges.

New in 0.6.0
------------
- Option `--purge-before-upsert`: selectively deletes all chunks and edges
  belonging to the currently processed note in Qdrant *before* the upsert,
  to avoid stale leftovers after re-chunking.
- Robust link resolution via a note index (ID / title slug / file slug),
  consistent with `derive_edges.py`.

Usage examples
--------------
Dry run (no writes):
    python3 -m scripts.import_markdown --vault ./vault

Only one specific note:
    python3 -m scripts.import_markdown --vault ./vault --note-id 20250821-foo

Apply (write) with purge:
    python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert

Parameters
----------
--vault PATH            : Required. Root directory of the vault.
--apply                 : If set, upserts are performed (otherwise dry run).
--purge-before-upsert   : If set, old chunks and edges of the current note
                          are deleted in Qdrant before the upsert
                          (only effective together with --apply).
--note-id ID            : Optional, process only this single note.

Environment variables (.env)
----------------------------
QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM
Defaults: url=http://127.0.0.1:6333, prefix=mindnet, dim=384

Compatibility
-------------
Uses the existing core modules:
    app.core.parser        (read_markdown, normalize_frontmatter,
                            validate_required_frontmatter)
    app.core.validate_note (validate_note_payload)
    app.core.chunker       (assemble_chunks)
    app.core.chunk_payload (make_chunk_payloads)
    app.core.embed         (embed_texts)
    app.core.qdrant        (QdrantConfig, get_client, ensure_collections)
    app.core.qdrant_points (points_for_note, points_for_chunks,
                            points_for_edges, upsert_batch)
    app.core.derive_edges  (build_note_index, derive_wikilink_edges)

Changes vs. the previous importer
---------------------------------
- The old global delete workarounds are gone. Selective purge is now
  optional and safe.
- Edges are only produced in the new, unified structure.
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from typing import Dict, List

from dotenv import load_dotenv
from qdrant_client.http import models as rest

# Core building blocks (present in the project)
from app.core.parser import (
    read_markdown,
    normalize_frontmatter,
    validate_required_frontmatter,
)
from app.core.validate_note import validate_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
from app.core.embed import embed_texts
from app.core.qdrant import QdrantConfig, ensure_collections, get_client, collection_names
from app.core.qdrant_points import (
    points_for_note,
    points_for_chunks,
    points_for_edges,
    upsert_batch,
)
from app.core.derive_edges import build_note_index, derive_wikilink_edges


# -------------------------------------------------
# Helpers
# -------------------------------------------------

def iter_md(root: str) -> List[str]:
    """Return all *.md files under *root* (recursive), de-duplicated and sorted."""
    patterns = ["**/*.md", "*.md"]
    out: List[str] = []
    for p in patterns:
        out.extend(glob.glob(os.path.join(root, p), recursive=True))
    # dict.fromkeys keeps first occurrence -> order-stable de-dupe, then sort.
    return sorted(dict.fromkeys(out))


def make_note_stub(abs_path: str, vault_root: str) -> Dict:
    """
    Build a minimal note stub for the index (build_note_index):
        {note_id, title, path}

    Raises
    ------
    ValueError
        If the note's frontmatter has no non-empty ``id``.
    """
    parsed = read_markdown(abs_path)
    fm = normalize_frontmatter(parsed.frontmatter or {})
    # Minimal validation: we need id + title (title is optional, used for slug resolution).
    if not fm.get("id"):
        raise ValueError(f"Missing id in frontmatter: {abs_path}")
    rel = os.path.relpath(abs_path, vault_root)
    return {"note_id": fm["id"], "title": fm.get("title"), "path": rel}


def build_vault_index(vault_root: str) -> tuple[Dict, Dict, Dict]:
    """
    Read all notes and build the triple index (id / title slug / file slug)
    used for wikilink resolution.

    Notes without a valid frontmatter id are skipped here; the importer
    skips them later anyway.
    """
    stubs = []
    for p in iter_md(vault_root):
        try:
            stubs.append(make_note_stub(p, vault_root))
        except Exception:
            # Note without an id -> deliberately ignored (best effort).
            continue
    return build_note_index(stubs)


def purge_for_note(client, prefix: str, note_id: str, chunk_ids: List[str]) -> Dict[str, int]:
    """
    Selective purge for the current note:
      - Chunks: everything with payload.note_id == note_id
      - Edges : everything with payload.source_id == note_id OR == one of chunk_ids
      - Notes : NOT deleted (the upsert overwrites payload/vector)

    NOTE: Qdrant's delete API does not report how many points were removed,
    so the returned counts are always 0 and only provide a stable shape.
    """
    notes_col, chunks_col, edges_col = collection_names(prefix)
    counts = {"chunks_deleted": 0, "edges_deleted": 0}

    # Delete chunks (filter must: note_id == X). Use the documented
    # FilterSelector wrapper rather than passing a raw Filter.
    f_chunks = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    client.delete(
        collection_name=chunks_col,
        points_selector=rest.FilterSelector(filter=f_chunks),
        wait=True,
    )

    # Delete edges: OR over the note id and all chunk ids.
    # should_conds is never empty (it always contains the note-id condition),
    # so no None-guard is needed here.
    should_conds = [rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id))]
    for cid in chunk_ids:
        should_conds.append(
            rest.FieldCondition(key="source_id", match=rest.MatchValue(value=cid))
        )
    f_edges = rest.Filter(should=should_conds)
    client.delete(
        collection_name=edges_col,
        points_selector=rest.FilterSelector(filter=f_edges),
        wait=True,
    )
    return counts


# -------------------------------------------------
# Main
# -------------------------------------------------

def main():
    """CLI entry point: parse args, index the vault, import each note."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Vault-Root")
    ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
    ap.add_argument(
        "--purge-before-upsert",
        action="store_true",
        help="Vor Upsert alte Chunks/Edges der aktuellen Note löschen (nur mit --apply wirksam).",
    )
    ap.add_argument("--note-id", help="Optional: nur diese Note verarbeiten")
    args = ap.parse_args()

    load_dotenv()
    cfg = QdrantConfig(
        url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.getenv("QDRANT_API_KEY", None),
        prefix=os.getenv("COLLECTION_PREFIX", "mindnet"),
        dim=int(os.getenv("VECTOR_DIM", "384")),
    )
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)

    vault_root = os.path.abspath(args.vault)
    files = iter_md(vault_root)
    if not files:
        print("Keine Markdown-Dateien gefunden.", file=sys.stderr)
        sys.exit(2)

    # 1) Note index over the whole vault (for robust link resolution).
    note_index = build_vault_index(vault_root)

    # Lazy import of an existing project function; hoisted out of the loop
    # so it runs once instead of once per note.
    from app.core.note_payload import make_note_payload

    processed = 0
    for abs_path in files:
        parsed = read_markdown(abs_path)
        fm = normalize_frontmatter(parsed.frontmatter or {})
        try:
            validate_required_frontmatter(fm)
        except Exception:
            # Skip incomplete notes (best effort, consistent with indexing).
            continue

        if args.note_id and fm.get("id") != args.note_id:
            continue

        processed += 1

        # --- Note payload ---
        note_pl = make_note_payload(parsed, vault_root=vault_root)
        validate_note_payload(note_pl)

        # --- Chunking & payloads ---
        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)

        # --- Embeddings ---
        texts = [c.get("text") or c.get("content") or "" for c in chunk_pls]
        vectors = embed_texts(texts)

        # --- Edge derivation (direct) ---
        edges = derive_wikilink_edges(note_pl, chunk_pls, note_index)

        # --- Decision label for logging ---
        decision = "apply" if args.apply else "dry-run"

        # --- Purge before upsert (only with --apply) ---
        if args.apply and args.purge_before_upsert:
            # Determine the (new) chunk ids -> used for edge purge by source_id.
            chunk_ids = []
            for i, ch in enumerate(chunk_pls, start=1):
                cid = ch.get("chunk_id") or ch.get("id") or f"{fm['id']}#{i}"
                ch["chunk_id"] = cid  # ensure the id is present on the payload
                chunk_ids.append(cid)
            purge_for_note(client, cfg.prefix, fm["id"], chunk_ids)

        # --- Upserts (only with --apply) ---
        if args.apply:
            # Note
            notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec=None, dim=cfg.dim)
            upsert_batch(client, notes_col, note_pts)
            # Chunks
            chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
            upsert_batch(client, chunks_col, chunk_pts)
            # Edges
            edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
            upsert_batch(client, edges_col, edge_pts)

        # Per-note logging (one JSON line per note).
        print(json.dumps({
            "note_id": fm["id"],
            "title": fm.get("title"),
            "chunks": len(chunk_pls),
            "edges": len(edges),
            "changed": True,  # hash/timestamp comparison could be added here
            "decision": decision,
            "path": note_pl["path"],
        }, ensure_ascii=False))

    print(f"Done. Processed notes: {processed}")


if __name__ == "__main__":
    main()