#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ FILE: scripts/export_markdown.py VERSION: 2.1.0 (2025-12-15) STATUS: Active COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b) Zweck: ------- Exportiert Notes (Frontmatter + Body) aus Qdrant zurück in Markdown-Dateien. Nützlich für Backup, Migration oder Notfall-Wiederherstellung. Funktionsweise: --------------- 1. Liest alle Notes aus Qdrant (Collection: {prefix}_notes) 2. Für jede Note: - Rekonstruiert Frontmatter aus Payload-Feldern - Rekonstruiert Body: * Primär: aus `fulltext` Feld (falls vorhanden) * Fallback: aus Chunks (sortiert nach seq → chunk_index → chunk_id) - Optional: Fügt Edges als Links hinzu (yaml/footer) 3. Schreibt Markdown-Dateien mit relativen Pfaden Ergebnis-Interpretation: ------------------------ - Ausgabe: JSON pro Datei mit note_id, path, decision (write/skip-exists) - Abschluss: "Done. Exported notes: X" - Exit-Code 0: Erfolgreich Verwendung: ----------- - Backup vor größeren Änderungen - Migration zwischen Instanzen - Notfall-Wiederherstellung bei Vault-Verlust - Analyse der in Qdrant gespeicherten Daten Hinweise: --------- - Pfade werden relativ geschrieben (wie im Original-Vault) - --flatten-paths: Alle Dateien in einen Ordner (orig_path in Frontmatter) - Body-Rekonstruktion aus Chunks kann bei sehr langen Dokumenten unvollständig sein - Edge-Export optional (none/yaml/footer) Aufruf: ------- python3 -m scripts.export_markdown --out ./_exportVault python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite Parameter: ---------- --out PATH Zielordner für exportierte Markdown-Dateien (erforderlich) --prefix TEXT Collection-Präfix (überschreibt ENV COLLECTION_PREFIX) --note-id ID Nur eine bestimmte Note-ID exportieren (optional) --overwrite Existierende Dateien überschreiben (sonst skip) --include-edges MODE none|yaml|footer (Default: none) - none: Keine Edges - yaml: Als YAML-Feld 'references' im Frontmatter - footer: Als Markdown-Links am Ende --flatten-paths Alle Dateien flach schreiben (orig_path in Frontmatter) Umgebungsvariablen: ------------------- COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY Änderungen: ----------- v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung - Aktualisiert: Import-Pfade für neue Struktur v1.4.1 (2025-09-10): Initial Release """ from __future__ import annotations import argparse import os import json from typing import Dict, List, Optional, Tuple, Any import yaml from qdrant_client.http import models as rest from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections # --------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------- def collections(prefix: str) -> Tuple[str, str, str]: return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" def _norm_rel_path(path: str) -> str: p = (path or "").replace("\\", "/").lstrip("/") return p if p else "" def _ensure_dir(path: str) -> None: d = os.path.dirname(path) if d and not os.path.exists(d): os.makedirs(d, exist_ok=True) def _yaml_dump(data: Dict[str, Any]) -> str: return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip() def _frontmatter_block(fm: Dict[str, Any]) -> str: y = _yaml_dump(fm) return f"---\n{y}\n---\n" def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]: out = [] nextp = None while True: pts, nextp = client.scroll( collection_name=collection, with_payload=True, with_vectors=False, limit=256, scroll_filter=flt, offset=nextp, ) if not pts: break out.extend(pts) if nextp is None: break return out def _reconstruct_body_from_chunks(chunks: List[Any]) -> str: def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]: s = pl.get("seq") ci = pl.get("chunk_index") cid = pl.get("chunk_id") or "" n = 0 if isinstance(cid, str) and "#" in cid: try: n = int(cid.rsplit("#", 1)[-1]) except Exception: n = 0 return (int(s) if isinstance(s, int) else 0, int(ci) if isinstance(ci, int) else 0, n) chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {})) texts: List[str] = [] for p in chunks_sorted: pl = p.payload or {} t = pl.get("text") or pl.get("content") or pl.get("raw") or "" if isinstance(t, str) and t: texts.append(t) return "\n".join(texts).strip() def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]: refs = [] for p in edges: pl = p.payload or {} if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str): refs.append(pl["target_id"]) # de-dupe, preserve order seen = set() out = [] for r in refs: if r not in seen: seen.add(r) out.append(r) return out # --------------------------------------------------------------------- # Main # --------------------------------------------------------------------- def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--out", required=True, help="Zielordner für exportierte Markdown-Dateien") ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)") ap.add_argument("--note-id", help="Nur eine bestimmte Note-ID exportieren") ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben") ap.add_argument("--include-edges", choices=["none", "yaml", "footer"], default="none", help="Forward-Links mit exportieren (aus Edges oder Note-Payload)") ap.add_argument("--flatten-paths", action="store_true", help="Alle Dateien flach schreiben (orig_path in Frontmatter)") args = ap.parse_args() cfg = QdrantConfig.from_env() if args.prefix: cfg.prefix = args.prefix.strip() client = get_client(cfg) ensure_collections(client, cfg.prefix, cfg.dim) out_root = os.path.abspath(args.out) os.makedirs(out_root, exist_ok=True) notes_col, chunks_col, edges_col = collections(cfg.prefix) # Filter nach note-id (optional) flt = None if args.note_id: flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))]) notes = _scroll_all(client, notes_col, flt) total = 0 for n in notes: pl = n.payload or {} nid = pl.get("note_id") rel_path = _norm_rel_path(pl.get("path") or "") if args.flatten_paths or not rel_path: fname = f"{(nid or 'note')}.md" out_path = os.path.join(out_root, fname) else: out_path = os.path.join(out_root, rel_path) out_path = out_path.replace("\\", "/") _ensure_dir(out_path) # Frontmatter aufbauen (nur sinnvolle Felder) fm_fields = ["id","title","type","status","created","updated","tags","area","project","source","lang","slug","aliases"] fm: Dict[str, Any] = {} fm["id"] = nid for k in fm_fields: if k == "id": continue if k in pl and pl[k] is not None: fm[k] = pl[k] if args.flatten_paths and rel_path: fm["orig_path"] = rel_path # Body ermitteln (fulltext oder Chunks) body = (pl.get("fulltext") or "").strip() if not body: flt_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))]) chunks = _scroll_all(client, chunks_col, flt_chunks) body = _reconstruct_body_from_chunks(chunks) # Edges (optional) refs: List[str] = [] if args.include_edges != "none": # aus Note-Payload, falls vorhanden if isinstance(pl.get("references"), list) and pl["references"]: refs = [r for r in pl["references"] if isinstance(r, str)] else: flt_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))]) edges = _scroll_all(client, edges_col, flt_edges) refs = _collect_forward_refs_from_edges(edges) if args.include_edges == "yaml" and refs: fm["references"] = refs # Datei schreiben if (not args.overwrite) and os.path.exists(out_path): print(json.dumps({"note_id": nid, "path": out_path, "decision": "skip-exists"})) continue content = _frontmatter_block(fm) + (body + "\n" if body else "") if args.include_edges == "footer" and refs: content += "\n---\nLinks:\n" + "\n".join(f"- [[{r}]]" for r in refs) + "\n" with open(out_path, "w", encoding="utf-8") as f: f.write(content) print(json.dumps({"note_id": nid, "path": out_path, "decision": "write"})) total += 1 print(f"Done. Exported notes: {total}") if __name__ == "__main__": main()