diff --git a/scripts/export_markdown.py b/scripts/export_markdown.py index 0e41d0d..4f872a3 100644 --- a/scripts/export_markdown.py +++ b/scripts/export_markdown.py @@ -1,227 +1,180 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Script: scripts/export_markdown.py — Qdrant → Markdown (Vault) -Version: 1.4.1 -Datum: 2025-09-10 +Modul: scripts/export_markdown.py +Version: 1.6.1 +Datum: 2025-11-07 -Funktion --------- -Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird -bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks -(Sortierung: seq → chunk_index → Nummer in chunk_id). Pfade werden **relativ** geschrieben. +Zweck +----- +Exportiert Notes aus Qdrant zurück in Markdown-Dateien (verlustarm): + • Pfade relativieren, Backslashes → Slashes + • Body aus 'fulltext' (falls vorhanden) oder Rekonstruktion via Chunks (seq/chunk_index) + • Optional: vorhandene Edges pro Note mit exportieren (--include-edges yaml|footer) -Optionen --------- ---out PATH Zielordner (erforderlich) ---prefix TEXT Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX) ---note-id ID Nur eine Note exportieren ---overwrite Existierende Dateien überschreiben ---include-edges MODE none|yaml|footer (Default: none) ---flatten-paths Alle Dateien flach schreiben; Originalpfad in FM: orig_path - -ENV +CLI --- -COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY - -Beispiele ---------- export COLLECTION_PREFIX="mindnet" python3 -m scripts.export_markdown --out ./_exportVault + python3 -m scripts.export_markdown --out ./_exportVault --note-id + python3 -m scripts.export_markdown --out ./_exportVault --overwrite + python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml + python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer - # Nur eine Note, Edges als YAML-Feld 'references' - python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha 
--include-edges yaml - - # Flach schreiben mit Überschreiben - python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite +Parameter +--------- +--out Zielwurzel (Ordner wird angelegt) +--prefix überschreibt ENV COLLECTION_PREFIX (Default: mindnet) +--note-id nur eine bestimmte Note exportieren +--overwrite vorhandene Dateien überschreiben +--include-edges none|yaml|footer (Default: none) """ from __future__ import annotations import argparse import os import json -from typing import Dict, List, Optional, Tuple, Any +from pathlib import Path +from typing import Dict, List, Tuple, Optional -import yaml -from qdrant_client.http import models as rest +from app.core.qdrant import ( + QdrantConfig, + get_client, + fetch_all_notes, + fetch_chunks_for_note, + fetch_edges_for_note, # <— jetzt angebunden + ensure_collections, +) -from app.core.qdrant import QdrantConfig, get_client -from app.core.qdrant import ensure_collections # safety +def _normalize_rel_path(p: str) -> str: + p = (p or "").replace("\\", "/") + while p.startswith("/"): + p = p[1:] + return p -# --------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------- +def _ensure_parent(p: Path): + p.parent.mkdir(parents=True, exist_ok=True) -def collections(prefix: str) -> Tuple[str, str, str]: - return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" +def _yaml_frontmatter(d: Dict) -> str: + import io + def _ser(obj): + if isinstance(obj, str): + if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]): + return '"' + obj.replace('"', '\\"') + '"' + return obj + if isinstance(obj, bool): + return "true" if obj else "false" + if obj is None: + return "null" + if isinstance(obj, (int, float)): + return str(obj) + if isinstance(obj, list): + return "[" + ", ".join(_ser(x) for x in obj) + "]" + if isinstance(obj, dict): + inner = [] 
+ for k in sorted(obj.keys()): + inner.append(f"{k}: {_ser(obj[k])}") + return "{ " + ", ".join(inner) + " }" + return '"' + str(obj).replace('"', '\\"') + '"' -def _norm_rel_path(path: str) -> str: - p = (path or "").replace("\\", "/").lstrip("/") - return p if p else "" + buf = io.StringIO() + buf.write("---\n") + for k in sorted(d.keys()): + buf.write(f"{k}: {_ser(d[k])}\n") + buf.write("---\n") + return buf.getvalue() -def _ensure_dir(path: str) -> None: - d = os.path.dirname(path) - if d and not os.path.exists(d): - os.makedirs(d, exist_ok=True) - -def _yaml_dump(data: Dict[str, Any]) -> str: - return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip() - -def _frontmatter_block(fm: Dict[str, Any]) -> str: - y = _yaml_dump(fm) - return f"---\n{y}\n---\n" - -def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]: - out = [] - nextp = None - while True: - pts, nextp = client.scroll( - collection_name=collection, - with_payload=True, - with_vectors=False, - limit=256, - scroll_filter=flt, - offset=nextp, +def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str: + if not chunks: + return "" + def _num_from_chunk_id(cid: str) -> int: + try: + if "#" in cid: + return int(cid.split("#", 1)[1]) + return 0 + except Exception: + return 0 + chunks_sorted = sorted( + chunks, + key=lambda c: ( + int(c.get("seq", c.get("chunk_index", 0))), + int(c.get("chunk_index", 0)), + _num_from_chunk_id(str(c.get("chunk_id", ""))), ) - if not pts: - break - out.extend(pts) - if nextp is None: - break - return out + ) + body = "".join(c.get("text") or "" for c in chunks_sorted) + return body -def _reconstruct_body_from_chunks(chunks: List[Any]) -> str: - def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]: - s = pl.get("seq") - ci = pl.get("chunk_index") - cid = pl.get("chunk_id") or "" - n = 0 - if isinstance(cid, str) and "#" in cid: - try: - n = int(cid.rsplit("#", 1)[-1]) - except Exception: - n = 0 - return (int(s) if 
def parse_args() -> argparse.Namespace:
    """Parse the CLI arguments for the export script."""
    p = argparse.ArgumentParser(prog="export_markdown.py", description="Exportiert Notes aus Qdrant in Markdown.")
    p.add_argument("--out", required=True, help="Zielordner")
    p.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX")
    p.add_argument("--note-id", default="", help="nur eine Note exportieren")
    p.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben")
    p.add_argument("--include-edges", default="none", choices=["none", "yaml", "footer"], help="Edges im Export anzeigen")
    return p.parse_args()


def main() -> None:
    """Export notes from Qdrant into Markdown files under --out.

    For each note: build a YAML frontmatter from selected payload fields,
    take the body from 'fulltext' or reconstruct it from chunks, optionally
    attach edges (frontmatter key '_edges' or a JSON footer), and write the
    file unless it exists and --overwrite was not given. One JSON status
    line is printed per note, plus a final summary.
    """
    args = parse_args()
    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    # CLI --prefix wins over ENV COLLECTION_PREFIX; last fallback "mindnet".
    prefix = args.prefix.strip() or os.environ.get("COLLECTION_PREFIX", "").strip() or "mindnet"
    cfg = QdrantConfig.from_env(prefix=prefix)
    client = get_client(cfg)
    ensure_collections(client, cfg)

    if args.note_id:
        notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
    else:
        notes = fetch_all_notes(client, cfg)

    exported = 0
    for n in notes:
        note_id = n.get("note_id") or n.get("id")
        if not note_id:
            # Payload without an identifier cannot be exported meaningfully.
            continue

        rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md"))
        dst = out_root.joinpath(rel)
        # Safety: payload paths are data — never let '..' escape the export root.
        if ".." in Path(rel).parts:
            dst = out_root / f"{note_id}.md"

        # Prefer the stored fulltext; fall back to chunk reconstruction.
        body = str(n.get("fulltext") or "")
        if not body:
            chunks = fetch_chunks_for_note(client, cfg, note_id)
            body = _reconstruct_body_from_chunks(chunks)

        # Frontmatter: whitelisted content fields plus integrity hashes.
        fm = {}
        for k in ("id", "title", "type", "status", "created", "tags", "priority", "due", "effort_min", "values", "goals", "embedding_exclude"):
            if k in n:
                fm[k] = n[k]
        for k in ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter"):
            if k in n:
                fm[k] = n[k]

        edges_block = ""
        if args.include_edges in ("yaml", "footer"):
            try:
                edges = fetch_edges_for_note(client, cfg, note_id) or []
                if args.include_edges == "yaml":
                    fm["_edges"] = edges
                else:
                    edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
            except Exception as exc:
                # Best effort by design: missing edges must not abort the
                # export — but report the failure instead of hiding it.
                print(json.dumps({"note_id": note_id, "edges_error": str(exc)}, ensure_ascii=False))

        if dst.exists() and not args.overwrite:
            decision = "skip"
        else:
            _ensure_parent(dst)
            content = _yaml_frontmatter(fm) + (body or "") + edges_block
            dst.write_text(content, encoding="utf-8")
            decision = "write"

        print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False))
        if decision == "write":
            exported += 1

    print(f"Done. Exported notes: {exported}")


if __name__ == "__main__":
    main()