scripts/export_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 08:00:15 +01:00
parent c05dbd4b3b
commit b5958a9f63

View File

@ -1,227 +1,180 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/export_markdown.py Qdrant Markdown (Vault)
Version: 1.4.1
Datum: 2025-09-10
Modul: scripts/export_markdown.py
Version: 1.6.1
Datum: 2025-11-07
Funktion
--------
Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird
bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks
(Sortierung: seq → chunk_index → Nummer in chunk_id). Pfade werden **relativ** geschrieben.
Zweck
-----
Exportiert Notes aus Qdrant zurück in Markdown-Dateien (verlustarm):
Pfade relativieren, Backslashes → Slashes
Body aus 'fulltext' (falls vorhanden) oder Rekonstruktion via Chunks (seq/chunk_index)
Optional: vorhandene Edges pro Note mit exportieren (--include-edges yaml|footer)
Optionen
--------
--out PATH Zielordner (erforderlich)
--prefix TEXT Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX)
--note-id ID Nur eine Note exportieren
--overwrite Existierende Dateien überschreiben
--include-edges MODE none|yaml|footer (Default: none)
--flatten-paths Alle Dateien flach schreiben; Originalpfad in FM: orig_path
ENV / CLI
---------
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
Beispiele
---------
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault
python3 -m scripts.export_markdown --out ./_exportVault --note-id <ID>
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml
python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer
# Nur eine Note, Edges als YAML-Feld 'references'
python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml
# Flach schreiben mit Überschreiben
python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite
Parameter
---------
--out Zielwurzel (Ordner wird angelegt)
--prefix überschreibt ENV COLLECTION_PREFIX (Default: mindnet)
--note-id nur eine bestimmte Note exportieren
--overwrite vorhandene Dateien überschreiben
--include-edges none|yaml|footer (Default: none)
"""
from __future__ import annotations
import argparse
import os
import json
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import yaml
from qdrant_client.http import models as rest
from app.core.qdrant import (
QdrantConfig,
get_client,
fetch_all_notes,
fetch_chunks_for_note,
fetch_edges_for_note,  # now wired up
ensure_collections,
)
from app.core.qdrant import QdrantConfig, get_client
from app.core.qdrant import ensure_collections # safety
def _normalize_rel_path(p: str) -> str:
p = (p or "").replace("\\", "/")
while p.startswith("/"):
p = p[1:]
return p
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
def _ensure_parent(p: Path):
p.parent.mkdir(parents=True, exist_ok=True)
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*.

    NOTE: this function name shadows the stdlib ``collections`` module
    inside this file — keep that in mind before importing it here.
    """
    base = f"{prefix}_"
    return (base + "notes", base + "chunks", base + "edges")
def _yaml_frontmatter(d: Dict) -> str:
import io
def _ser(obj):
if isinstance(obj, str):
if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]):
return '"' + obj.replace('"', '\\"') + '"'
return obj
if isinstance(obj, bool):
return "true" if obj else "false"
if obj is None:
return "null"
if isinstance(obj, (int, float)):
return str(obj)
if isinstance(obj, list):
return "[" + ", ".join(_ser(x) for x in obj) + "]"
if isinstance(obj, dict):
inner = []
for k in sorted(obj.keys()):
inner.append(f"{k}: {_ser(obj[k])}")
return "{ " + ", ".join(inner) + " }"
return '"' + str(obj).replace('"', '\\"') + '"'
def _norm_rel_path(path: str) -> str:
p = (path or "").replace("\\", "/").lstrip("/")
return p if p else ""
buf = io.StringIO()
buf.write("---\n")
for k in sorted(d.keys()):
buf.write(f"{k}: {_ser(d[k])}\n")
buf.write("---\n")
return buf.getvalue()
def _ensure_dir(path: str) -> None:
d = os.path.dirname(path)
if d and not os.path.exists(d):
os.makedirs(d, exist_ok=True)
def _yaml_dump(data: Dict[str, Any]) -> str:
    """Serialize *data* as YAML (unicode preserved, insertion order kept), trimmed."""
    rendered = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
    return rendered.strip()
def _frontmatter_block(fm: Dict[str, Any]) -> str:
    """Wrap *fm*, YAML-serialized via ``_yaml_dump``, in '---' delimiter lines."""
    return "---\n" + _yaml_dump(fm) + "\n---\n"
def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
out = []
nextp = None
while True:
pts, nextp = client.scroll(
collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt,
offset=nextp,
def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str:
if not chunks:
return ""
def _num_from_chunk_id(cid: str) -> int:
try:
if "#" in cid:
return int(cid.split("#", 1)[1])
return 0
except Exception:
return 0
chunks_sorted = sorted(
chunks,
key=lambda c: (
int(c.get("seq", c.get("chunk_index", 0))),
int(c.get("chunk_index", 0)),
_num_from_chunk_id(str(c.get("chunk_id", ""))),
)
if not pts:
break
out.extend(pts)
if nextp is None:
break
return out
)
body = "".join(c.get("text") or "" for c in chunks_sorted)
return body
def _reconstruct_body_from_chunks(chunks: List[Any]) -> str:
def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]:
s = pl.get("seq")
ci = pl.get("chunk_index")
cid = pl.get("chunk_id") or ""
n = 0
if isinstance(cid, str) and "#" in cid:
try:
n = int(cid.rsplit("#", 1)[-1])
except Exception:
n = 0
return (int(s) if isinstance(s, int) else 0,
int(ci) if isinstance(ci, int) else 0,
n)
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
texts: List[str] = []
for p in chunks_sorted:
pl = p.payload or {}
t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
if isinstance(t, str) and t:
texts.append(t)
return "\n".join(texts).strip()
def parse_args() -> argparse.Namespace:
    """Build and evaluate the CLI for the Markdown export."""
    parser = argparse.ArgumentParser(prog="export_markdown.py", description="Exportiert Notes aus Qdrant in Markdown.")
    parser.add_argument("--out", required=True, help="Zielordner")
    parser.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX")
    parser.add_argument("--note-id", default="", help="nur eine Note exportieren")
    parser.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben")
    parser.add_argument("--include-edges", default="none", choices=["none", "yaml", "footer"], help="Edges im Export anzeigen")
    return parser.parse_args()
def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]:
refs = []
for p in edges:
pl = p.payload or {}
if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
refs.append(pl["target_id"])
# de-dupe, preserve order
seen = set()
out = []
for r in refs:
if r not in seen:
seen.add(r)
out.append(r)
return out
def main() -> None:
    """Export notes from Qdrant into Markdown files under ``--out``.

    The body comes from the note's ``fulltext`` payload when present,
    otherwise it is reconstructed from the note's chunks. Edges can
    optionally be embedded (``--include-edges yaml|footer``). One JSON
    status line is printed per note (write/skip), plus a summary.

    NOTE(review): the original region contained two interleaved versions
    of ``main`` from a corrupted diff; this reconstruction follows the
    newer, ``fetch_all_notes``-based variant.
    """
    args = parse_args()

    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    cfg = QdrantConfig.from_env()
    if args.prefix:
        # CLI prefix wins over ENV COLLECTION_PREFIX.
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
    # NOTE(review): both ensure_collections(client, prefix, dim) and
    # ensure_collections(client, cfg) appear in the corrupted source —
    # confirm the current signature in app.core.qdrant.
    ensure_collections(client, cfg.prefix, cfg.dim)

    # Either a single note or the whole collection.
    if args.note_id:
        notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
    else:
        notes = fetch_all_notes(client, cfg)

    exported = 0
    for n in notes:
        note_id = n.get("note_id") or n.get("id")
        if not note_id:
            # Without an id we can neither name the file nor fetch chunks.
            continue

        # Relative target path; fall back to "<note_id>.md".
        rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md"))
        dst = out_root.joinpath(rel)

        # Prefer the stored fulltext; otherwise rebuild from chunks.
        # NOTE(review): assumes fetch_chunks_for_note returns plain payload
        # dicts — confirm which _reconstruct_body_from_chunks variant
        # (dict-based vs. point-based) is active in this file.
        body = str(n.get("fulltext") or "")
        if not body:
            chunks = fetch_chunks_for_note(client, cfg, note_id)
            body = _reconstruct_body_from_chunks(chunks)

        # Frontmatter: core fields first, then content hashes.
        fm: Dict[str, Any] = {}
        for k in ("id", "title", "type", "status", "created", "tags", "priority",
                  "due", "effort_min", "values", "goals", "embedding_exclude"):
            if k in n:
                fm[k] = n[k]
        for k in ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter"):
            if k in n:
                fm[k] = n[k]

        # Optional edge export — best effort, failures are ignored.
        edges_block = ""
        if args.include_edges in ("yaml", "footer"):
            try:
                edges = fetch_edges_for_note(client, cfg, note_id) or []
                if args.include_edges == "yaml":
                    fm["_edges"] = edges
                else:
                    edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
            except Exception:
                pass

        if dst.exists() and not args.overwrite:
            decision = "skip"
        else:
            _ensure_parent(dst)
            content = _yaml_frontmatter(fm) + (body or "") + edges_block
            dst.write_text(content, encoding="utf-8")
            decision = "write"

        print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False))
        if decision == "write":
            exported += 1

    print(f"Done. Exported notes: {exported}")


if __name__ == "__main__":
    main()