mindnet/scripts/export_markdown.py
Lars b5958a9f63
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
scripts/export_markdown.py aktualisiert
2025-11-08 08:00:15 +01:00

181 lines
6.1 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: scripts/export_markdown.py
Version: 1.6.1
Date: 2025-11-07

Purpose
-------
Exports notes from Qdrant back into Markdown files (with minimal loss):
  • Paths are made relative, backslashes → slashes
  • Body is taken from 'fulltext' (if present) or reconstructed from chunks (seq/chunk_index)
  • Optional: also export the existing edges of each note (--include-edges yaml|footer)

CLI
---
  export COLLECTION_PREFIX="mindnet"
  python3 -m scripts.export_markdown --out ./_exportVault
  python3 -m scripts.export_markdown --out ./_exportVault --note-id <ID>
  python3 -m scripts.export_markdown --out ./_exportVault --overwrite
  python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml
  python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer

Parameters
----------
  --out            target root (the directory is created)
  --prefix         overrides ENV COLLECTION_PREFIX (default: mindnet)
  --note-id        export only one specific note
  --overwrite      overwrite existing files
  --include-edges  none|yaml|footer (default: none)
"""
from __future__ import annotations
import argparse
import os
import json
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from app.core.qdrant import (
QdrantConfig,
get_client,
fetch_all_notes,
fetch_chunks_for_note,
fetch_edges_for_note, # <— jetzt angebunden
ensure_collections,
)
def _normalize_rel_path(p: str) -> str:
    """Turn a stored note path into a slash-separated relative path.

    Backslashes are converted to forward slashes and every leading slash is
    removed. A falsy input (``None`` or ``""``) yields an empty string.
    """
    if not p:
        return ""
    with_slashes = p.replace("\\", "/")
    return with_slashes.lstrip("/")
def _ensure_parent(p: Path):
    """Create the directory that contains ``p``, including missing ancestors.

    An already existing directory is not an error.
    """
    parent_dir = p.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
def _yaml_frontmatter(d: Dict) -> str:
    """Render ``d`` as a YAML frontmatter block delimited by ``---`` lines.

    Keys are written in sorted order, one ``key: value`` line each. Values are
    serialized in flow style: lists as ``[a, b]``, dicts as ``{ k: v }``,
    booleans as ``true``/``false`` and ``None`` as ``null``.

    Strings are written plain when that is unambiguous and double-quoted
    otherwise. A string is quoted when it

    * contains one of the YAML indicator characters checked below,
    * contains a control character (newline, tab, ...),
    * is empty or has leading/trailing whitespace, or
    * would be read back as a boolean, null or number (``true``, ``null``,
      ``42``, ...).

    Quoted strings are escaped with ``json.dumps``, so backslashes, quotes
    and control characters are emitted as escape sequences.

    Args:
        d: Mapping of frontmatter keys to values.

    Returns:
        The frontmatter text, ending with a newline after the closing ``---``.
    """
    # Characters that force quoting of a string scalar.
    special_chars = frozenset(":-{}[],#&*!|>'\"%@`")
    # Plain scalars that YAML parsers resolve to null/bool instead of str.
    reserved_words = frozenset(
        {"", "~", "null", "true", "false", "yes", "no", "on", "off"}
    )

    def _quote(text: str) -> str:
        # JSON string syntax escapes '"', '\\' and control characters.
        return json.dumps(text, ensure_ascii=False)

    def _needs_quotes(text: str) -> bool:
        if text.lower() in reserved_words or text != text.strip():
            return True
        if any(ch in special_chars or ord(ch) < 0x20 for ch in text):
            return True
        # Number-like strings must be quoted to stay strings on re-import.
        try:
            float(text)
        except ValueError:
            return False
        return True

    def _ser(obj) -> str:
        if isinstance(obj, str):
            return _quote(obj) if _needs_quotes(obj) else obj
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(obj, bool):
            return "true" if obj else "false"
        if obj is None:
            return "null"
        if isinstance(obj, (int, float)):
            return str(obj)
        if isinstance(obj, list):
            return "[" + ", ".join(_ser(x) for x in obj) + "]"
        if isinstance(obj, dict):
            inner = [f"{k}: {_ser(obj[k])}" for k in sorted(obj.keys())]
            return "{ " + ", ".join(inner) + " }"
        # Unknown types: fall back to their string form, always quoted.
        return _quote(str(obj))

    lines = ["---"]
    lines.extend(f"{k}: {_ser(d[k])}" for k in sorted(d.keys()))
    lines.append("---")
    return "\n".join(lines) + "\n"
def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str:
    """Rebuild a note body by concatenating the ``text`` of its chunks.

    Chunks are ordered by ``seq`` (falling back to ``chunk_index``), then by
    ``chunk_index``, then by the integer after the first ``#`` in
    ``chunk_id``. Missing or unparsable ``chunk_id`` suffixes count as 0, and
    missing or empty ``text`` values contribute nothing.

    Returns an empty string when ``chunks`` is empty.
    """
    if not chunks:
        return ""

    def _suffix_number(chunk_id: str) -> int:
        # Numeric part after the first '#'; 0 when absent or not an integer.
        _, separator, tail = chunk_id.partition("#")
        if not separator:
            return 0
        try:
            return int(tail)
        except Exception:
            return 0

    def _order_key(chunk: Dict) -> Tuple[int, int, int]:
        index = chunk.get("chunk_index", 0)
        primary = int(chunk.get("seq", index))
        secondary = int(index)
        tertiary = _suffix_number(str(chunk.get("chunk_id", "")))
        return (primary, secondary, tertiary)

    ordered = list(chunks)
    ordered.sort(key=_order_key)
    pieces = [chunk.get("text") or "" for chunk in ordered]
    return "".join(pieces)
def parse_args() -> argparse.Namespace:
    """Parse the command line of the export script.

    Recognized options: ``--out`` (required), ``--prefix``, ``--note-id``,
    ``--overwrite`` (flag) and ``--include-edges`` (``none``, ``yaml`` or
    ``footer``; default ``none``).
    """
    parser = argparse.ArgumentParser(
        prog="export_markdown.py",
        description="Exportiert Notes aus Qdrant in Markdown.",
    )
    # (flag, keyword arguments for add_argument), registered in this order.
    option_table = (
        ("--out", {"required": True, "help": "Zielordner"}),
        ("--prefix", {"default": "", "help": "Collections-Prefix; überschreibt ENV COLLECTION_PREFIX"}),
        ("--note-id", {"default": "", "help": "nur eine Note exportieren"}),
        ("--overwrite", {"action": "store_true", "help": "vorhandene Dateien überschreiben"}),
        (
            "--include-edges",
            {"default": "none", "choices": ["none", "yaml", "footer"], "help": "Edges im Export anzeigen"},
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main():
    """Export notes from Qdrant into Markdown files below the ``--out`` directory.

    For every note returned by ``fetch_all_notes`` the target path is derived
    from the note's ``path`` value (fallback: ``<note_id>.md``). The body is
    taken from ``fulltext`` or, if that is empty, rebuilt from the note's
    chunks. A YAML frontmatter block is written in front of the body. With
    ``--include-edges`` the note's edges are added either to the frontmatter
    (``yaml``) or as a JSON footer (``footer``).

    One JSON line per note (``note_id``, ``path``, ``decision``) is printed to
    stdout, followed by a summary line. Warnings go to stderr.

    Notes are skipped (``decision: "skip"``) when the target file exists and
    ``--overwrite`` is not set, or when the target would lie outside the
    output directory.
    """
    import sys  # local import: only needed for warnings on stderr

    args = parse_args()
    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    # Precedence: --prefix, then ENV COLLECTION_PREFIX, then "mindnet".
    prefix = args.prefix.strip() or os.environ.get("COLLECTION_PREFIX", "").strip() or "mindnet"
    cfg = QdrantConfig.from_env(prefix=prefix)
    client = get_client(cfg)
    ensure_collections(client, cfg)

    if args.note_id:
        notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
    else:
        notes = fetch_all_notes(client, cfg)

    # Note fields copied verbatim into the frontmatter when present.
    frontmatter_keys = (
        "id", "title", "type", "status", "created", "tags", "priority", "due",
        "effort_min", "values", "goals", "embedding_exclude",
        "hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter",
    )

    exported = 0
    for n in notes:
        note_id = n.get("note_id") or n.get("id")
        if not note_id:
            continue

        # A path that normalizes to "" (e.g. "/") would target out_root itself.
        rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md")) or f"{note_id}.md"
        dst = out_root.joinpath(rel)

        # Refuse targets that escape the output directory ("..", drive paths).
        try:
            dst.resolve().relative_to(out_root)
        except ValueError:
            print(
                f"Warning: note {note_id}: path {rel!r} is outside the output directory; skipped",
                file=sys.stderr,
            )
            print(json.dumps({"note_id": note_id, "path": str(dst), "decision": "skip"}, ensure_ascii=False))
            continue

        # Decide before fetching chunks/edges so skipped notes cost no requests.
        if dst.exists() and not args.overwrite:
            print(json.dumps({"note_id": note_id, "path": str(dst), "decision": "skip"}, ensure_ascii=False))
            continue

        body = str(n.get("fulltext") or "")
        if not body:
            chunks = fetch_chunks_for_note(client, cfg, note_id)
            body = _reconstruct_body_from_chunks(chunks)

        fm = {k: n[k] for k in frontmatter_keys if k in n}

        edges_block = ""
        if args.include_edges in ("yaml", "footer"):
            # Best effort: a failing edge export must not prevent the note
            # itself from being written, but it must not go unnoticed either.
            try:
                edges = fetch_edges_for_note(client, cfg, note_id) or []
                if args.include_edges == "yaml":
                    fm["_edges"] = edges
                else:
                    edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
            except Exception as exc:
                print(
                    f"Warning: note {note_id}: edges could not be exported: {exc!r}",
                    file=sys.stderr,
                )

        _ensure_parent(dst)
        content = _yaml_frontmatter(fm) + (body or "") + edges_block
        dst.write_text(content, encoding="utf-8")
        print(json.dumps({"note_id": note_id, "path": str(dst), "decision": "write"}, ensure_ascii=False))
        exported += 1

    print(f"Done. Exported notes: {exported}")
# Run the export only when this file is executed as a script (or via
# ``python -m``); importing the module has no side effects.
if __name__ == "__main__":
    main()