scripts/export_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 08:00:15 +01:00
parent c05dbd4b3b
commit b5958a9f63

View File

@ -1,227 +1,180 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/export_markdown.py Qdrant Markdown (Vault)
Version: 1.4.1
Datum: 2025-09-10
Modul: scripts/export_markdown.py
Version: 1.6.1
Datum: 2025-11-07
Funktion
--------
Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird
bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks
(Sortierung: seq → chunk_index → Nummer in chunk_id). Pfade werden **relativ** geschrieben.
Zweck
-----
Exportiert Notes aus Qdrant zurück in Markdown-Dateien (verlustarm):
Pfade relativieren, Backslashes → Slashes
Body aus 'fulltext' (falls vorhanden) oder Rekonstruktion via Chunks (seq/chunk_index)
Optional: vorhandene Edges pro Note mit exportieren (--include-edges yaml|footer)
Optionen
--------
--out PATH Zielordner (erforderlich)
--prefix TEXT Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX)
--note-id ID Nur eine Note exportieren
--overwrite Existierende Dateien überschreiben
--include-edges MODE none|yaml|footer (Default: none)
--flatten-paths Alle Dateien flach schreiben; Originalpfad in FM: orig_path
ENV / CLI
---------
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
Beispiele
---------
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault
python3 -m scripts.export_markdown --out ./_exportVault --note-id <ID>
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml
python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer
# Nur eine Note, Edges als YAML-Feld 'references'
python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml
# Flach schreiben mit Überschreiben
python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite
Parameter
---------
--out Zielwurzel (Ordner wird angelegt)
--prefix überschreibt ENV COLLECTION_PREFIX (Default: mindnet)
--note-id nur eine bestimmte Note exportieren
--overwrite vorhandene Dateien überschreiben
--include-edges none|yaml|footer (Default: none)
"""
from __future__ import annotations
import argparse
import os
import json
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import yaml
from qdrant_client.http import models as rest
from app.core.qdrant import (
QdrantConfig,
get_client,
fetch_all_notes,
fetch_chunks_for_note,
fetch_edges_for_note,  # now wired up
ensure_collections,
)
from app.core.qdrant import QdrantConfig, get_client
from app.core.qdrant import ensure_collections # safety
def _normalize_rel_path(p: str) -> str:
p = (p or "").replace("\\", "/")
while p.startswith("/"):
p = p[1:]
return p
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
def _ensure_parent(p: Path):
p.parent.mkdir(parents=True, exist_ok=True)
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*.

    NOTE: this function name shadows the stdlib ``collections`` module
    inside this file — keep that in mind before importing it here.
    """
    base = f"{prefix}_"
    return (base + "notes", base + "chunks", base + "edges")
def _yaml_frontmatter(d: Dict) -> str:
import io
def _ser(obj):
if isinstance(obj, str):
if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]):
return '"' + obj.replace('"', '\\"') + '"'
return obj
if isinstance(obj, bool):
return "true" if obj else "false"
if obj is None:
return "null"
if isinstance(obj, (int, float)):
return str(obj)
if isinstance(obj, list):
return "[" + ", ".join(_ser(x) for x in obj) + "]"
if isinstance(obj, dict):
inner = []
for k in sorted(obj.keys()):
inner.append(f"{k}: {_ser(obj[k])}")
return "{ " + ", ".join(inner) + " }"
return '"' + str(obj).replace('"', '\\"') + '"'
def _norm_rel_path(path: str) -> str:
p = (path or "").replace("\\", "/").lstrip("/")
return p if p else ""
buf = io.StringIO()
buf.write("---\n")
for k in sorted(d.keys()):
buf.write(f"{k}: {_ser(d[k])}\n")
buf.write("---\n")
return buf.getvalue()
def _ensure_dir(path: str) -> None:
d = os.path.dirname(path)
if d and not os.path.exists(d):
os.makedirs(d, exist_ok=True)
def _yaml_dump(data: Dict[str, Any]) -> str:
    """Serialize *data* as YAML (unicode preserved, insertion order kept), trimmed."""
    rendered = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
    return rendered.strip()
def _frontmatter_block(fm: Dict[str, Any]) -> str:
    """Wrap *fm*, YAML-serialized via ``_yaml_dump``, in '---' delimiter lines."""
    return "---\n" + _yaml_dump(fm) + "\n---\n"
def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
out = []
nextp = None
while True:
pts, nextp = client.scroll(
collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt,
offset=nextp,
def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str:
if not chunks:
return ""
def _num_from_chunk_id(cid: str) -> int:
try:
if "#" in cid:
return int(cid.split("#", 1)[1])
return 0
except Exception:
return 0
chunks_sorted = sorted(
chunks,
key=lambda c: (
int(c.get("seq", c.get("chunk_index", 0))),
int(c.get("chunk_index", 0)),
_num_from_chunk_id(str(c.get("chunk_id", ""))),
)
if not pts:
break
out.extend(pts)
if nextp is None:
break
return out
)
body = "".join(c.get("text") or "" for c in chunks_sorted)
return body
def _reconstruct_body_from_chunks(chunks: List[Any]) -> str:
def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]:
s = pl.get("seq")
ci = pl.get("chunk_index")
cid = pl.get("chunk_id") or ""
n = 0
if isinstance(cid, str) and "#" in cid:
try:
n = int(cid.rsplit("#", 1)[-1])
except Exception:
n = 0
return (int(s) if isinstance(s, int) else 0,
int(ci) if isinstance(ci, int) else 0,
n)
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
texts: List[str] = []
for p in chunks_sorted:
pl = p.payload or {}
t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
if isinstance(t, str) and t:
texts.append(t)
return "\n".join(texts).strip()
def parse_args() -> argparse.Namespace:
    """Build and evaluate the CLI for the Markdown export."""
    parser = argparse.ArgumentParser(prog="export_markdown.py", description="Exportiert Notes aus Qdrant in Markdown.")
    parser.add_argument("--out", required=True, help="Zielordner")
    parser.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX")
    parser.add_argument("--note-id", default="", help="nur eine Note exportieren")
    parser.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben")
    parser.add_argument("--include-edges", default="none", choices=["none", "yaml", "footer"], help="Edges im Export anzeigen")
    return parser.parse_args()
def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]:
refs = []
for p in edges:
pl = p.payload or {}
if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
refs.append(pl["target_id"])
# de-dupe, preserve order
seen = set()
out = []
for r in refs:
if r not in seen:
seen.add(r)
out.append(r)
return out
def main() -> None:
    """Export notes from Qdrant into Markdown files under ``--out``.

    The body comes from the note's ``fulltext`` payload when present,
    otherwise it is reconstructed from the note's chunks. Edges can
    optionally be embedded (``--include-edges yaml|footer``). One JSON
    status line is printed per note (write/skip), plus a summary.

    NOTE(review): the original region contained two interleaved versions
    of ``main`` from a corrupted diff; this reconstruction follows the
    newer, ``fetch_all_notes``-based variant.
    """
    args = parse_args()

    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    cfg = QdrantConfig.from_env()
    if args.prefix:
        # CLI prefix wins over ENV COLLECTION_PREFIX.
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
    # NOTE(review): both ensure_collections(client, prefix, dim) and
    # ensure_collections(client, cfg) appear in the corrupted source —
    # confirm the current signature in app.core.qdrant.
    ensure_collections(client, cfg.prefix, cfg.dim)

    # Either a single note or the whole collection.
    if args.note_id:
        notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
    else:
        notes = fetch_all_notes(client, cfg)

    exported = 0
    for n in notes:
        note_id = n.get("note_id") or n.get("id")
        if not note_id:
            # Without an id we can neither name the file nor fetch chunks.
            continue

        # Relative target path; fall back to "<note_id>.md".
        rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md"))
        dst = out_root.joinpath(rel)

        # Prefer the stored fulltext; otherwise rebuild from chunks.
        # NOTE(review): assumes fetch_chunks_for_note returns plain payload
        # dicts — confirm which _reconstruct_body_from_chunks variant
        # (dict-based vs. point-based) is active in this file.
        body = str(n.get("fulltext") or "")
        if not body:
            chunks = fetch_chunks_for_note(client, cfg, note_id)
            body = _reconstruct_body_from_chunks(chunks)

        # Frontmatter: core fields first, then content hashes.
        fm: Dict[str, Any] = {}
        for k in ("id", "title", "type", "status", "created", "tags", "priority",
                  "due", "effort_min", "values", "goals", "embedding_exclude"):
            if k in n:
                fm[k] = n[k]
        for k in ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter"):
            if k in n:
                fm[k] = n[k]

        # Optional edge export — best effort, failures are ignored.
        edges_block = ""
        if args.include_edges in ("yaml", "footer"):
            try:
                edges = fetch_edges_for_note(client, cfg, note_id) or []
                if args.include_edges == "yaml":
                    fm["_edges"] = edges
                else:
                    edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
            except Exception:
                pass

        if dst.exists() and not args.overwrite:
            decision = "skip"
        else:
            _ensure_parent(dst)
            content = _yaml_frontmatter(fm) + (body or "") + edges_block
            dst.write_text(content, encoding="utf-8")
            decision = "write"

        print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False))
        if decision == "write":
            exported += 1

    print(f"Done. Exported notes: {exported}")


if __name__ == "__main__":
    main()