scripts/export_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 08:00:15 +01:00
parent c05dbd4b3b
commit b5958a9f63

View File

@@ -1,227 +1,180 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: scripts/export_markdown.py
Version: 1.6.1
Date: 2025-11-07

Purpose
-------
Exports notes from Qdrant back into Markdown files (low-loss):
- paths are made relative, backslashes become forward slashes
- the body comes from 'fulltext' (if present) or is reconstructed from
  chunks (ordered by seq / chunk_index / numeric suffix of chunk_id)
- optionally, existing edges per note are exported (--include-edges yaml|footer)

CLI
---
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault

Parameters
----------
--out            target root (directory is created)
--prefix         overrides ENV COLLECTION_PREFIX (default: mindnet)
--note-id        export only one specific note
--overwrite      overwrite existing files
--include-edges  none|yaml|footer (default: none)
"""
from __future__ import annotations

import argparse
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from app.core.qdrant import (
    QdrantConfig,
    ensure_collections,
    fetch_all_notes,
    fetch_chunks_for_note,
    fetch_edges_for_note,
    get_client,
)
def collections(prefix: str) -> Tuple[str, str, str]:
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
def _norm_rel_path(path: str) -> str:
p = (path or "").replace("\\", "/").lstrip("/")
return p if p else ""
def _ensure_dir(path: str) -> None:
d = os.path.dirname(path)
if d and not os.path.exists(d):
os.makedirs(d, exist_ok=True)
def _yaml_dump(data: Dict[str, Any]) -> str:
return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip()
def _frontmatter_block(fm: Dict[str, Any]) -> str:
y = _yaml_dump(fm)
return f"---\n{y}\n---\n"
def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
out = []
nextp = None
while True:
pts, nextp = client.scroll(
collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt,
offset=nextp,
) )
if not pts:
break
out.extend(pts)
if nextp is None:
break
return out
def _reconstruct_body_from_chunks(chunks: List[Any]) -> str: def _normalize_rel_path(p: str) -> str:
def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]: p = (p or "").replace("\\", "/")
s = pl.get("seq") while p.startswith("/"):
ci = pl.get("chunk_index") p = p[1:]
cid = pl.get("chunk_id") or "" return p
n = 0
if isinstance(cid, str) and "#" in cid: def _ensure_parent(p: Path):
p.parent.mkdir(parents=True, exist_ok=True)
def _yaml_frontmatter(d: Dict) -> str:
import io
def _ser(obj):
if isinstance(obj, str):
if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]):
return '"' + obj.replace('"', '\\"') + '"'
return obj
if isinstance(obj, bool):
return "true" if obj else "false"
if obj is None:
return "null"
if isinstance(obj, (int, float)):
return str(obj)
if isinstance(obj, list):
return "[" + ", ".join(_ser(x) for x in obj) + "]"
if isinstance(obj, dict):
inner = []
for k in sorted(obj.keys()):
inner.append(f"{k}: {_ser(obj[k])}")
return "{ " + ", ".join(inner) + " }"
return '"' + str(obj).replace('"', '\\"') + '"'
buf = io.StringIO()
buf.write("---\n")
for k in sorted(d.keys()):
buf.write(f"{k}: {_ser(d[k])}\n")
buf.write("---\n")
return buf.getvalue()
def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str:
if not chunks:
return ""
def _num_from_chunk_id(cid: str) -> int:
try: try:
n = int(cid.rsplit("#", 1)[-1]) if "#" in cid:
return int(cid.split("#", 1)[1])
return 0
except Exception: except Exception:
n = 0 return 0
return (int(s) if isinstance(s, int) else 0, chunks_sorted = sorted(
int(ci) if isinstance(ci, int) else 0, chunks,
n) key=lambda c: (
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {})) int(c.get("seq", c.get("chunk_index", 0))),
texts: List[str] = [] int(c.get("chunk_index", 0)),
for p in chunks_sorted: _num_from_chunk_id(str(c.get("chunk_id", ""))),
pl = p.payload or {} )
t = pl.get("text") or pl.get("content") or pl.get("raw") or "" )
if isinstance(t, str) and t: body = "".join(c.get("text") or "" for c in chunks_sorted)
texts.append(t) return body
return "\n".join(texts).strip()
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface of the exporter."""
    parser = argparse.ArgumentParser(
        prog="export_markdown.py",
        description="Exportiert Notes aus Qdrant in Markdown.",
    )
    parser.add_argument("--out", required=True, help="Zielordner")
    parser.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX")
    parser.add_argument("--note-id", default="", help="nur eine Note exportieren")
    parser.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben")
    parser.add_argument(
        "--include-edges",
        default="none",
        choices=["none", "yaml", "footer"],
        help="Edges im Export anzeigen",
    )
    return parser.parse_args()
def main():
    """Export notes from Qdrant into Markdown files under ``--out``.

    For each note: resolve a relative target path, take the body from
    'fulltext' or reconstruct it from chunks, build frontmatter from a
    whitelist of payload fields, optionally attach edges, and write the
    file (existing files are skipped unless --overwrite). One JSON status
    line is printed per note.
    """
    args = parse_args()

    out_root = Path(args.out).resolve()
    out_root.mkdir(parents=True, exist_ok=True)

    # CLI --prefix wins over ENV COLLECTION_PREFIX; fall back to "mindnet".
    prefix = args.prefix.strip() or os.environ.get("COLLECTION_PREFIX", "").strip() or "mindnet"
    cfg = QdrantConfig.from_env(prefix=prefix)
    client = get_client(cfg)
    ensure_collections(client, cfg)

    if args.note_id:
        notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
    else:
        notes = fetch_all_notes(client, cfg)

    exported = 0
    for note in notes:
        note_id = note.get("note_id") or note.get("id")
        if not note_id:
            # A note without any identifier cannot be exported meaningfully.
            continue

        rel = _normalize_rel_path(str(note.get("path") or f"{note_id}.md"))
        dst = out_root.joinpath(rel)

        # Prefer the stored fulltext; otherwise rebuild the body from chunks.
        body = str(note.get("fulltext") or "")
        if not body:
            body = _reconstruct_body_from_chunks(fetch_chunks_for_note(client, cfg, note_id))

        # Frontmatter: whitelisted content fields first, then hash fields.
        content_keys = (
            "id", "title", "type", "status", "created", "tags", "priority",
            "due", "effort_min", "values", "goals", "embedding_exclude",
        )
        hash_keys = ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter")
        fm = {}
        for key in content_keys + hash_keys:
            if key in note:
                fm[key] = note[key]

        edges_block = ""
        if args.include_edges in ("yaml", "footer"):
            # Best-effort: a failing edge lookup must not abort the export.
            try:
                edges = fetch_edges_for_note(client, cfg, note_id) or []
                if args.include_edges == "yaml":
                    fm["_edges"] = edges
                else:
                    edges_block = (
                        "\n\n---\n_edges_:\n"
                        + json.dumps(edges, ensure_ascii=False, indent=2)
                        + "\n"
                    )
            except Exception:
                pass

        if dst.exists() and not args.overwrite:
            decision = "skip"
        else:
            _ensure_parent(dst)
            dst.write_text(_yaml_frontmatter(fm) + (body or "") + edges_block, encoding="utf-8")
            decision = "write"

        print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False))
        if decision == "write":
            exported += 1

    print(f"Done. Exported notes: {exported}")


if __name__ == "__main__":
    main()