scripts/export_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 08:00:15 +01:00
parent c05dbd4b3b
commit b5958a9f63

View File

@ -1,227 +1,180 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: scripts/export_markdown.py Qdrant Markdown (Vault) Modul: scripts/export_markdown.py
Version: 1.4.1 Version: 1.6.1
Datum: 2025-09-10 Datum: 2025-11-07
Funktion Zweck
-------- -----
Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird Exportiert Notes aus Qdrant zurück in Markdown-Dateien (verlustarm):
bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks Pfade relativieren, Backslashes Slashes
(Sortierung: seq chunk_index Nummer in chunk_id). Pfade werden **relativ** geschrieben. Body aus 'fulltext' (falls vorhanden) oder Rekonstruktion via Chunks (seq/chunk_index)
Optional: vorhandene Edges pro Note mit exportieren (--include-edges yaml|footer)
Optionen CLI
--------
--out PATH Zielordner (erforderlich)
--prefix TEXT Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX)
--note-id ID Nur eine Note exportieren
--overwrite Existierende Dateien überschreiben
--include-edges MODE none|yaml|footer (Default: none)
--flatten-paths Alle Dateien flach schreiben; Originalpfad in FM: orig_path
ENV
--- ---
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
Beispiele
---------
export COLLECTION_PREFIX="mindnet" export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault python3 -m scripts.export_markdown --out ./_exportVault
python3 -m scripts.export_markdown --out ./_exportVault --note-id <ID>
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml
python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer
# Nur eine Note, Edges als YAML-Feld 'references' Parameter
python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml ---------
--out Zielwurzel (Ordner wird angelegt)
# Flach schreiben mit Überschreiben --prefix überschreibt ENV COLLECTION_PREFIX (Default: mindnet)
python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite --note-id nur eine bestimmte Note exportieren
--overwrite vorhandene Dateien überschreiben
--include-edges none|yaml|footer (Default: none)
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import os import os
import json import json
from typing import Dict, List, Optional, Tuple, Any from pathlib import Path
from typing import Dict, List, Tuple, Optional
import yaml from app.core.qdrant import (
from qdrant_client.http import models as rest QdrantConfig,
get_client,
fetch_all_notes,
fetch_chunks_for_note,
fetch_edges_for_note, # <— jetzt angebunden
ensure_collections,
)
from app.core.qdrant import QdrantConfig, get_client def _normalize_rel_path(p: str) -> str:
from app.core.qdrant import ensure_collections # safety p = (p or "").replace("\\", "/")
while p.startswith("/"):
p = p[1:]
return p
# --------------------------------------------------------------------- def _ensure_parent(p: Path):
# Helpers p.parent.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------
def collections(prefix: str) -> Tuple[str, str, str]: def _yaml_frontmatter(d: Dict) -> str:
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" import io
def _ser(obj):
if isinstance(obj, str):
if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]):
return '"' + obj.replace('"', '\\"') + '"'
return obj
if isinstance(obj, bool):
return "true" if obj else "false"
if obj is None:
return "null"
if isinstance(obj, (int, float)):
return str(obj)
if isinstance(obj, list):
return "[" + ", ".join(_ser(x) for x in obj) + "]"
if isinstance(obj, dict):
inner = []
for k in sorted(obj.keys()):
inner.append(f"{k}: {_ser(obj[k])}")
return "{ " + ", ".join(inner) + " }"
return '"' + str(obj).replace('"', '\\"') + '"'
def _norm_rel_path(path: str) -> str: buf = io.StringIO()
p = (path or "").replace("\\", "/").lstrip("/") buf.write("---\n")
return p if p else "" for k in sorted(d.keys()):
buf.write(f"{k}: {_ser(d[k])}\n")
buf.write("---\n")
return buf.getvalue()
def _ensure_dir(path: str) -> None: def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str:
d = os.path.dirname(path) if not chunks:
if d and not os.path.exists(d): return ""
os.makedirs(d, exist_ok=True) def _num_from_chunk_id(cid: str) -> int:
try:
def _yaml_dump(data: Dict[str, Any]) -> str: if "#" in cid:
return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip() return int(cid.split("#", 1)[1])
return 0
def _frontmatter_block(fm: Dict[str, Any]) -> str: except Exception:
y = _yaml_dump(fm) return 0
return f"---\n{y}\n---\n" chunks_sorted = sorted(
chunks,
def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]: key=lambda c: (
out = [] int(c.get("seq", c.get("chunk_index", 0))),
nextp = None int(c.get("chunk_index", 0)),
while True: _num_from_chunk_id(str(c.get("chunk_id", ""))),
pts, nextp = client.scroll(
collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt,
offset=nextp,
) )
if not pts: )
break body = "".join(c.get("text") or "" for c in chunks_sorted)
out.extend(pts) return body
if nextp is None:
break
return out
def _reconstruct_body_from_chunks(chunks: List[Any]) -> str: def parse_args() -> argparse.Namespace:
def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]: p = argparse.ArgumentParser(prog="export_markdown.py", description="Exportiert Notes aus Qdrant in Markdown.")
s = pl.get("seq") p.add_argument("--out", required=True, help="Zielordner")
ci = pl.get("chunk_index") p.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX")
cid = pl.get("chunk_id") or "" p.add_argument("--note-id", default="", help="nur eine Note exportieren")
n = 0 p.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben")
if isinstance(cid, str) and "#" in cid: p.add_argument("--include-edges", default="none", choices=["none", "yaml", "footer"], help="Edges im Export anzeigen")
try: return p.parse_args()
n = int(cid.rsplit("#", 1)[-1])
except Exception:
n = 0
return (int(s) if isinstance(s, int) else 0,
int(ci) if isinstance(ci, int) else 0,
n)
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
texts: List[str] = []
for p in chunks_sorted:
pl = p.payload or {}
t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
if isinstance(t, str) and t:
texts.append(t)
return "\n".join(texts).strip()
def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]: def main():
refs = [] args = parse_args()
for p in edges: out_root = Path(args.out).resolve()
pl = p.payload or {} out_root.mkdir(parents=True, exist_ok=True)
if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
refs.append(pl["target_id"])
# de-dupe, preserve order
seen = set()
out = []
for r in refs:
if r not in seen:
seen.add(r)
out.append(r)
return out
# --------------------------------------------------------------------- prefix = args.prefix.strip() or os.environ.get("COLLECTION_PREFIX", "").strip() or "mindnet"
# Main cfg = QdrantConfig.from_env(prefix=prefix)
# ---------------------------------------------------------------------
def main() -> None:
ap = argparse.ArgumentParser()
ap.add_argument("--out", required=True, help="Zielordner für exportierte Markdown-Dateien")
ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
ap.add_argument("--note-id", help="Nur eine bestimmte Note-ID exportieren")
ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
ap.add_argument("--include-edges", choices=["none", "yaml", "footer"], default="none",
help="Forward-Links mit exportieren (aus Edges oder Note-Payload)")
ap.add_argument("--flatten-paths", action="store_true", help="Alle Dateien flach schreiben (orig_path in Frontmatter)")
args = ap.parse_args()
cfg = QdrantConfig.from_env()
if args.prefix:
cfg.prefix = args.prefix.strip()
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim) ensure_collections(client, cfg)
out_root = os.path.abspath(args.out)
os.makedirs(out_root, exist_ok=True)
notes_col, chunks_col, edges_col = collections(cfg.prefix)
# Filter nach note-id (optional)
flt = None
if args.note_id: if args.note_id:
flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))]) notes = fetch_all_notes(client, cfg, only_note_id=args.note_id)
else:
notes = fetch_all_notes(client, cfg)
notes = _scroll_all(client, notes_col, flt) exported = 0
total = 0
for n in notes: for n in notes:
pl = n.payload or {} note_id = n.get("note_id") or n.get("id")
nid = pl.get("note_id") if not note_id:
rel_path = _norm_rel_path(pl.get("path") or "")
if args.flatten_paths or not rel_path:
fname = f"{(nid or 'note')}.md"
out_path = os.path.join(out_root, fname)
else:
out_path = os.path.join(out_root, rel_path)
out_path = out_path.replace("\\", "/")
_ensure_dir(out_path)
# Frontmatter aufbauen (nur sinnvolle Felder)
fm_fields = ["id","title","type","status","created","updated","tags","area","project","source","lang","slug","aliases"]
fm: Dict[str, Any] = {}
fm["id"] = nid
for k in fm_fields:
if k == "id":
continue
if k in pl and pl[k] is not None:
fm[k] = pl[k]
if args.flatten_paths and rel_path:
fm["orig_path"] = rel_path
# Body ermitteln (fulltext oder Chunks)
body = (pl.get("fulltext") or "").strip()
if not body:
flt_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
chunks = _scroll_all(client, chunks_col, flt_chunks)
body = _reconstruct_body_from_chunks(chunks)
# Edges (optional)
refs: List[str] = []
if args.include_edges != "none":
# aus Note-Payload, falls vorhanden
if isinstance(pl.get("references"), list) and pl["references"]:
refs = [r for r in pl["references"] if isinstance(r, str)]
else:
flt_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
edges = _scroll_all(client, edges_col, flt_edges)
refs = _collect_forward_refs_from_edges(edges)
if args.include_edges == "yaml" and refs:
fm["references"] = refs
# Datei schreiben
if (not args.overwrite) and os.path.exists(out_path):
print(json.dumps({"note_id": nid, "path": out_path, "decision": "skip-exists"}))
continue continue
content = _frontmatter_block(fm) + (body + "\n" if body else "") rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md"))
if args.include_edges == "footer" and refs: dst = out_root.joinpath(rel)
content += "\n---\nLinks:\n" + "\n".join(f"- [[{r}]]" for r in refs) + "\n"
with open(out_path, "w", encoding="utf-8") as f: body = str(n.get("fulltext") or "")
f.write(content) if not body:
chunks = fetch_chunks_for_note(client, cfg, note_id)
body = _reconstruct_body_from_chunks(chunks)
print(json.dumps({"note_id": nid, "path": out_path, "decision": "write"})) fm = {}
total += 1 for k in ("id", "title", "type", "status", "created", "tags", "priority", "due", "effort_min", "values", "goals", "embedding_exclude"):
if k in n:
fm[k] = n[k]
for k in ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter"):
if k in n:
fm[k] = n[k]
print(f"Done. Exported notes: {total}") edges_block = ""
if args.include_edges in ("yaml", "footer"):
try:
edges = fetch_edges_for_note(client, cfg, note_id) or []
if args.include_edges == "yaml":
fm["_edges"] = edges
else:
edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
except Exception:
pass
if dst.exists() and not args.overwrite:
decision = "skip"
else:
_ensure_parent(dst)
content = _yaml_frontmatter(fm) + (body or "") + edges_block
dst.write_text(content, encoding="utf-8")
decision = "write"
print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False))
if decision == "write":
exported += 1
print(f"Done. Exported notes: {exported}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()