#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Post-patch content of scripts/export_markdown.py, reconstructed from commit
# ada7c1797def10e121d0e7f96c9726cbd341e07a ("scripts/export_markdown.py aktualisiert").
"""
Script: scripts/export_markdown.py — Qdrant → Markdown (Vault)
Version: 1.4.0
Date: 2025-09-10

Purpose
-------
Exports notes (frontmatter + body) from Qdrant into a target directory. The body is
preferably taken from the `fulltext` field; if that is empty or missing, it is rebuilt
from the chunks (sort order: seq → chunk_index → number in chunk_id). Paths are written
as **relative** paths.

Options
-------
--out PATH            Target directory (required)
--prefix TEXT         Collection prefix (CLI overrides ENV COLLECTION_PREFIX)
--note-id ID          Export a single note only
--overwrite           Overwrite existing files
--include-edges MODE  none|yaml|footer (default: none)
--flatten-paths       Write all files flat; original path is kept in frontmatter: orig_path

ENV
---
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY

Examples
--------
  export COLLECTION_PREFIX="mindnet"
  python3 -m scripts.export_markdown --out ./_exportVault

  # Single note only, edges as YAML field 'references'
  python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml

  # Write flat and overwrite existing files
  python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite
"""
from __future__ import annotations

import argparse
import os
import json
from typing import Dict, List, Optional, Tuple, Any

import yaml
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client
from app.core.qdrant import ensure_collections  # safety

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------


def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*."""
    return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"


def _norm_rel_path(path: str) -> str:
    """Return *path* with backslashes turned into slashes and leading slashes removed.

    A falsy *path* (``None`` or ``""``) yields ``""``.
    """
    return (path or "").replace("\\", "/").lstrip("/")


def _ensure_dir(path: str) -> None:
    """Create the parent directory of *path* if it has one and it does not exist yet."""
    d = os.path.dirname(path)
    if d and not os.path.exists(d):
        os.makedirs(d, exist_ok=True)


def _yaml_dump(data: Dict[str, Any]) -> str:
    """Serialize *data* as YAML, keeping key order and non-ASCII characters, stripped."""
    return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip()


def _frontmatter_block(fm: Dict[str, Any]) -> str:
    """Wrap the YAML form of *fm* in ``---`` fences, ending with a newline."""
    y = _yaml_dump(fm)
    return f"---\n{y}\n---\n"


def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
    """Return every point of *collection* that matches *flt*.

    Pages through ``client.scroll`` 256 points at a time, requesting payloads but
    no vectors. Paging stops when a page comes back empty or the returned next
    offset is ``None``.
    """
    out = []
    nextp = None
    while True:
        pts, nextp = client.scroll(
            collection_name=collection,
            with_payload=True,
            with_vectors=False,
            limit=256,
            scroll_filter=flt,
            offset=nextp,
        )
        if not pts:
            break
        out.extend(pts)
        if nextp is None:
            break
    return out


def _reconstruct_body_from_chunks(chunks: List[Any]) -> str:
    """Rebuild a note body from its chunk points.

    Chunks are ordered by ``(seq, chunk_index, number after the last '#' in
    chunk_id)``; a component that is missing or not usable counts as 0. For each
    chunk the first non-empty payload field among ``text``, ``content`` and
    ``raw`` is used. The pieces are joined with a single newline and the result
    is stripped.
    """

    def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]:
        s = pl.get("seq")
        ci = pl.get("chunk_index")
        cid = pl.get("chunk_id") or ""
        n = 0
        if isinstance(cid, str) and "#" in cid:
            try:
                n = int(cid.rsplit("#", 1)[-1])
            except ValueError:
                # Suffix after '#' is not a number; fall back to 0.
                n = 0
        return (
            int(s) if isinstance(s, int) else 0,
            int(ci) if isinstance(ci, int) else 0,
            n,
        )

    chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
    texts: List[str] = []
    for p in chunks_sorted:
        pl = p.payload or {}
        t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
        if isinstance(t, str) and t:
            texts.append(t)
    return "\n".join(texts).strip()


def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]:
    """Return the ``target_id`` of every edge whose payload ``kind`` is ``"references"``.

    Only string targets are kept. Duplicates are removed; the first occurrence
    decides the position in the result.
    """
    refs = []
    for p in edges:
        pl = p.payload or {}
        if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
            refs.append(pl["target_id"])
    # dict preserves insertion order, so this de-duplicates without reordering.
    return list(dict.fromkeys(refs))


# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------


def main() -> None:
    """Export notes from Qdrant as Markdown files below ``--out``.

    For every note point in the notes collection (optionally restricted to
    ``--note-id``) this builds a YAML frontmatter block and a body, then writes
    the file. An existing file is skipped unless ``--overwrite`` is given. One
    JSON line with ``note_id``, ``path`` and ``decision`` (``"write"`` or
    ``"skip-exists"``) is printed per note, followed by a final summary line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Zielordner für exportierte Markdown-Dateien")
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
    ap.add_argument("--note-id", help="Nur eine bestimmte Note-ID exportieren")
    ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    ap.add_argument("--include-edges", choices=["none", "yaml", "footer"], default="none",
                    help="Forward-Links mit exportieren (aus Edges oder Note-Payload)")
    ap.add_argument("--flatten-paths", action="store_true", help="Alle Dateien flach schreiben (orig_path in Frontmatter)")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)

    out_root = os.path.abspath(args.out)
    os.makedirs(out_root, exist_ok=True)

    notes_col, chunks_col, edges_col = collections(cfg.prefix)

    # Optional filter on a single note id
    flt = None
    if args.note_id:
        flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))])

    notes = _scroll_all(client, notes_col, flt)
    total = 0
    for n in notes:
        pl = n.payload or {}
        nid = pl.get("note_id")
        rel_path = _norm_rel_path(pl.get("path") or "")
        if args.flatten_paths or not rel_path:
            fname = f"{(nid or 'note')}.md"
            out_path = os.path.join(out_root, fname)
        else:
            out_path = os.path.join(out_root, rel_path)
        out_path = out_path.replace("\\", "/")
        _ensure_dir(out_path)

        # Build the frontmatter from a fixed list of payload fields
        fm_fields = ["id","title","type","status","created","updated","tags","area","project","source","lang","slug","aliases"]
        fm: Dict[str, Any] = {}
        fm["id"] = nid
        for k in fm_fields:
            if k == "id":
                continue
            if k in pl and pl[k] is not None:
                fm[k] = pl[k]
        if args.flatten_paths and rel_path:
            fm["orig_path"] = rel_path

        # Determine the body (fulltext, otherwise rebuilt from chunks)
        body = (pl.get("fulltext") or "").strip()
        if not body:
            flt_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
            chunks = _scroll_all(client, chunks_col, flt_chunks)
            body = _reconstruct_body_from_chunks(chunks)

        # Edges (optional)
        # argparse stores "--include-edges" as the attribute "include_edges";
        # "args.include-edges" would be parsed as the subtraction "args.include - edges".
        refs: List[str] = []
        if args.include_edges != "none":
            # Prefer references stored on the note payload, if present
            if isinstance(pl.get("references"), list) and pl["references"]:
                refs = [r for r in pl["references"] if isinstance(r, str)]
            else:
                flt_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
                edges = _scroll_all(client, edges_col, flt_edges)
                refs = _collect_forward_refs_from_edges(edges)
            if args.include_edges == "yaml" and refs:
                fm["references"] = refs

        # Write the file
        if (not args.overwrite) and os.path.exists(out_path):
            print(json.dumps({"note_id": nid, "path": out_path, "decision": "skip-exists"}))
            continue

        content = _frontmatter_block(fm) + (body + "\n" if body else "")
        if args.include_edges == "footer" and refs:
            content += "\n---\nLinks:\n" + "\n".join(f"- [[{r}]]" for r in refs) + "\n"

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)

        print(json.dumps({"note_id": nid, "path": out_path, "decision": "write"}))
        total += 1

    print(f"Done. Exported notes: {total}")


if __name__ == "__main__":
    # NOTE(review): the body of this guard is cut off in the visible patch;
    # calling main() is assumed from the documented "python3 -m" usage — confirm.
    main()