scripts/export_markdown.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
b394cadf73
commit
1b833f76ce
204
scripts/export_markdown.py
Normal file
204
scripts/export_markdown.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
# scripts/export_markdown.py
# -----------------------------------------------------------------------------
# Name: export_markdown.py
# Version: 1.0.0 (2025-09-08)
# Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien.
#
# Was es macht:
# - Holt Notes aus Qdrant (alle oder gefiltert per --note-id).
# - Holt zugehörige Chunks (nach seq sortiert).
# - Baut Markdown mit YAML-Frontmatter + Body (aus Chunks zusammengeführt).
# - Schreibt Dateien unter --out (Verzeichnis wird angelegt).
# - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert.
#
# Aufruf:
#   # alle Notes exportieren:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export
#
#   # nur bestimmte Note-IDs exportieren:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export \
#       --note-id 20250821-architektur-ki-trainerassistent-761cfe \
#       --note-id 20250821-personal-mind-ki-projekt-7b0d79
#
# Parameter:
#   --prefix    : Collections-Präfix (Default: mindnet)
#   --out       : Zielverzeichnis (wird erstellt)
#   --note-id   : Kann mehrfach angegeben werden; dann nur diese Notes
#   --overwrite : Existierende Dateien überschreiben (sonst überspringen)
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client, collection_names
|
||||
|
||||
|
||||
def to_yaml_frontmatter(fm: Dict) -> str:
    """Render a note payload dict as a YAML frontmatter block (simple, stable)."""
    # Fixed key order keeps exports deterministic and diff-friendly.
    key_order = (
        "id", "note_id", "title", "type", "status",
        "created", "updated", "path", "tags",
        "area", "project", "source", "lang", "slug",
    )
    data = dict(fm)
    # Mirror note_id into id when the payload only carries note_id.
    if "id" not in data and "note_id" in data:
        data["id"] = data["note_id"]

    out: List[str] = ["---"]
    for key in key_order:
        value = data.get(key)
        if value is None:
            # Missing keys and explicit None are both omitted.
            continue
        if isinstance(value, list):
            # Inline flow-style YAML list; json.dumps quotes each element.
            items = ", ".join(json.dumps(item, ensure_ascii=False) for item in value)
            out.append(f"{key}: [{items}]")
        else:
            # json.dumps yields valid YAML scalars (JSON is a YAML subset).
            out.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
    out.append("---")
    return "\n".join(out)
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Make *name* safe to use as a single file-name component."""
    # Slashes would create unintended subdirectories; map them to dashes,
    # then collapse any run of whitespace into a single space.
    cleaned = name.strip().replace("/", "-")
    return re.sub(r"\s+", " ", cleaned)
|
||||
|
||||
|
||||
def choose_output_path(out_dir: Path, fm: Dict) -> Path:
    """Pick the export file path for a note payload.

    Prefers the relative path stored in ``payload["path"]``; otherwise derives
    a file name from the title (or note_id). The result always stays inside
    *out_dir*.
    """
    # 1) prefer payload.path — but only when it cannot escape out_dir
    raw_path = fm.get("path")
    if raw_path:
        rel = Path(str(raw_path))
        # Guard against path traversal: an absolute path or ".." segments in
        # the payload would otherwise let the export write outside of out_dir
        # (Path.joinpath with an absolute argument discards out_dir entirely).
        if not rel.is_absolute() and ".." not in rel.parts:
            return out_dir.joinpath(rel)
    # 2) otherwise derive a sensible name from title (or note_id)
    base = fm.get("title") or fm.get("note_id") or "note"
    fname = sanitize_filename(str(base)) + ".md"
    return out_dir.joinpath(fname)
|
||||
|
||||
|
||||
def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]) -> List[Dict]:
    """Scroll through all notes in *notes_col*, optionally restricted to *only_ids*.

    Returns the raw payload dicts; vectors are never fetched.
    """
    note_filter = None
    if only_ids:
        # "should" gives OR semantics: note_id must match any requested id.
        note_filter = rest.Filter(
            should=[
                rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
                for nid in only_ids
            ]
        )

    payloads: List[Dict] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            scroll_filter=note_filter,
            offset=cursor,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(pt.payload for pt in points if pt.payload)
        # A None next-page offset signals the end of the scroll.
        if cursor is None:
            break
    return payloads
|
||||
|
||||
|
||||
def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[Dict]:
    """Collect every chunk payload belonging to *note_id*, ordered by seq."""
    chunk_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=chunks_col,
            scroll_filter=chunk_filter,
            offset=cursor,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(pt.payload for pt in points if pt.payload)
        # A None next-page offset signals the end of the scroll.
        if cursor is None:
            break

    # Restore document order; chunks without a seq sort first (treated as 0).
    payloads.sort(key=lambda p: p.get("seq", 0))
    return payloads
|
||||
|
||||
|
||||
def assemble_body_from_chunks(chunks: List[Dict]) -> str:
    """Join chunk texts into one Markdown body, blank-line separated.

    Always ends with exactly one trailing newline.
    """
    # Missing or falsy "text" values become empty segments.
    segments = [str(ch.get("text") or "") for ch in chunks]
    return "\n\n".join(segments).rstrip() + "\n"
|
||||
|
||||
|
||||
def write_note_as_markdown(out_dir: Path, note_payload: Dict, chunks: List[Dict], overwrite: bool) -> Path:
    """Write one note (YAML frontmatter + chunk body) below *out_dir*.

    Returns the target path; existing files are left untouched unless
    *overwrite* is set.
    """
    target = choose_output_path(out_dir, note_payload)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Skip (but still report) files that already exist.
    if target.exists() and not overwrite:
        return target

    document = f"{to_yaml_frontmatter(note_payload)}\n{assemble_body_from_chunks(chunks)}"
    target.write_text(document, encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def main():
    """CLI entry point: export notes + chunks from Qdrant as Markdown files."""
    parser = argparse.ArgumentParser(description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien.")
    parser.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet)")
    parser.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
    parser.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
    parser.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    args = parser.parse_args()

    target_dir = Path(args.out).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    # Connection settings come from ENV / defaults, see app/core/qdrant.py.
    client = get_client(QdrantConfig())
    cols = collection_names(args.prefix)

    notes = fetch_all_notes(client, cols["notes"], args.note_id)
    if not notes:
        print("Keine Notes in Qdrant gefunden (Filter zu streng?).")
        return

    report = []
    for payload in notes:
        note_id = payload.get("note_id") or payload.get("id")
        note_chunks = fetch_chunks_for_note(client, cols["chunks"], note_id=str(note_id))
        written = write_note_as_markdown(target_dir, payload, note_chunks, overwrite=args.overwrite)
        report.append({"note_id": note_id, "path": str(written)})

    # Machine-readable summary for callers/CI.
    print(json.dumps({"exported": report}, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
# Script entry: run via `python3 -m scripts.export_markdown ...`.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user