# scripts/export_markdown.py
# -----------------------------------------------------------------------------
# Name:     export_markdown.py
# Version:  1.0.0 (2025-09-08)
# Purpose:  Exports notes + chunks from Qdrant back into Markdown files.
#
# What it does:
#   - Fetches notes from Qdrant (all of them, or filtered via --note-id).
#   - Fetches the chunks that belong to each note (sorted by seq).
#   - Builds Markdown with YAML front matter + body (joined from the chunks).
#   - Writes the files below --out (the directory is created).
#   - Uses the path from payload.path when present and located inside --out;
#     otherwise the file name is derived from the title.
#
# Usage:
#   # export all notes:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export
#
#   # export only specific note IDs:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export \
#     --note-id 20250821-architektur-ki-trainerassistent-761cfe \
#     --note-id 20250821-personal-mind-ki-projekt-7b0d79
#
# Parameters:
#   --prefix     : collection prefix (default: mindnet)
#   --out        : target directory (is created)
#   --note-id    : may be given several times; only these notes are exported
#   --overwrite  : overwrite existing files (otherwise they are skipped)
#
# Requirements:
#   - Running inside the activated venv is recommended: source .venv/bin/activate
#   - Qdrant runs locally (or URL/API key in ENV), see app/core/qdrant.py
#
# Changes:
#   - 1.0.0: First release.
# -----------------------------------------------------------------------------

import argparse
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client, collection_names

# Front-matter keys that get exported, in the order they are written.
_FRONTMATTER_KEYS = (
    "id", "note_id", "title", "type", "status",
    "created", "updated", "path", "tags",
    "area", "project", "source", "lang", "slug",
)

# Number of points requested per scroll page.
_SCROLL_PAGE_SIZE = 256


def to_yaml_frontmatter(fm: Dict) -> str:
    """Serialize a payload dict as a YAML front-matter block.

    Only the keys listed in ``_FRONTMATTER_KEYS`` are written, in that order;
    keys that are missing or ``None`` are omitted. When the payload has a
    ``note_id`` but no ``id``, ``id`` is filled in from ``note_id``.

    Args:
        fm: Note payload (not modified).

    Returns:
        The block including the opening and closing ``---`` lines, without a
        trailing newline.
    """
    meta = dict(fm)  # work on a copy so the caller's payload stays untouched
    if "id" not in meta and "note_id" in meta:
        meta["id"] = meta["note_id"]

    lines: List[str] = ["---"]
    for key in _FRONTMATTER_KEYS:
        value = meta.get(key)
        if value is None:
            continue
        # Values are emitted as JSON literals (scalars quoted, lists as
        # ``[a, b]``), which is used here as YAML flow syntax.
        lines.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
    lines.append("---")
    return "\n".join(lines)


def sanitize_filename(name: str) -> str:
    """Return *name* stripped, with ``/`` replaced by ``-`` and runs of
    whitespace collapsed to a single space."""
    cleaned = name.strip().replace("/", "-")
    return re.sub(r"\s+", " ", cleaned)


def _is_inside(root: Path, candidate: Path) -> bool:
    """Return True when *candidate* lies strictly below *root*.

    Both paths are normalized lexically (``..`` segments are collapsed,
    symlinks are not resolved) before they are compared.
    """
    root_abs = os.path.abspath(root)
    cand_abs = os.path.abspath(candidate)
    if cand_abs == root_abs:
        return False
    try:
        return os.path.commonpath([root_abs, cand_abs]) == root_abs
    except ValueError:
        # commonpath raises e.g. for paths on different drives.
        return False


def choose_output_path(out_dir: Path, fm: Dict) -> Path:
    """Pick the file a note is written to.

    ``fm["path"]`` is preferred, but only when it stays inside *out_dir*:
    an absolute path or one containing ``..`` would otherwise make
    ``joinpath`` point outside the export directory. In that case, or when no
    path is stored, the name is derived from the title (then ``note_id``,
    then the literal ``note``).

    Args:
        out_dir: Export root directory.
        fm: Note payload.

    Returns:
        Target path below *out_dir*.
    """
    stored = fm.get("path")
    if stored:
        candidate = out_dir.joinpath(str(stored))
        if _is_inside(out_dir, candidate):
            return candidate

    base = fm.get("title") or fm.get("note_id") or "note"
    # A whitespace-only title sanitizes to "", which would yield ".md".
    stem = sanitize_filename(str(base)) or "note"
    return out_dir.joinpath(stem + ".md")


def _scroll_payloads(
    client: QdrantClient,
    collection: str,
    flt: Optional[rest.Filter],
) -> List[Dict]:
    """Scroll through *collection* page by page and collect every non-empty
    payload of the points matching *flt* (``None`` means no filter)."""
    payloads: List[Dict] = []
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=_SCROLL_PAGE_SIZE,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(pt.payload for pt in points if pt.payload)
        if offset is None:  # no further page
            return payloads


def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]) -> List[Dict]:
    """Fetch the payloads of all notes, optionally restricted to some IDs.

    Args:
        client: Qdrant client.
        notes_col: Name of the notes collection.
        only_ids: When non-empty, only notes whose ``note_id`` equals one of
            these values are returned.

    Returns:
        List of payload dicts.
    """
    flt = None
    if only_ids:
        # "note_id IN [...]" expressed as an OR over exact matches.
        flt = rest.Filter(
            should=[
                rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
                for nid in only_ids
            ]
        )
    return _scroll_payloads(client, notes_col, flt)


def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[Dict]:
    """Fetch the chunk payloads of one note, ordered by their ``seq`` value.

    Args:
        client: Qdrant client.
        chunks_col: Name of the chunks collection.
        note_id: Value the chunks' ``note_id`` field must match.

    Returns:
        Chunk payloads sorted by ``seq``; a missing or null ``seq`` counts
        as 0.
    """
    flt = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    chunks = _scroll_payloads(client, chunks_col, flt)
    # ``or 0`` also covers an explicit ``seq: None``, which would otherwise
    # make the comparison against ints raise TypeError.
    chunks.sort(key=lambda ch: ch.get("seq") or 0)
    return chunks


def assemble_body_from_chunks(chunks: List[Dict]) -> str:
    """Join the ``text`` of all chunks with blank lines in between.

    Trailing whitespace is removed and exactly one newline is appended, so an
    empty chunk list yields ``"\\n"``.
    """
    texts = [str(ch.get("text") or "") for ch in chunks]
    return "\n\n".join(texts).rstrip() + "\n"


def write_note_as_markdown(out_dir: Path, note_payload: Dict, chunks: List[Dict], overwrite: bool) -> Path:
    """Write one note (front matter + body) as a UTF-8 Markdown file.

    Args:
        out_dir: Export root directory.
        note_payload: Note payload used for the front matter and the path.
        chunks: Chunk payloads in output order.
        overwrite: When False, an existing file is left untouched.

    Returns:
        The target path, whether or not the file was (re)written.
    """
    out_path = choose_output_path(out_dir, note_payload)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if out_path.exists() and not overwrite:
        return out_path

    frontmatter = to_yaml_frontmatter(note_payload)
    body = assemble_body_from_chunks(chunks)
    out_path.write_text(f"{frontmatter}\n{body}", encoding="utf-8")
    return out_path


def main():
    """CLI entry point: export notes from Qdrant and print a JSON report.

    The report lists one entry per note with its ``note_id``, the target
    ``path`` and a ``skipped`` flag that is true when the file already
    existed and ``--overwrite`` was not given.
    """
    ap = argparse.ArgumentParser(description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien.")
    ap.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet)")
    ap.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
    ap.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
    ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    args = ap.parse_args()

    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    cfg = QdrantConfig()
    client = get_client(cfg)
    cols = collection_names(args.prefix)

    notes = fetch_all_notes(client, cols["notes"], args.note_id)
    if not notes:
        print("Keine Notes in Qdrant gefunden (Filter zu streng?).")
        return

    exported = []
    for note in notes:
        nid = note.get("note_id") or note.get("id")
        target = choose_output_path(out_dir, note)
        skipped = target.exists() and not args.overwrite
        if skipped:
            # Nothing will be written, so the chunk query can be saved.
            path = target
        else:
            # Without an id there is nothing to match chunks against;
            # querying for the string "None" would be meaningless.
            chunks = (
                fetch_chunks_for_note(client, cols["chunks"], note_id=str(nid))
                if nid is not None
                else []
            )
            path = write_note_as_markdown(out_dir, note, chunks, overwrite=args.overwrite)
        exported.append({"note_id": nid, "path": str(path), "skipped": skipped})

    print(json.dumps({"exported": exported}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()