# scripts/export_markdown.py # ----------------------------------------------------------------------------- # Name: export_markdown.py # Version: 1.0.0 (2025-09-08) # Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien. # # Was es macht: # - Holt Notes aus Qdrant (alle oder gefiltert per --note-id). # - Holt zugehörige Chunks (nach seq sortiert). # - Baut Markdown mit YAML-Frontmatter + Body (aus Chunks zusammengeführt). # - Schreibt Dateien unter --out (Verzeichnis wird angelegt). # - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert. # # Aufruf: # # alle Notes exportieren: # python3 -m scripts.export_markdown --prefix mindnet --out ./_export # # # nur bestimmte Note-IDs exportieren: # python3 -m scripts.export_markdown --prefix mindnet --out ./_export \ # --note-id 20250821-architektur-ki-trainerassistent-761cfe \ # --note-id 20250821-personal-mind-ki-projekt-7b0d79 # # Parameter: # --prefix : Collections-Präfix (Default: mindnet) # --out : Zielverzeichnis (wird erstellt) # --note-id : Kann mehrfach angegeben werden; dann nur diese Notes # --overwrite : Existierende Dateien überschreiben (sonst überspringen) # # Voraussetzungen: # - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate # - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py # # Änderungen: # - 1.0.0: Erster Release. 
# -----------------------------------------------------------------------------
import argparse
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client, collection_names


def to_yaml_frontmatter(fm: Dict) -> str:
    """Serialize a note payload as a YAML frontmatter block.

    Only a fixed whitelist of keys is written, in a stable order, so that
    repeated exports produce identical output. Scalars and lists are rendered
    with json.dumps (valid YAML for these cases, preserves non-ASCII text).

    Args:
        fm: Note payload dict (as stored in Qdrant).

    Returns:
        The frontmatter text, including the opening and closing "---" lines,
        without a trailing newline.
    """
    ordered_keys = [
        "id",
        "note_id",
        "title",
        "type",
        "status",
        "created",
        "updated",
        "path",
        "tags",
        "area",
        "project",
        "source",
        "lang",
        "slug",
    ]
    lines: List[str] = ["---"]
    # Normalize: expose an "id" field even when the payload only has "note_id".
    m = dict(fm)
    if "id" not in m and "note_id" in m:
        m["id"] = m["note_id"]
    for k in ordered_keys:
        if k in m and m[k] is not None:
            v = m[k]
            if isinstance(v, list):
                items = ", ".join(json.dumps(x, ensure_ascii=False) for x in v)
                lines.append(f"{k}: [{items}]")
            else:
                lines.append(f"{k}: {json.dumps(v, ensure_ascii=False)}")
    lines.append("---")
    return "\n".join(lines)


def sanitize_filename(name: str) -> str:
    """Make a string safe for use as a file name: no slashes, collapsed whitespace."""
    name = name.strip().replace("/", "-")
    name = re.sub(r"\s+", " ", name)
    return name


def choose_output_path(out_dir: Path, fm: Dict) -> Path:
    """Decide where a note should be written below *out_dir*.

    Prefers the payload's own "path" field; otherwise derives a file name from
    the title (or note_id). The payload path is confined to *out_dir*: absolute
    prefixes and ".."/"." components are stripped, so a broken or hostile
    payload cannot cause writes outside the export directory. (Previously an
    absolute payload path would have replaced *out_dir* entirely, because
    Path.joinpath with an absolute argument discards the base.)
    """
    raw = fm.get("path")
    if raw:
        # Normalize separators, drop any absolute prefix, then filter out
        # traversal components before re-anchoring under out_dir.
        rel = Path(str(raw).replace("\\", "/").lstrip("/"))
        parts = [p for p in rel.parts if p not in ("..", ".")]
        if parts:
            return out_dir.joinpath(*parts)
    # Fallback: derive the file name from title or note_id.
    base = fm.get("title") or fm.get("note_id") or "note"
    fname = sanitize_filename(str(base)) + ".md"
    return out_dir.joinpath(fname)


def fetch_all_notes(
    client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]
) -> List[Dict]:
    """Scroll all note payloads from *notes_col*, optionally filtered by id.

    Args:
        client: Connected Qdrant client.
        notes_col: Name of the notes collection.
        only_ids: If given, restrict to these note_ids (OR-combined).

    Returns:
        List of payload dicts; points without a payload are skipped.
    """
    results: List[Dict] = []
    offset = None
    flt = None
    if only_ids:
        # Filter: note_id IN only_ids ("should" acts as OR across conditions).
        flt = rest.Filter(
            should=[
                rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
                for nid in only_ids
            ]
        )
    while True:
        pts, next_offset = client.scroll(
            collection_name=notes_col,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for pt in pts:
            if pt.payload:
                results.append(pt.payload)
        if next_offset is None:
            break
        offset = next_offset
    return results


def fetch_chunks_for_note(
    client: QdrantClient, chunks_col: str, note_id: str
) -> List[Dict]:
    """Scroll all chunk payloads belonging to *note_id*, sorted by "seq".

    Chunks lacking a "seq" field sort first (treated as 0).
    """
    res: List[Dict] = []
    offset = None
    flt = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    while True:
        pts, next_offset = client.scroll(
            collection_name=chunks_col,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for pt in pts:
            if pt.payload:
                res.append(pt.payload)
        if next_offset is None:
            break
        offset = next_offset
    # Restore document order; Qdrant scroll order is not the chunk order.
    res.sort(key=lambda x: x.get("seq", 0))
    return res


def assemble_body_from_chunks(chunks: List[Dict]) -> str:
    """Join chunk texts into a Markdown body, blank-line separated, newline-terminated."""
    parts: List[str] = []
    for ch in chunks:
        t = ch.get("text") or ""
        parts.append(str(t))
    return "\n\n".join(parts).rstrip() + "\n"


def write_note_as_markdown(
    out_dir: Path, note_payload: Dict, chunks: List[Dict], overwrite: bool
) -> Path:
    """Write one note (frontmatter + body) as a Markdown file under *out_dir*.

    Existing files are left untouched unless *overwrite* is set; the target
    path is returned either way.
    """
    out_path = choose_output_path(out_dir, note_payload)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists() and not overwrite:
        return out_path
    frontmatter = to_yaml_frontmatter(note_payload)
    body = assemble_body_from_chunks(chunks)
    content = f"{frontmatter}\n{body}"
    out_path.write_text(content, encoding="utf-8")
    return out_path


def main():
    """CLI entry point: export notes (and their chunks) from Qdrant to Markdown."""
    ap = argparse.ArgumentParser(
        description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien."
    )
    ap.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet)")
    ap.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
    ap.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
    ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    args = ap.parse_args()

    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    cfg = QdrantConfig()
    client = get_client(cfg)
    cols = collection_names(args.prefix)

    notes = fetch_all_notes(client, cols["notes"], args.note_id)
    if not notes:
        print("Keine Notes in Qdrant gefunden (Filter zu streng?).")
        return

    exported = []
    # "payload" instead of "np": avoids shadowing the conventional numpy alias.
    for payload in notes:
        nid = payload.get("note_id") or payload.get("id")
        chunks = fetch_chunks_for_note(client, cols["chunks"], note_id=str(nid))
        path = write_note_as_markdown(out_dir, payload, chunks, overwrite=args.overwrite)
        exported.append({"note_id": nid, "path": str(path)})

    print(json.dumps({"exported": exported}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()