scripts/export_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-09 09:33:51 +02:00
parent 8f97f42618
commit c2954b5663

View File

@ -1,202 +1,284 @@
# scripts/export_markdown.py #!/usr/bin/env python3
# ----------------------------------------------------------------------------- # -*- coding: utf-8 -*-
# Name: export_markdown.py """
# Version: 1.0.1 (2025-09-08) export_markdown.py Export aus Qdrant Markdown (Vault-Struktur)
# Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien.
#
# Was es macht:
# - Holt Notes aus Qdrant (alle oder gefiltert per --note-id).
# - Holt zugehörige Chunks (nach seq sortiert).
# - Baut Markdown mit YAML-Frontmatter + Body (aus Chunks zusammengeführt).
# - Schreibt Dateien unter --out (Verzeichnis wird angelegt).
# - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert.
#
# Aufruf (im venv):
# # alle Notes exportieren (Prefix wird aus ENV COLLECTION_PREFIX gelesen):
# python3 -m scripts.export_markdown --out ./_export
#
# # Prefix explizit per ENV überschreiben:
# COLLECTION_PREFIX=mindnet python3 -m scripts.export_markdown --out ./_export
#
# # nur bestimmte Note-IDs exportieren:
# python3 -m scripts.export_markdown --out ./_export \
# --note-id 20250821-architektur-ki-trainerassistent-761cfe \
# --note-id 20250821-personal-mind-ki-projekt-7b0d79
#
# Parameter:
# --out : Zielverzeichnis (wird erstellt, Pflicht)
# --note-id : Kann mehrfach angegeben werden; dann nur diese Notes
# --overwrite : Existierende Dateien überschreiben (sonst überspringen)
#
# Umgebung:
# QDRANT_URL (z. B. http://127.0.0.1:6333)
# QDRANT_API_KEY (optional)
# COLLECTION_PREFIX (Default in app/core/qdrant.py: "mindnet")
# VECTOR_DIM (Default in app/core/qdrant.py: 384)
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.1: Nutzt QdrantConfig.from_env() ohne Parameter; liest Prefix aus ENV.
# Passt collection_names()-Nutzung (Tupel) korrekt an.
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
Version: 1.3 (2025-09-09)
Kurzbeschreibung
- Exportiert Notes (+ rekonstruierten Body aus Chunks) aus Qdrant in Dateien mit YAML-Frontmatter.
- Nutzt ENV-Variablen aus .env (QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM).
- Optionales CLI-Argument --prefix überschreibt COLLECTION_PREFIX, damit alle Tools konsistent sind.
- Unterstützung von Mehrfachauswahl per --note-id (mehrfach angeben).
Anwendungsfälle
- Kompletter Vault-Neuaufbau aus Qdrant
- Teil-Export einzelner Notizen
- Sicherung / Migration
Voraussetzungen
- Aktiviertes venv (empfohlen): `source .venv/bin/activate`
- Laufender Qdrant (URL/API-Key passend zu deiner Umgebung)
- Sammlungen: <prefix>_notes, <prefix>_chunks
- Chunk-Payload enthält Text in `text` (Fallback: `raw`), Reihenfolge über `seq` oder Nummer in `chunk_id`.
Aufrufe (Beispiele)
- Prefix über ENV (empfohlen):
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault
- Prefix über CLI (überschreibt ENV):
python3 -m scripts.export_markdown --out ./_exportVault --prefix mindnet
- Nur bestimmte Notizen exportieren:
python3 -m scripts.export_markdown --out ./_exportVault \
--prefix mindnet \
--note-id 20250821-architektur-ki-trainerassistent-761cfe \
--note-id 20250821-personal-mind-ki-projekt-7b0d79
- Existierende Dateien überschreiben:
python3 -m scripts.export_markdown --out ./_exportVault --prefix mindnet --overwrite
Parameter
- --out PATH (Pflicht) Ziel-Verzeichnis des Export-Vaults (wird angelegt).
- --prefix TEXT (Optional) Collection-Prefix; überschreibt ENV COLLECTION_PREFIX.
- --note-id ID (Optional, mehrfach) Export auf bestimmte Note-IDs begrenzen.
- --overwrite (Optional) Bereits existierende Dateien überschreiben (default: skip).
- --dry-run (Optional) Nur anzeigen, was geschrieben würde; keine Dateien anlegen.
Änderungen ggü. v1.2
- Neues optionales CLI-Argument --prefix (ENV-Fallback bleibt).
- Robustere Qdrant-Scroll-Logik (neue Client-Signatur: (points, next_offset)).
- Verbesserte Sortierung der Chunks (seq > Nummer aus chunk_id > Fallback).
- Defensiver Umgang mit Frontmatter (nur sinnvolle Felder; Datumswerte als Strings).
"""
from __future__ import annotations
import os
import sys
import argparse import argparse
import json import json
import os
import re
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional, Tuple
from qdrant_client import QdrantClient from dotenv import load_dotenv
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client, collection_names from app.core.qdrant import QdrantConfig, get_client
# ensure_collections ist für Export nicht nötig
# -------------------------
# Hilfsfunktionen
# -------------------------
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names derived from *prefix*."""
    suffixes = ("notes", "chunks", "edges")
    notes, chunks, edges = (f"{prefix}_{suffix}" for suffix in suffixes)
    return notes, chunks, edges
def scroll_all(client, collection: str, flt: Optional[rest.Filter] = None, with_payload=True, with_vectors=False):
    """Yield every point of *collection*, paging through the Qdrant scroll API.

    Uses the newer client signature where ``client.scroll`` returns a
    ``(points, next_offset)`` tuple; iteration stops once the server reports
    no further offset.
    """
    cursor = None
    first_page = True
    while first_page or cursor is not None:
        first_page = False
        batch, cursor = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            with_payload=with_payload,
            with_vectors=with_vectors,
            limit=256,
            offset=cursor,
        )
        yield from batch
def ensure_dir(path: Path):
    """Make sure the directory that will contain *path* exists (parents included)."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
def select_frontmatter(note_pl: Dict) -> Dict:
    """Reduce a note payload to the frontmatter fields used in the vault.

    Required by the schema: id (taken from ``note_id``, falling back to the
    legacy ``id`` key), title, type, status, created, path.
    Optional: ``updated`` (only when present), ``tags`` (only when non-empty).
    """
    # Backward-compat: some payloads store the identifier under 'id'.
    fm: Dict = {"id": note_pl.get("note_id") or note_pl.get("id")}
    for field in ("title", "type", "status", "created", "path"):
        fm[field] = note_pl.get(field)
    updated = note_pl.get("updated")
    if updated is not None:
        fm["updated"] = updated
    tags = note_pl.get("tags")
    if tags:
        fm["tags"] = tags
    return fm
def yaml_block(frontmatter: Dict) -> str:
    """Serialise *frontmatter* as a minimal YAML block (no external deps).

    Assumes values are strings or lists of strings; dates must already be
    strings. ``None`` values are dropped. Lists use two-space indented
    ``- item`` notation.

    Fix vs. previous version: scalars containing YAML-special characters are
    quoted via ``json.dumps`` so embedded double quotes and backslashes are
    escaped — the old ``f'"{s}"'`` quoting produced invalid YAML for them.
    """
    lines = ["---"]
    for key, value in frontmatter.items():
        if value is None:
            continue
        if isinstance(value, list):
            lines.append(f"{key}:")
            for item in value:
                lines.append(f"  - {item}")
        else:
            s = str(value)
            if any(ch in s for ch in ':-#{}[],"\\'):
                # json.dumps both quotes and escapes; valid YAML scalar.
                lines.append(f"{key}: {json.dumps(s, ensure_ascii=False)}")
            else:
                lines.append(f"{key}: {s}")
    lines.append("---")
    return "\n".join(lines)
def chunk_sort_key(pl: Dict) -> Tuple[int, int]:
    """Stable ordering key for a chunk payload.

    Priority:
      1) ``(0, seq)`` when an integer ``seq`` is present (always sorts first);
      2) ``(1, n)`` with *n* parsed from the ``#<n>`` suffix of
         ``chunk_id`` (fallback ``id``);
      3) ``(1, 0)`` when nothing usable is found.

    Fix vs. previous version: ids are coerced with ``str()`` before the ``#``
    search — a non-string id (e.g. a numeric point id) used to raise
    ``TypeError``. ``bool`` is excluded from the ``seq`` fast-path since
    ``True``/``False`` are not meaningful sequence numbers.
    """
    seq = pl.get("seq")
    if isinstance(seq, int) and not isinstance(seq, bool):
        return (0, seq)
    cid = str(pl.get("chunk_id") or pl.get("id") or "")
    n = 0
    if "#" in cid:
        try:
            n = int(cid.split("#", 1)[1])
        except ValueError:
            n = 0
    return (1, n)
def reconstruct_body(chunk_payloads: List[Dict]) -> str:
    """Rebuild a note body from its chunk payloads.

    Chunks are ordered via :func:`chunk_sort_key`; the text of each chunk is
    taken from ``text`` (fallback ``raw``). Chunks are joined with a blank
    line and the result always ends in exactly one trailing newline.
    """
    ordered = sorted(chunk_payloads, key=chunk_sort_key)
    parts = [(pl.get("text") or pl.get("raw") or "").rstrip("\n") for pl in ordered]
    return "\n\n".join(parts).rstrip() + "\n"
def safe_write(out_path: Path, content: str, overwrite: bool) -> str:
    """Write *content* to *out_path* as UTF-8.

    Returns ``"skip"`` when the file already exists and *overwrite* is False,
    otherwise creates the parent directory as needed, writes the file and
    returns ``"write"``.
    """
    if out_path.exists() and not overwrite:
        return "skip"
    # Create the containing directory (inlined ensure_dir behaviour).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(content, encoding="utf-8")
    return "write"
def fetch_note_chunks(client, chunks_col: str, note_id: str) -> List[Dict]:
    """Collect the payloads of every chunk belonging to *note_id*."""
    note_match = rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
    flt = rest.Filter(must=[note_match])
    return [
        point.payload
        for point in scroll_all(client, chunks_col, flt, with_payload=True, with_vectors=False)
        if point.payload
    ]
def make_export_path(export_root: Path, note_pl: Dict) -> Path:
    """Map a note payload to its destination file below *export_root*.

    Prefers the payload's ``path``; otherwise falls back to ``<title>.md``
    (or ``Note.md`` when there is no title). Windows backslashes are
    normalised to ``/``.

    Fix vs. previous version: empty, ``.`` and ``..`` path segments are
    dropped so a stored path can never escape the export root
    (path-traversal guard).
    """
    rel = (note_pl.get("path") or f"{note_pl.get('title') or 'Note'}.md").strip("/")
    rel = rel.replace("\\", "/")
    safe_parts = [seg for seg in rel.split("/") if seg not in ("", ".", "..")]
    return export_root.joinpath(*safe_parts)
# -------------------------
# Main
# -------------------------
def main():
    """CLI entry point: export notes (plus chunk-reconstructed bodies) from Qdrant to Markdown.

    Reads Qdrant connection settings from the environment (via .env); the
    optional ``--prefix`` argument overrides ``COLLECTION_PREFIX``. Prints one
    JSON line per note and a final JSON summary.

    Fix vs. previous version: notes whose payload has neither ``note_id`` nor
    ``id`` are reported and skipped instead of passing ``None`` into the chunk
    filter (which would abort the whole export).
    """
    load_dotenv()

    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Ziel-Verzeichnis für den Export-Vault")
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
    ap.add_argument("--note-id", action="append", help="Nur bestimmte Note-ID exportieren (mehrfach möglich)")
    ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    ap.add_argument("--dry-run", action="store_true", help="Nur anzeigen, keine Dateien schreiben")
    args = ap.parse_args()

    # Qdrant configuration: ENV first, CLI --prefix takes precedence.
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    client = get_client(cfg)
    notes_col, chunks_col, _ = collections(cfg.prefix)

    # Optional restriction to explicitly requested note ids.
    note_filter: Optional[rest.Filter] = None
    if args.note_id:
        should = [
            rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
            for nid in args.note_id
        ]
        note_filter = rest.Filter(should=should)

    export_root = Path(args.out).resolve()
    export_root.mkdir(parents=True, exist_ok=True)

    total = 0
    written = 0
    skipped = 0
    for point in scroll_all(client, notes_col, note_filter, with_payload=True, with_vectors=False):
        pl = point.payload or {}
        note_id = pl.get("note_id") or pl.get("id")
        title = pl.get("title")
        total += 1

        if not note_id:
            # Defensive: without an id we cannot fetch chunks; report and move on.
            skipped += 1
            print(json.dumps({"note_id": None, "title": title, "decision": "skip-no-id"},
                             ensure_ascii=False))
            continue

        # Frontmatter + body reconstructed from the note's chunks.
        fm_yaml = yaml_block(select_frontmatter(pl))
        chunks = fetch_note_chunks(client, chunks_col, note_id)
        content = f"{fm_yaml}\n{reconstruct_body(chunks)}"
        out_path = make_export_path(export_root, pl)

        decision = "dry-run"
        if not args.dry_run:
            decision = safe_write(out_path, content, args.overwrite)
        if decision == "write":
            written += 1
        elif decision == "skip":
            skipped += 1

        print(json.dumps({
            "note_id": note_id,
            "title": title,
            "file": str(out_path),
            "chunks": len(chunks),
            "decision": decision
        }, ensure_ascii=False))

    print(json.dumps({
        "summary": {
            "notes_total": total,
            "written": written,
            "skipped": skipped,
            "out_dir": str(export_root)
        }
    }, ensure_ascii=False))
if __name__ == "__main__": if __name__ == "__main__":