scripts/export_markdown.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
b394cadf73
commit
1b833f76ce
204
scripts/export_markdown.py
Normal file
204
scripts/export_markdown.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
# scripts/export_markdown.py
# -----------------------------------------------------------------------------
# Name: export_markdown.py
# Version: 1.0.0 (2025-09-08)
# Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien.
#
# Was es macht:
# - Holt Notes aus Qdrant (alle oder gefiltert per --note-id).
# - Holt zugehörige Chunks (nach seq sortiert).
# - Baut Markdown mit YAML-Frontmatter + Body (aus Chunks zusammengeführt).
# - Schreibt Dateien unter --out (Verzeichnis wird angelegt).
# - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert.
#
# Aufruf:
#   # alle Notes exportieren:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export
#
#   # nur bestimmte Note-IDs exportieren:
#   python3 -m scripts.export_markdown --prefix mindnet --out ./_export \
#       --note-id 20250821-architektur-ki-trainerassistent-761cfe \
#       --note-id 20250821-personal-mind-ki-projekt-7b0d79
#
# Parameter:
#   --prefix    : Collections-Präfix (Default: mindnet)
#   --out       : Zielverzeichnis (wird erstellt)
#   --note-id   : Kann mehrfach angegeben werden; dann nur diese Notes
#   --overwrite : Existierende Dateien überschreiben (sonst überspringen)
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client, collection_names
|
||||
|
||||
|
||||
def to_yaml_frontmatter(fm: Dict) -> str:
    """Render a note payload dict as a YAML frontmatter block (simple, stable)."""
    # Fixed key order keeps exports deterministic and diff-friendly.
    key_order = (
        "id", "note_id", "title", "type", "status",
        "created", "updated", "path", "tags",
        "area", "project", "source", "lang", "slug",
    )
    data = dict(fm)
    # Mirror note_id into id when the payload only carries note_id.
    if "id" not in data and "note_id" in data:
        data["id"] = data["note_id"]

    out: List[str] = ["---"]
    for key in key_order:
        value = data.get(key)
        if value is None:
            # Missing keys and explicit None are both omitted.
            continue
        if isinstance(value, list):
            # Inline flow-style YAML list; json.dumps quotes each element.
            items = ", ".join(json.dumps(item, ensure_ascii=False) for item in value)
            out.append(f"{key}: [{items}]")
        else:
            # json.dumps yields valid YAML scalars (JSON is a YAML subset).
            out.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
    out.append("---")
    return "\n".join(out)
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Make *name* safe to use as a single file-name component."""
    # Slashes would create unintended subdirectories; map them to dashes,
    # then collapse any run of whitespace into a single space.
    cleaned = name.strip().replace("/", "-")
    return re.sub(r"\s+", " ", cleaned)
|
||||
|
||||
|
||||
def choose_output_path(out_dir: Path, fm: Dict) -> Path:
    """Pick the export file path for a note payload.

    Prefers the relative path stored in ``payload["path"]``; otherwise derives
    a file name from the title (or note_id). The result always stays inside
    *out_dir*.
    """
    # 1) prefer payload.path — but only when it cannot escape out_dir
    raw_path = fm.get("path")
    if raw_path:
        rel = Path(str(raw_path))
        # Guard against path traversal: an absolute path or ".." segments in
        # the payload would otherwise let the export write outside of out_dir
        # (Path.joinpath with an absolute argument discards out_dir entirely).
        if not rel.is_absolute() and ".." not in rel.parts:
            return out_dir.joinpath(rel)
    # 2) otherwise derive a sensible name from title (or note_id)
    base = fm.get("title") or fm.get("note_id") or "note"
    fname = sanitize_filename(str(base)) + ".md"
    return out_dir.joinpath(fname)
|
||||
|
||||
|
||||
def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]) -> List[Dict]:
    """Scroll through all notes in *notes_col*, optionally restricted to *only_ids*.

    Returns the raw payload dicts; vectors are never fetched.
    """
    note_filter = None
    if only_ids:
        # "should" gives OR semantics: note_id must match any requested id.
        note_filter = rest.Filter(
            should=[
                rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
                for nid in only_ids
            ]
        )

    payloads: List[Dict] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            scroll_filter=note_filter,
            offset=cursor,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(pt.payload for pt in points if pt.payload)
        # A None next-page offset signals the end of the scroll.
        if cursor is None:
            break
    return payloads
|
||||
|
||||
|
||||
def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[Dict]:
    """Collect every chunk payload belonging to *note_id*, ordered by seq."""
    chunk_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=chunks_col,
            scroll_filter=chunk_filter,
            offset=cursor,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        payloads.extend(pt.payload for pt in points if pt.payload)
        # A None next-page offset signals the end of the scroll.
        if cursor is None:
            break

    # Restore document order; chunks without a seq sort first (treated as 0).
    payloads.sort(key=lambda p: p.get("seq", 0))
    return payloads
|
||||
|
||||
|
||||
def assemble_body_from_chunks(chunks: List[Dict]) -> str:
    """Join chunk texts into one Markdown body, blank-line separated.

    Always ends with exactly one trailing newline.
    """
    # Missing or falsy "text" values become empty segments.
    segments = [str(ch.get("text") or "") for ch in chunks]
    return "\n\n".join(segments).rstrip() + "\n"
|
||||
|
||||
|
||||
def write_note_as_markdown(out_dir: Path, note_payload: Dict, chunks: List[Dict], overwrite: bool) -> Path:
    """Write one note (YAML frontmatter + chunk body) below *out_dir*.

    Returns the target path; existing files are left untouched unless
    *overwrite* is set.
    """
    target = choose_output_path(out_dir, note_payload)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Skip (but still report) files that already exist.
    if target.exists() and not overwrite:
        return target

    document = f"{to_yaml_frontmatter(note_payload)}\n{assemble_body_from_chunks(chunks)}"
    target.write_text(document, encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def main():
    """CLI entry point: export notes + chunks from Qdrant as Markdown files."""
    parser = argparse.ArgumentParser(description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien.")
    parser.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet)")
    parser.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
    parser.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
    parser.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    args = parser.parse_args()

    target_dir = Path(args.out).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    # Connection settings come from ENV / defaults, see app/core/qdrant.py.
    client = get_client(QdrantConfig())
    cols = collection_names(args.prefix)

    notes = fetch_all_notes(client, cols["notes"], args.note_id)
    if not notes:
        print("Keine Notes in Qdrant gefunden (Filter zu streng?).")
        return

    report = []
    for payload in notes:
        note_id = payload.get("note_id") or payload.get("id")
        note_chunks = fetch_chunks_for_note(client, cols["chunks"], note_id=str(note_id))
        written = write_note_as_markdown(target_dir, payload, note_chunks, overwrite=args.overwrite)
        report.append({"note_id": note_id, "path": str(written)})

    # Machine-readable summary for callers/CI.
    print(json.dumps({"exported": report}, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
# Script entry: run via `python3 -m scripts.export_markdown ...`.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user