scripts/export_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s

This commit is contained in:
Lars 2025-09-24 12:15:34 +02:00
parent 54197995f5
commit ada7c1797d

View File

@ -1,249 +1,226 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/export_markdown.py  Qdrant Markdown (Vault)
Version: 1.4.0
Datum: 2025-09-10

Funktion
--------
Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird
bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks
(Sortierung: seq, chunk_index, Nummer in chunk_id). Pfade werden **relativ** geschrieben.

Optionen
--------
--out PATH             Zielordner (erforderlich)
--prefix TEXT          Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX)
--note-id ID           Nur eine Note exportieren
--overwrite            Existierende Dateien überschreiben
--include-edges MODE   none|yaml|footer (Default: none)
--flatten-paths        Alle Dateien flach schreiben; Originalpfad in FM: orig_path

ENV
---
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY

Beispiele
---------
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault

# Nur eine Note, Edges als YAML-Feld 'references'
python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml

# Flach schreiben mit Überschreiben
python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite
"""
from __future__ import annotations

import argparse
import json
import os
import re
from typing import Any, Dict, Iterable, List, Optional, Tuple

import yaml
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client
from app.core.qdrant import ensure_collections  # safety
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
def collections(prefix: str) -> Tuple[str, str, str]:
    """Map a collection prefix to its (notes, chunks, edges) collection names."""
    notes, chunks, edges = (f"{prefix}_{kind}" for kind in ("notes", "chunks", "edges"))
    return notes, chunks, edges
def _norm_rel_path(path: str) -> str:
p = (path or "").replace("\\", "/").lstrip("/")
return p if p else ""
def _ensure_dir(path: str) -> None: def _ensure_dir(path: str) -> None:
d = os.path.dirname(path) d = os.path.dirname(path)
if d and not os.path.isdir(d): if d and not os.path.exists(d):
os.makedirs(d, exist_ok=True) os.makedirs(d, exist_ok=True)
def _yaml_dump(data: Dict[str, Any]) -> str:
    """Serialize *data* to YAML, keeping unicode and insertion order; trimmed."""
    dumped = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
    return dumped.strip()
def _frontmatter_block(fm: Dict[str, Any]) -> str:
    """Wrap the YAML dump of *fm* in Obsidian-style '---' frontmatter fences."""
    return "---\n" + _yaml_dump(fm) + "\n---\n"
def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
def _to_md(frontmatter: dict, body: str) -> str:
fm = yaml.safe_dump(frontmatter, sort_keys=False, allow_unicode=True).strip()
return f"---\n{fm}\n---\n{(body or '').rstrip()}\n"
def _scroll_all(
client: QdrantClient,
col: str,
flt: Optional[rest.Filter] = None,
with_payload: bool = True,
with_vectors: bool = False,
limit: int = 256,
):
"""Scrollt durch alle Punkte einer Collection und liefert eine Liste mit Points."""
out = [] out = []
next_page = None nextp = None
while True: while True:
pts, next_page = client.scroll( pts, nextp = client.scroll(
collection_name=col, collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt, scroll_filter=flt,
with_payload=with_payload, offset=nextp,
with_vectors=with_vectors,
limit=limit,
offset=next_page,
) )
if not pts: if not pts:
break break
out.extend(pts) out.extend(pts)
if not next_page: if nextp is None:
break break
return out return out
def _reconstruct_body_from_chunks(chunks: List[Any]) -> str:
def _load_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[dict]: def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]:
flt = rest.Filter(must=[rest.FieldCondition( s = pl.get("seq")
key="note_id", ci = pl.get("chunk_index")
match=rest.MatchValue(value=note_id), cid = pl.get("chunk_id") or ""
)]) n = 0
pts = _scroll_all(client, chunks_col, flt, with_payload=True, with_vectors=False) if isinstance(cid, str) and "#" in cid:
# Sortierung: bevorzugt seq → chunk_index → Nummer in id
def _seq(pl: dict) -> Tuple[int, int, int]:
s1 = pl.get("seq", pl.get("chunk_index", -1))
s2 = pl.get("chunk_index", -1)
# Nummer-Anteil aus "noteid#<n>"
s3 = 0
try: try:
m = re.search(r"#(\\d+)$", pl.get("id") or "") n = int(cid.rsplit("#", 1)[-1])
if m:
s3 = int(m.group(1))
except Exception: except Exception:
pass n = 0
return (int(s1) if isinstance(s1, int) else -1, int(s2) if isinstance(s2, int) else -1, s3) return (int(s) if isinstance(s, int) else 0,
int(ci) if isinstance(ci, int) else 0,
n)
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
texts: List[str] = []
for p in chunks_sorted:
pl = p.payload or {}
t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
if isinstance(t, str) and t:
texts.append(t)
return "\n".join(texts).strip()
pts_sorted = sorted(pts, key=lambda p: _seq(p.payload or {})) def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]:
return [p.payload or {} for p in pts_sorted] refs = []
for p in edges:
pl = p.payload or {}
if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
refs.append(pl["target_id"])
# de-dupe, preserve order
seen = set()
out = []
for r in refs:
if r not in seen:
seen.add(r)
out.append(r)
return out
# ---------------------------------------------------------------------
def _reconstruct_body(note_pl: dict, chunk_payloads: List[dict]) -> str:
# 1) Volltext vorhanden?
fulltext = note_pl.get("fulltext")
if isinstance(fulltext, str) and fulltext.strip():
return fulltext
# 2) Aus Chunks zusammensetzen: text → content → raw
parts: List[str] = []
for ch in chunk_payloads:
text = ch.get("text") or ch.get("content") or ch.get("raw")
if isinstance(text, str) and text.strip():
parts.append(text.rstrip())
return ("\n\n".join(parts)).rstrip() + ("\n" if parts else "")
def _export_one_note(
    client: QdrantClient,
    prefix: str,
    note_pl: dict,
    out_root: str,
    overwrite: bool,
) -> dict:
    """Export a single note payload as a Markdown file under *out_root*.

    Returns a small status dict: {"note_id", "path", "status"} where status
    is "written" or "skip_exists".
    """
    _, chunks_col, _ = _names(prefix)
    note_id = note_pl.get("note_id") or note_pl.get("id")

    # Resolve the target path and force it to stay relative to out_root.
    rel = _normalize_rel_path(note_pl.get("path") or f"{note_id}.md")
    out_path = os.path.join(out_root, rel).replace("\\", "/")

    # Rebuild frontmatter from the known payload fields (plain or note_-prefixed).
    known_keys = (
        "title", "id", "type", "status", "created", "updated", "tags",
        "priority", "effort_min", "due", "people", "aliases",
        "depends_on", "assigned_to", "lang",
    )
    fm: Dict[str, object] = {}
    for key in known_keys:
        value = note_pl.get(key) if key in note_pl else note_pl.get(f"note_{key}")
        if value not in (None, [], ""):
            fm[key] = value

    # Guarantee the minimal fields.
    if "id" not in fm and note_id:
        fm["id"] = note_id
    if "title" not in fm and note_pl.get("title"):
        fm["title"] = note_pl["title"]

    # Assemble the body (fulltext preferred, else chunks).
    body = _reconstruct_body(note_pl, _load_chunks_for_note(client, chunks_col, note_id))

    # Respect existing files unless overwriting was requested.
    if os.path.exists(out_path) and not overwrite:
        return {"note_id": note_id, "path": rel, "status": "skip_exists"}
    _ensure_dir(out_path)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(_to_md(fm, body))
    return {"note_id": note_id, "path": rel, "status": "written"}
# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
def main() -> None:
    """CLI entry point: export notes from Qdrant as Markdown files.

    Bug fix: argparse stores ``--include-edges`` as ``args.include_edges``;
    the previous code read ``args.include-edges``, which Python parses as
    the subtraction ``args.include - edges`` and crashes at runtime with
    AttributeError/NameError. All three occurrences are corrected.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Zielordner für exportierte Markdown-Dateien")
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
    ap.add_argument("--note-id", help="Nur eine bestimmte Note-ID exportieren")
    ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
    ap.add_argument("--include-edges", choices=["none", "yaml", "footer"], default="none",
                    help="Forward-Links mit exportieren (aus Edges oder Note-Payload)")
    ap.add_argument("--flatten-paths", action="store_true",
                    help="Alle Dateien flach schreiben (orig_path in Frontmatter)")
    args = ap.parse_args()

    # Qdrant configuration: ENV provides defaults, --prefix overrides.
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)

    out_root = os.path.abspath(args.out)
    os.makedirs(out_root, exist_ok=True)
    notes_col, chunks_col, edges_col = collections(cfg.prefix)

    # Optional filter: export only the requested note.
    flt = None
    if args.note_id:
        flt = rest.Filter(must=[rest.FieldCondition(
            key="note_id", match=rest.MatchValue(value=args.note_id))])

    notes = _scroll_all(client, notes_col, flt)
    total = 0
    for point in notes:
        pl = point.payload or {}
        nid = pl.get("note_id")

        # Target path: keep the stored relative path unless --flatten-paths.
        rel_path = _norm_rel_path(pl.get("path") or "")
        if args.flatten_paths or not rel_path:
            out_path = os.path.join(out_root, f"{(nid or 'note')}.md")
        else:
            out_path = os.path.join(out_root, rel_path)
        out_path = out_path.replace("\\", "/")
        _ensure_dir(out_path)

        # Frontmatter: 'id' first, then every known payload field that is set.
        fm_fields = ["id", "title", "type", "status", "created", "updated", "tags",
                     "area", "project", "source", "lang", "slug", "aliases"]
        fm: Dict[str, Any] = {"id": nid}
        for k in fm_fields:
            if k != "id" and k in pl and pl[k] is not None:
                fm[k] = pl[k]
        if args.flatten_paths and rel_path:
            fm["orig_path"] = rel_path  # preserve original vault location

        # Body: prefer lossless fulltext, otherwise reassemble from chunks.
        body = (pl.get("fulltext") or "").strip()
        if not body:
            flt_chunks = rest.Filter(must=[rest.FieldCondition(
                key="note_id", match=rest.MatchValue(value=nid))])
            body = _reconstruct_body_from_chunks(_scroll_all(client, chunks_col, flt_chunks))

        # Forward references (optional): note payload first, else edges collection.
        refs: List[str] = []
        if args.include_edges != "none":  # fixed: was args.include-edges
            if isinstance(pl.get("references"), list) and pl["references"]:
                refs = [r for r in pl["references"] if isinstance(r, str)]
            else:
                flt_edges = rest.Filter(must=[rest.FieldCondition(
                    key="note_id", match=rest.MatchValue(value=nid))])
                refs = _collect_forward_refs_from_edges(_scroll_all(client, edges_col, flt_edges))
            if args.include_edges == "yaml" and refs:
                fm["references"] = refs

        # Write the file (skip existing files unless --overwrite).
        if (not args.overwrite) and os.path.exists(out_path):
            print(json.dumps({"note_id": nid, "path": out_path, "decision": "skip-exists"}))
            continue
        content = _frontmatter_block(fm) + (body + "\n" if body else "")
        if args.include_edges == "footer" and refs:  # fixed: was args.include-edges
            content += "\n---\nLinks:\n" + "\n".join(f"- [[{r}]]" for r in refs) + "\n"
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(json.dumps({"note_id": nid, "path": out_path, "decision": "write"}))
        total += 1

    print(f"Done. Exported notes: {total}")
if __name__ == "__main__": if __name__ == "__main__":