scripts/export_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-09 11:12:52 +02:00
parent 897d0c9e6d
commit 47797ecd29

View File

@ -1,58 +1,43 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: export_markdown.py — Qdrant → Markdown (Obsidian-kompatibel)
Version: 1.4.0
Datum: 2025-09-09

Kurzbeschreibung
----------------
Exportiert Markdown-Notizen aus Qdrant in einen Zielordner (Vault).
Rekonstruiert YAML-Frontmatter und Body.

Body-Rekonstruktions-Priorität (abwärtskompatibel):
1) notes.payload.fulltext (verlustfrei, wenn beim Import gespeichert)
2) ansonsten aus allen zugehörigen Chunks: payload.text → payload.content → payload.raw
   in stabiler, sequentieller Reihenfolge (seq / chunk_index / ID-Nummer)

Wichtige Fixes
--------------
- **Pfad-Normalisierung**: erzwingt relative Pfade (führende '/' entfernen,
  Backslashes → Slashes), damit ``--out`` nicht ignoriert wird.
- **``--prefix`` (optional)**: überschreibt COLLECTION_PREFIX; ENV bleibt
  Default (rückwärtskompatibel).

ENV / Qdrant
------------
- QDRANT_URL (oder QDRANT_HOST/QDRANT_PORT)
- QDRANT_API_KEY (optional)
- COLLECTION_PREFIX (Default: mindnet)

Aufruf
------
python3 -m scripts.export_markdown --out ./_exportVault
python3 -m scripts.export_markdown --out ./_exportVault --note-id 20250821-foo
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
python3 -m scripts.export_markdown --out ./_exportVault --prefix mindnet_dev  # optional

Beispiele
---------
COLLECTION_PREFIX=mindnet QDRANT_URL=http://127.0.0.1:6333 \\
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
"""
from __future__ import annotations

import argparse
import json
import os
import re
from typing import Dict, Iterable, List, Optional, Tuple

import yaml
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, ensure_collections, get_client
# -----------------------------------------------------------------------------
# Utilities
# -----------------------------------------------------------------------------
def _names(prefix: str) -> Tuple[str, str, str]: def _names(prefix: str) -> Tuple[str, str, str]:
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
@ -80,73 +69,81 @@ def _ensure_dir(path: str) -> None:
os.makedirs(d, exist_ok=True) os.makedirs(d, exist_ok=True)
def _normalize_rel_path(p: str) -> str:
"""Pfad relativ halten & normalisieren (slashes, führende / entfernen)."""
p = (p or "").replace("\\", "/")
return p.lstrip("/")
def _to_md(frontmatter: dict, body: str) -> str:
    """Render YAML frontmatter plus body as one Markdown document.

    ``yaml.safe_dump`` keeps strings as strings, preserves unicode
    (allow_unicode) and the insertion order of keys (sort_keys=False).
    A ``None`` or empty body still yields a file ending in exactly one
    trailing newline.
    """
    fm = yaml.safe_dump(frontmatter, sort_keys=False, allow_unicode=True).strip()
    return f"---\n{fm}\n---\n{(body or '').rstrip()}\n"
def _scroll_all( def _scroll_all(
client: QdrantClient, client: QdrantClient,
collection: str, col: str,
flt: Optional[rest.Filter] = None, flt: Optional[rest.Filter] = None,
with_payload: bool = True, with_payload: bool = True,
with_vectors: bool = False, with_vectors: bool = False,
limit: int = 256, limit: int = 256,
) -> List: ):
"""Holt *alle* Punkte via Scroll (QdrantClient.scroll liefert (points, next_offset)).""" """Scrollt durch alle Punkte einer Collection und liefert eine Liste mit Points."""
pts_all = [] out = []
next_offset = None next_page = None
while True: while True:
points, next_offset = client.scroll( pts, next_page = client.scroll(
collection_name=collection, collection_name=col,
scroll_filter=flt, scroll_filter=flt,
with_payload=with_payload, with_payload=with_payload,
with_vectors=with_vectors, with_vectors=with_vectors,
limit=limit, limit=limit,
offset=next_offset, offset=next_page,
) )
pts_all.extend(points or []) if not pts:
if not next_offset:
break break
return pts_all out.extend(pts)
if not next_page:
break
return out
def _load_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[dict]:
    """Load all chunk payloads belonging to *note_id*, in stable order.

    Sort-key precedence: ``payload['seq']``, then ``payload['chunk_index']``,
    then the numeric suffix of the chunk id (both ``...#2`` and ``...#c02``
    forms are accepted; the id is taken from ``chunk_id`` first, falling
    back to ``id``).
    """
    flt = rest.Filter(must=[rest.FieldCondition(
        key="note_id",
        match=rest.MatchValue(value=note_id),
    )])
    pts = _scroll_all(client, chunks_col, flt, with_payload=True, with_vectors=False)

    def _sort_key(pl: dict) -> Tuple[int, int, int]:
        seq = pl.get("seq", pl.get("chunk_index", -1))
        idx = pl.get("chunk_index", -1)
        # numeric tail of "noteid#<n>" / "noteid#c<n>"; note the raw-string
        # pattern must be \d, not \\d (which would match a literal backslash)
        tail = 0
        m = re.search(r"#c?(\d+)$", str(pl.get("chunk_id") or pl.get("id") or ""))
        if m:
            tail = int(m.group(1))
        return (
            seq if isinstance(seq, int) else -1,
            idx if isinstance(idx, int) else -1,
            tail,
        )

    pts_sorted = sorted(pts, key=lambda p: _sort_key(p.payload or {}))
    return [p.payload or {} for p in pts_sorted]
def _join_chunk_texts(chunks_payloads: List[dict]) -> str: def _reconstruct_body(note_pl: dict, chunk_payloads: List[dict]) -> str:
"""Nimmt die sortierten Chunk-Payloads und baut den Body zusammen.""" # 1) Volltext vorhanden?
fulltext = note_pl.get("fulltext")
if isinstance(fulltext, str) and fulltext.strip():
return fulltext
# 2) Aus Chunks zusammensetzen: text → content → raw
parts: List[str] = [] parts: List[str] = []
for pl in chunks_payloads: for ch in chunk_payloads:
txt = pl.get("text") or pl.get("content") or pl.get("raw") or "" text = ch.get("text") or ch.get("content") or ch.get("raw")
if txt: if isinstance(text, str) and text.strip():
parts.append(txt.rstrip()) parts.append(text.rstrip())
# Doppelte Leerzeile zwischen Chunks in Markdown meist ein guter Standard
return ("\n\n".join(parts)).rstrip() + ("\n" if parts else "") return ("\n\n".join(parts)).rstrip() + ("\n" if parts else "")
@ -159,87 +156,75 @@ def _export_one_note(
) -> dict: ) -> dict:
notes_col, chunks_col, _ = _names(prefix) notes_col, chunks_col, _ = _names(prefix)
note_id = note_pl.get("note_id") or note_pl.get("id") note_id = note_pl.get("note_id") or note_pl.get("id")
path = note_pl.get("path") or f"{note_id}.md"
# Zielpfad relativ zu out_root # Pfad robust bestimmen und relativ halten
path = note_pl.get("path") or f"{note_id}.md"
path = _normalize_rel_path(path)
out_path = os.path.join(out_root, path).replace("\\", "/") out_path = os.path.join(out_root, path).replace("\\", "/")
# Frontmatter aus Payload zurückführen (nur bekannte Felder) # Frontmatter aus Payload zurückführen (nur bekannte Felder)
fm = {} fm: Dict[str, object] = {}
# Bewährte Felder zurückschreiben unbekannte Keys nicht in YAML aufnehmen
for k in [ for k in [
"title", "id", "type", "status", "created", "updated", "tags", "title", "id", "type", "status", "created", "updated", "tags",
"priority", "effort_min", "due", "people", "aliases", "priority", "effort_min", "due", "people", "aliases",
"depends_on", "assigned_to", "lang" "depends_on", "assigned_to", "lang",
]: ]:
v = note_pl.get(k) if k in note_pl else note_pl.get(f"note_{k}") # Toleranz für evtl. Namensvarianten v = note_pl.get(k) if k in note_pl else note_pl.get(f"note_{k}")
if v not in (None, [], ""): if v not in (None, [], ""):
fm[k] = v fm[k] = v
# Pflichtfelder sicherstellen # Mindestfelder
fm["id"] = fm.get("id") or note_id if "id" not in fm and note_id:
fm["title"] = fm.get("title") or note_pl.get("title") or note_id fm["id"] = note_id
fm["type"] = fm.get("type") or "concept" if "title" not in fm and note_pl.get("title"):
fm["status"] = fm.get("status") or "draft" fm["title"] = note_pl["title"]
# Body-Rekonstruktion # Body beschaffen
body = "" chunks = _load_chunks_for_note(client, chunks_col, note_id)
fulltext = note_pl.get("fulltext") body = _reconstruct_body(note_pl, chunks)
if isinstance(fulltext, str) and fulltext.strip():
body = fulltext
else:
# Chunks zur Note holen und sortieren
flt = rest.Filter(must=[rest.FieldCondition(
key="note_id",
match=rest.MatchValue(value=note_id)
)])
chunk_pts = _scroll_all(client, chunks_col, flt, with_payload=True, with_vectors=False)
# Sortierung
chunk_payloads = [p.payload or {} for p in chunk_pts]
chunk_payloads.sort(key=_chunk_sort_key)
body = _join_chunk_texts(chunk_payloads)
# Ziel schreiben # Schreiben?
if (not overwrite) and os.path.exists(out_path): if os.path.exists(out_path) and not overwrite:
return {"note_id": note_id, "path": out_path, "status": "skip_exists"} return {"note_id": note_id, "path": path, "status": "skip_exists"}
_ensure_dir(out_path) _ensure_dir(out_path)
with open(out_path, "w", encoding="utf-8") as f: with open(out_path, "w", encoding="utf-8") as f:
f.write(_to_md(fm, body)) f.write(_to_md(fm, body))
return { return {"note_id": note_id, "path": path, "status": "written"}
"note_id": note_id,
"path": out_path,
"status": "written",
"body_from": "fulltext" if isinstance(fulltext, str) and fulltext.strip() else "chunks",
"chunks_used": None if (isinstance(fulltext, str) and fulltext.strip()) else len(chunk_payloads),
}
def main(): # -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
def main() -> None:
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument("--out", required=True, help="Zielordner für den Export-Vault") ap.add_argument("--out", required=True, help="Zielordner für den Export-Vault")
ap.add_argument("--note-id", help="Nur eine Note exportieren (Note-ID)") ap.add_argument("--note-id", help="Nur eine Note exportieren (Note-ID)")
ap.add_argument("--overwrite", action="store_true", help="Bestehende Dateien überschreiben") ap.add_argument("--overwrite", action="store_true", help="Bestehende Dateien überschreiben")
ap.add_argument("--prefix", help="(Optional) überschreibt COLLECTION_PREFIX aus ENV")
args = ap.parse_args() args = ap.parse_args()
# Qdrant-Konfiguration # Qdrant-Konfiguration
cfg = QdrantConfig.from_env() cfg = QdrantConfig.from_env()
if args.prefix:
cfg.prefix = args.prefix # abwärtskompatibel: ENV bleibt Default
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim) ensure_collections(client, cfg.prefix, cfg.dim)
notes_col, _, _ = _names(cfg.prefix) notes_col, _, _ = _names(cfg.prefix)
# Notes holen # Notes holen (optional gefiltert)
flt = None flt = None
if args.note_id: if args.note_id:
flt = rest.Filter(must=[rest.FieldCondition( flt = rest.Filter(must=[rest.FieldCondition(
key="note_id", key="note_id",
match=rest.MatchValue(value=args.note_id) match=rest.MatchValue(value=args.note_id),
)]) )])
note_pts = _scroll_all(client, notes_col, flt, with_payload=True, with_vectors=False) note_pts = _scroll_all(client, notes_col, flt, with_payload=True, with_vectors=False)
if not note_pts: if not note_pts:
print(json.dumps({"exported": 0, "out": args.out, "message": "Keine Notes gefunden."}, ensure_ascii=False)) print(json.dumps({"exported": 0, "out": args.out, "message": "Keine Notes gefunden."}, ensure_ascii=False))
return return
@ -253,10 +238,12 @@ def main():
res = {"note_id": pl.get("note_id") or pl.get("id"), "error": str(e)} res = {"note_id": pl.get("note_id") or pl.get("id"), "error": str(e)}
results.append(res) results.append(res)
print(json.dumps({"exported": len([r for r in results if r.get('status') == 'written']), print(json.dumps({
"skipped": len([r for r in results if r.get('status') == 'skip_exists']), "exported": len([r for r in results if r.get("status") == "written"]),
"out": args.out, "skipped": len([r for r in results if r.get("status") == "skip_exists"]),
"details": results}, ensure_ascii=False)) "out": args.out,
"details": results,
}, ensure_ascii=False))
if __name__ == "__main__": if __name__ == "__main__":