scripts/export_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
af5d0a0c91
commit
aa7d0190c8
|
|
@ -1,7 +1,7 @@
|
||||||
# scripts/export_markdown.py
|
# scripts/export_markdown.py
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Name: export_markdown.py
|
# Name: export_markdown.py
|
||||||
# Version: 1.0.0 (2025-09-08)
|
# Version: 1.0.1 (2025-09-08)
|
||||||
# Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien.
|
# Zweck: Exportiert Notes + Chunks aus Qdrant zurück in Markdown-Dateien.
|
||||||
#
|
#
|
||||||
# Was es macht:
|
# Was es macht:
|
||||||
|
|
@ -11,26 +11,36 @@
|
||||||
# - Schreibt Dateien unter --out (Verzeichnis wird angelegt).
|
# - Schreibt Dateien unter --out (Verzeichnis wird angelegt).
|
||||||
# - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert.
|
# - Verwendet, falls vorhanden, den Pfad aus payload.path; sonst Titel-basiert.
|
||||||
#
|
#
|
||||||
# Aufruf:
|
# Aufruf (im venv):
|
||||||
# # alle Notes exportieren:
|
# # alle Notes exportieren (Prefix wird aus ENV COLLECTION_PREFIX gelesen):
|
||||||
# python3 -m scripts.export_markdown --prefix mindnet --out ./_export
|
# python3 -m scripts.export_markdown --out ./_export
|
||||||
|
#
|
||||||
|
# # Prefix explizit per ENV überschreiben:
|
||||||
|
# COLLECTION_PREFIX=mindnet python3 -m scripts.export_markdown --out ./_export
|
||||||
#
|
#
|
||||||
# # nur bestimmte Note-IDs exportieren:
|
# # nur bestimmte Note-IDs exportieren:
|
||||||
# python3 -m scripts.export_markdown --prefix mindnet --out ./_export \
|
# python3 -m scripts.export_markdown --out ./_export \
|
||||||
# --note-id 20250821-architektur-ki-trainerassistent-761cfe \
|
# --note-id 20250821-architektur-ki-trainerassistent-761cfe \
|
||||||
# --note-id 20250821-personal-mind-ki-projekt-7b0d79
|
# --note-id 20250821-personal-mind-ki-projekt-7b0d79
|
||||||
#
|
#
|
||||||
# Parameter:
|
# Parameter:
|
||||||
# --prefix : Collections-Präfix (Default: mindnet)
|
# --out : Zielverzeichnis (wird erstellt, Pflicht)
|
||||||
# --out : Zielverzeichnis (wird erstellt)
|
|
||||||
# --note-id : Kann mehrfach angegeben werden; dann nur diese Notes
|
# --note-id : Kann mehrfach angegeben werden; dann nur diese Notes
|
||||||
# --overwrite : Existierende Dateien überschreiben (sonst überspringen)
|
# --overwrite : Existierende Dateien überschreiben (sonst überspringen)
|
||||||
#
|
#
|
||||||
|
# Umgebung:
|
||||||
|
# QDRANT_URL (z. B. http://127.0.0.1:6333)
|
||||||
|
# QDRANT_API_KEY (optional)
|
||||||
|
# COLLECTION_PREFIX (Default in app/core/qdrant.py: "mindnet")
|
||||||
|
# VECTOR_DIM (Default in app/core/qdrant.py: 384)
|
||||||
|
#
|
||||||
# Voraussetzungen:
|
# Voraussetzungen:
|
||||||
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
|
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
|
||||||
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
|
# - Qdrant läuft (oder URL/API-Key in ENV), siehe app/core/qdrant.py
|
||||||
#
|
#
|
||||||
# Änderungen:
|
# Änderungen:
|
||||||
|
# - 1.0.1: Nutzt QdrantConfig.from_env() ohne Parameter; liest Prefix aus ENV.
|
||||||
|
# Passt collection_names()-Nutzung (Tupel) korrekt an.
|
||||||
# - 1.0.0: Erster Release.
|
# - 1.0.0: Erster Release.
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
@ -39,7 +49,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from qdrant_client import QdrantClient
|
from qdrant_client import QdrantClient
|
||||||
from qdrant_client.http import models as rest
|
from qdrant_client.http import models as rest
|
||||||
|
|
@ -49,14 +59,12 @@ from app.core.qdrant import QdrantConfig, get_client, collection_names
|
||||||
|
|
||||||
def to_yaml_frontmatter(fm: Dict) -> str:
|
def to_yaml_frontmatter(fm: Dict) -> str:
|
||||||
"""Serialisiert ein Python-Dict als YAML-Frontmatter (einfach, stabil)."""
|
"""Serialisiert ein Python-Dict als YAML-Frontmatter (einfach, stabil)."""
|
||||||
# Nur bekannte Felder in definierter Reihenfolge
|
|
||||||
ordered_keys = [
|
ordered_keys = [
|
||||||
"id", "note_id", "title", "type", "status",
|
"id", "note_id", "title", "type", "status",
|
||||||
"created", "updated", "path", "tags",
|
"created", "updated", "path", "tags",
|
||||||
"area", "project", "source", "lang", "slug",
|
"area", "project", "source", "lang", "slug",
|
||||||
]
|
]
|
||||||
lines: List[str] = ["---"]
|
lines: List[str] = ["---"]
|
||||||
# normiere: id-Feld (falls nur note_id existiert)
|
|
||||||
m = dict(fm)
|
m = dict(fm)
|
||||||
if "id" not in m and "note_id" in m:
|
if "id" not in m and "note_id" in m:
|
||||||
m["id"] = m["note_id"]
|
m["id"] = m["note_id"]
|
||||||
|
|
@ -89,17 +97,13 @@ def choose_output_path(out_dir: Path, fm: Dict) -> Path:
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]) -> List[Dict]:
|
def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[List[str]]) -> List[Dict]:
|
||||||
"""scrollt alle Notes (optional gefiltert). Rückgabe: List[Payload-Dicts]."""
|
"""Scrollt alle Notes (optional gefiltert). Rückgabe: List[Payload-Dicts]."""
|
||||||
results: List[Dict] = []
|
results: List[Dict] = []
|
||||||
offset = None
|
offset = None
|
||||||
flt = None
|
flt = None
|
||||||
if only_ids:
|
if only_ids:
|
||||||
# Filter: note_id IN [...]
|
|
||||||
flt = rest.Filter(
|
flt = rest.Filter(
|
||||||
should=[
|
should=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid)) for nid in only_ids]
|
||||||
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))
|
|
||||||
for nid in only_ids
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -123,9 +127,7 @@ def fetch_all_notes(client: QdrantClient, notes_col: str, only_ids: Optional[Lis
|
||||||
def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[Dict]:
|
def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -> List[Dict]:
|
||||||
res: List[Dict] = []
|
res: List[Dict] = []
|
||||||
offset = None
|
offset = None
|
||||||
flt = rest.Filter(
|
flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||||
must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
|
|
||||||
)
|
|
||||||
while True:
|
while True:
|
||||||
pts, next_offset = client.scroll(
|
pts, next_offset = client.scroll(
|
||||||
collection_name=chunks_col,
|
collection_name=chunks_col,
|
||||||
|
|
@ -141,8 +143,6 @@ def fetch_chunks_for_note(client: QdrantClient, chunks_col: str, note_id: str) -
|
||||||
if next_offset is None:
|
if next_offset is None:
|
||||||
break
|
break
|
||||||
offset = next_offset
|
offset = next_offset
|
||||||
|
|
||||||
# sortiere nach seq, falls vorhanden
|
|
||||||
res.sort(key=lambda x: x.get("seq", 0))
|
res.sort(key=lambda x: x.get("seq", 0))
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
@ -164,7 +164,6 @@ def write_note_as_markdown(out_dir: Path, note_payload: Dict, chunks: List[Dict]
|
||||||
|
|
||||||
frontmatter = to_yaml_frontmatter(note_payload)
|
frontmatter = to_yaml_frontmatter(note_payload)
|
||||||
body = assemble_body_from_chunks(chunks)
|
body = assemble_body_from_chunks(chunks)
|
||||||
|
|
||||||
content = f"{frontmatter}\n{body}"
|
content = f"{frontmatter}\n{body}"
|
||||||
out_path.write_text(content, encoding="utf-8")
|
out_path.write_text(content, encoding="utf-8")
|
||||||
return out_path
|
return out_path
|
||||||
|
|
@ -172,7 +171,6 @@ def write_note_as_markdown(out_dir: Path, note_payload: Dict, chunks: List[Dict]
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ap = argparse.ArgumentParser(description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien.")
|
ap = argparse.ArgumentParser(description="Exportiert Notes+Chunks aus Qdrant in Markdown-Dateien.")
|
||||||
ap.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet)")
|
|
||||||
ap.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
|
ap.add_argument("--out", required=True, help="Zielverzeichnis für exportierte .md-Dateien")
|
||||||
ap.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
|
ap.add_argument("--note-id", action="append", help="Spezifische Note-ID exportieren (mehrfach möglich)")
|
||||||
ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
|
ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
|
||||||
|
|
@ -181,21 +179,20 @@ def main():
|
||||||
out_dir = Path(args.out).resolve()
|
out_dir = Path(args.out).resolve()
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# korrekt
|
# Wichtig: Prefix & Co. kommen aus ENV via from_env()
|
||||||
cfg = QdrantConfig.from_env(prefix=args.prefix)
|
cfg = QdrantConfig.from_env()
|
||||||
client = get_client(cfg)
|
client = get_client(cfg)
|
||||||
cols = collection_names(args.prefix)
|
notes_col, chunks_col, _edges_col = collection_names(cfg.prefix)
|
||||||
|
|
||||||
|
notes = fetch_all_notes(client, notes_col, args.note_id)
|
||||||
notes = fetch_all_notes(client, cols["notes"], args.note_id)
|
|
||||||
if not notes:
|
if not notes:
|
||||||
print("Keine Notes in Qdrant gefunden (Filter zu streng?).")
|
print("Keine Notes in Qdrant gefunden (oder Filter zu streng).")
|
||||||
return
|
return
|
||||||
|
|
||||||
exported = []
|
exported = []
|
||||||
for np in notes:
|
for np in notes:
|
||||||
nid = np.get("note_id") or np.get("id")
|
nid = np.get("note_id") or np.get("id")
|
||||||
chunks = fetch_chunks_for_note(client, cols["chunks"], note_id=str(nid))
|
chunks = fetch_chunks_for_note(client, chunks_col, note_id=str(nid))
|
||||||
path = write_note_as_markdown(out_dir, np, chunks, overwrite=args.overwrite)
|
path = write_note_as_markdown(out_dir, np, chunks, overwrite=args.overwrite)
|
||||||
exported.append({"note_id": nid, "path": str(path)})
|
exported.append({"note_id": nid, "path": str(path)})
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user