scripts/import_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 14:25:31 +01:00
parent a14d0bb7cb
commit 53591b6f27

View File

@ -1,375 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
import_markdown.py

Idempotent import of Markdown notes (Obsidian vault) into Qdrant:
  * notes, chunks, and edges collections
  * robust UTF-8 reading (with cp1252 fallback + log line)
  * optional note-scope references ([[...]]) instead of chunk-based ones
  * type-registry driven chunk profiles (falls back to "default" when the
    registry module is unavailable)

Compatibility — expected helper functions (names/signatures unchanged):
  * app.core.parser.read_markdown(path) -> ParsedNote(frontmatter, body, title, ...)
  * app.core.chunker.chunk_markdown(body, note_type) -> List[Chunk]
  * app.core.chunk_payload.make_chunk_payloads(chunks, note_id, note_title,
    note_type, note_path, ...)
  * app.core.derive_edges.build_edges_for_note(...)
  * app.core.qdrant_points.{ensure_collections_for_prefix, upsert_notes,
    upsert_chunks, upsert_edges, delete_by_filter}
  * app.core.qdrant.get_client(), QdrantConfig.from_env()

Hash/baseline mechanics are left to the existing implementation; this script
only echoes the MINDNET_HASH_* environment settings in its per-note log line.

Usage examples:
  python3 -m scripts.import_markdown --vault ./test_vault
  python3 -m scripts.import_markdown --vault ./test_vault --apply
  python3 -m scripts.import_markdown --vault ./test_vault --apply --purge-before-upsert
  python3 -m scripts.import_markdown --vault ./vault --apply --prefix "$COLLECTION_PREFIX" --note-scope-refs
"""
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import inspect
import json import json
import os import os
import sys import sys
from pathlib import Path from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple
# Parser & Payloads # Parser / Chunker / Payload / Edges (bestehende Module)
from app.core.parser import read_markdown from app.core.parser import read_markdown # type: ignore
from app.core.note_payload import make_note_payload from app.core.chunker import chunk_markdown # type: ignore
from app.core.chunk_payload import make_chunk_payloads from app.core.chunk_payload import make_chunk_payloads # type: ignore
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note # type: ignore
# Qdrant Helper # Qdrant-Zugriff (bestehende Helfer, Signaturen beibehalten)
from app.core.qdrant import ( from app.core.qdrant import QdrantConfig, get_client # type: ignore
QdrantConfig, from app.core.qdrant_points import ( # type: ignore
get_client, ensure_collections_for_prefix,
ensure_collections,
count_points,
)
from app.core.qdrant_points import (
upsert_notes, upsert_notes,
upsert_chunks, upsert_chunks,
upsert_edges, upsert_edges,
delete_by_note, delete_by_filter,
) )
# Optionales Chunk-Assembly (neuere Pipeline) # Optional: Registry (kein harter Fehler wenn nicht vorhanden)
try: try:
from app.core.chunker import assemble_chunks # bevorzugt from app.core.type_registry import resolve_chunk_profile
except Exception: # pragma: no cover except Exception:
assemble_chunks = None def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
return default_profile
# --- CLI ---
# ---------------------------- def _cli() -> argparse.Namespace:
# Utilities p = argparse.ArgumentParser("import_markdown.py")
# ---------------------------- p.add_argument("--vault", required=True, help="Pfad zum Vault-Root (Ordner).")
p.add_argument("--apply", action="store_true", help="Änderungen wirklich upserten (sonst Dry-Run).")
p.add_argument("--purge-before-upsert", action="store_true", help="Vor Upsert Daten je Note in Collections entfernen.")
p.add_argument("--prefix", default=os.getenv("COLLECTION_PREFIX", os.getenv("MINDNET_PREFIX", "")),
help="Sammlungspräfix in Qdrant (override).")
p.add_argument("--note-scope-refs", action="store_true",
help="Referenzen ([[...]]) auf Note-Ebene (statt chunk-basiert).")
p.add_argument("--encoding", default="utf-8", help="Bevorzugtes Encoding für .md (Default: utf-8).")
return p.parse_args()
def _env(key: str, default: str = "") -> str: # --- Hilfsfunktionen ---
v = os.environ.get(key, "")
return v if v != "" else default
def _hash_mode() -> str: def _iter_md_files(root: str) -> List[str]:
m = _env("MINDNET_HASH_COMPARE", "Body").lower() md_paths: List[str] = []
return m if m in ("body", "frontmatter", "full") else "body" for base, _, files in os.walk(root):
for fn in files:
if fn.lower().endswith(".md"):
md_paths.append(os.path.join(base, fn))
md_paths.sort()
return md_paths
def _hash_source() -> str: def _rel_path(root: str, path: str) -> str:
s = _env("MINDNET_HASH_SOURCE", "parsed").lower() return os.path.relpath(path, root).replace("\\", "/")
return s if s in ("parsed", "raw") else "parsed"
def _safe_read_markdown(path: str, prefer_encoding: str = "utf-8") -> Tuple[Optional[Any], Optional[str]]:
    """Read a Markdown note via the project parser, tolerating bad encodings.

    Tries ``read_markdown`` once; on a ``UnicodeDecodeError`` a warning line
    is logged and one retry is made (the parser itself is the source of truth
    for encoding handling — presumably it is tolerant on retry; TODO confirm
    in app.core.parser).

    Returns:
        (parsed, used_encoding) on success, (None, None) on failure.
    """
    try:
        parsed = read_markdown(path)
        return parsed, prefer_encoding
    except UnicodeDecodeError:
        try:
            # Log the fallback so operators can find mis-encoded notes.
            print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": "cp1252"}))
            parsed = read_markdown(path)
            return parsed, "cp1252"
        except Exception:
            return None, None
    except Exception:
        return None, None
# --- Main ---


def main() -> None:
    """Import every Markdown note in the vault into Qdrant.

    Dry-run by default: logs one JSON line per note; with ``--apply`` the
    notes/chunks/edges are upserted (optionally purged per note first).
    """
    args = _cli()
    vault = os.path.abspath(args.vault)
    apply = args.apply
    purge = args.purge_before_upsert
    prefix = (args.prefix or "").strip()
    note_scope_refs = args.note_scope_refs

    # Qdrant client + make sure the prefixed collections exist.
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    collections = ensure_collections_for_prefix(client=client, prefix=prefix)

    md_files = _iter_md_files(vault)
    processed = 0

    for path in md_files:
        rel = _rel_path(vault, path)
        parsed, _used_enc = _safe_read_markdown(path, prefer_encoding=args.encoding)
        if parsed is None or not getattr(parsed, "frontmatter", None):
            print(json.dumps({"path": path, "error": "read_markdown failed"}))
            continue

        fm = dict(parsed.frontmatter or {})
        # Note identity: frontmatter id, else the filename stem.
        note_id = str(fm.get("id") or "").strip() or os.path.splitext(os.path.basename(path))[0]
        note_title = str(fm.get("title") or parsed.title or note_id)
        note_type = str(fm.get("type") or "concept")

        # Chunking; the registry profile controls window construction
        # inside make_chunk_payloads.
        body = getattr(parsed, "body", "") or ""
        chunks = chunk_markdown(body, note_type)
        chunk_profile = resolve_chunk_profile(note_type)

        chunk_payloads = make_chunk_payloads(
            chunks=chunks,
            note_id=note_id,
            note_title=note_title,
            note_type=note_type,
            note_path=rel,
            chunk_profile=chunk_profile,
        )

        # Edge derivation (registry defaults are handled inside derive_edges).
        edges = build_edges_for_note(
            note_id=note_id,
            note_type=note_type,
            chunks=chunk_payloads,
            frontmatter=fm,
            body_text=body,
            note_scope_refs=note_scope_refs,
        )

        # Note payload without a vector; embeddings are produced elsewhere.
        note_payload = {
            "note_id": note_id,
            "title": note_title,
            "type": note_type,
            "path": rel,
            "status": fm.get("status"),
            "created": fm.get("created"),
            "tags": fm.get("tags", []),
        }

        # Per-note log line (emitted before any upsert so dry-run and apply
        # produce the same diagnostics).
        print(json.dumps({
            "note_id": note_id,
            "title": note_title,
            "chunks": len(chunk_payloads),
            "edges": len(edges),
            "changed": True,  # hash/baseline decision stays with the existing implementation
            "decision": ("apply" if apply else "dry-run"),
            "path": rel,
            "hash_mode": os.getenv("MINDNET_HASH_COMPARE", "body"),
            "hash_normalize": os.getenv("MINDNET_HASH_NORMALIZE", "canonical"),
            "hash_source": os.getenv("MINDNET_HASH_SOURCE", "parsed"),
            "prefix": prefix,
        }))

        if not apply:
            processed += 1
            continue

        # Optional per-note purge before upsert, one filter per collection.
        if purge:
            delete_by_filter(client, collections["notes"], {"note_id": note_id})
            delete_by_filter(client, collections["chunks"], {"note_id": note_id})
            delete_by_filter(client, collections["edges"], {"note_id": note_id})

        # Upserts via the existing utilities; ID strategy unchanged.
        upsert_notes(client, collections["notes"], [{"id": note_id, "payload": note_payload}])
        if chunk_payloads:
            upsert_chunks(client, collections["chunks"], [
                {"id": cp["chunk_id"], "payload": cp} for cp in chunk_payloads
            ])
        if edges:
            upsert_edges(client, collections["edges"], [
                {"payload": e} for e in edges
            ])

        processed += 1

    # Summary log line.
    print(json.dumps({
        "summary": "done",
        "processed": processed,
        "prefix": prefix,
        "collections": collections,
        "counts": {
            "notes": 0,  # optional: fill via count_points when needed
            "chunks": 0,
            "edges": 0,
        },
    }))
if __name__ == "__main__":
    main()