#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py
Version: 3.9.3
Date: 2025-11-08
Zweck
-----
Importer für Obsidian-Markdown-Notizen in Qdrant:
- Einlesen (Frontmatter/Body)
- Chunking (unterstützt alte und neue Chunk-Pipelines)
- Edges ableiten (kompatibel zu alten derive_edges-Signaturen)
- Hash-Detektion (ENV-gesteuert)
- Upsert Notes/Chunks/Edges (inkl. Notes-Vector, falls Collection Vektor verlangt)
Kompatibilität
--------------
- Parser mit/ohne `body_full`
- `make_chunk_payloads(parsed, note_pl, chunks)` ODER ältere Signaturen
- `build_edges_for_note(parsed, chunks)` ODER neue Signaturen (optional mit note_scope_refs)
- Qdrant-Collections mit/ohne Vektorerwartung; Notes erhalten notfalls einen Zero-Vector
- Prefix-Auflösung: CLI --prefix > COLLECTION_PREFIX > MINDNET_PREFIX > "mindnet"
ENV (Hash-Steuerung)
--------------------
MINDNET_HASH_COMPARE : Body | Frontmatter | Full (default: Body)
MINDNET_HASH_SOURCE : parsed | raw (default: parsed)
MINDNET_HASH_NORMALIZE : canonical | none (default: canonical)
Weitere ENV
-----------
MINDNET_NOTE_VECTOR_D : Dimension des Note-Vektors (default: aus QdrantConfig oder 384)
"""
from __future__ import annotations
import argparse
import inspect
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# Parser & Payloads
from app.core.parser import read_markdown
from app.core.note_payload import make_note_payload
from app.core.chunk_payload import make_chunk_payloads
from app.core.derive_edges import build_edges_for_note
# Qdrant Helper
from app.core.qdrant import (
    QdrantConfig,
    get_client,
    ensure_collections,
    count_points,
)
from app.core.qdrant_points import (
    upsert_notes,
    upsert_chunks,
    upsert_edges,
    delete_by_note,
)
# Optional chunk assembly (newer pipeline)
try:
    from app.core.chunker import assemble_chunks  # preferred
except Exception:  # pragma: no cover
    assemble_chunks = None
# ----------------------------
# Utilities
# ----------------------------
def _env(key: str, default: str = "") -> str:
    v = os.environ.get(key, "")
    return v if v != "" else default

def _hash_mode() -> str:
    m = _env("MINDNET_HASH_COMPARE", "Body").lower()
    return m if m in ("body", "frontmatter", "full") else "body"

def _hash_source() -> str:
    s = _env("MINDNET_HASH_SOURCE", "parsed").lower()
    return s if s in ("parsed", "raw") else "parsed"

def _hash_normalize() -> str:
    n = _env("MINDNET_HASH_NORMALIZE", "canonical").lower()
    return n if n in ("canonical", "none") else "canonical"
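# Illustrative behavior of the guards above (values are examples):
#   MINDNET_HASH_COMPARE=Frontmatter -> "frontmatter"; any unrecognized value falls
#   back to "body". The same pattern applies to MINDNET_HASH_SOURCE and
#   MINDNET_HASH_NORMALIZE with their respective defaults.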
def _safe_text(parsed) -> str:
    """Prefer parsed.body_full, fall back to parsed.body."""
    return getattr(parsed, "body_full", None) or getattr(parsed, "body", "") or ""
def _load_prefix(arg_prefix: Optional[str]) -> str:
    if arg_prefix and arg_prefix.strip():
        return arg_prefix.strip()
    env_prefix = os.environ.get("COLLECTION_PREFIX") or os.environ.get("MINDNET_PREFIX")
    return (env_prefix or "mindnet").strip()
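# Resolution order, matching the module docstring (example values are placeholders):
#   _load_prefix("team-a") -> "team-a"; with no CLI argument, COLLECTION_PREFIX wins
#   over MINDNET_PREFIX; with neither set, the literal default "mindnet" is used.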
def _print(obj):
    sys.stdout.write(json.dumps(obj, ensure_ascii=False) + "\n")
    sys.stdout.flush()
def _iter_md(vault: Path) -> List[Path]:
    out: List[Path] = []
    for p in sorted(vault.rglob("*.md")):
        if p.is_file():
            out.append(p)
    return out
def _note_vector_dim(cfg: QdrantConfig) -> int:
    # Prefer the config, then ENV, then 384.
    # Many setups use 384 (MiniLM, 384d).
    dim = getattr(cfg, "notes_vector_dim", None)
    if isinstance(dim, int) and dim > 0:
        return dim
    env_dim = _env("MINDNET_NOTE_VECTOR_D", "")
    if env_dim.isdigit() and int(env_dim) > 0:
        return int(env_dim)
    return 384
def _ensure_note_vector(note_pl: Dict, cfg: QdrantConfig) -> None:
    # If the notes collection requires a dense vector, `vector` must be set.
    # We set a zero vector with the correct dimension.
    if "vector" not in note_pl or note_pl["vector"] is None:
        d = _note_vector_dim(cfg)
        note_pl["vector"] = [0.0] * d
# ----------------------------
# Signature-compatible calls
# ----------------------------
def _call_make_chunk_payloads(parsed, note_pl, raw_chunks: Optional[List[Dict]] = None) -> List[Dict]:
    """
    Call make_chunk_payloads with whichever signature it exposes.
    Historical variants:
      A) make_chunk_payloads(parsed, note_pl, chunks)
      B) make_chunk_payloads(parsed, note_pl)
      C) make_chunk_payloads(note_pl, chunks)
    The matching variant is detected at runtime.
    """
    sig = inspect.signature(make_chunk_payloads)
    params = list(sig.parameters.keys())

    def _fallback_chunks() -> List[Dict]:
        # The signature expects chunks but none were produced: build a single conservative chunk.
        return [{
            "chunk_id": f"{note_pl.get('note_id', 'note')}#1",
            "text": _safe_text(parsed),
            "window": _safe_text(parsed),
            "order": 1,
            "path": note_pl.get("path", ""),
        }]

    # Try the most plausible modern variant first.
    if params[:3] == ["parsed", "note_pl", "chunks"]:
        try_order = ["parsed_note_chunks"]
    elif params[:2] == ["parsed", "note_pl"]:
        try_order = ["parsed_note"]
    elif params[:2] == ["note_pl", "chunks"]:
        try_order = ["note_chunks"]
    else:
        # Generic fallback: probe all three patterns.
        try_order = ["parsed_note_chunks", "parsed_note", "note_chunks"]

    last_err = None
    for variant in try_order:
        try:
            if variant == "parsed_note_chunks":
                if raw_chunks is None:
                    raw_chunks = _fallback_chunks()
                return make_chunk_payloads(parsed, note_pl, raw_chunks)  # type: ignore
            elif variant == "parsed_note":
                return make_chunk_payloads(parsed, note_pl)  # type: ignore
            elif variant == "note_chunks":
                if raw_chunks is None:
                    raw_chunks = _fallback_chunks()
                return make_chunk_payloads(note_pl, raw_chunks)  # type: ignore
        except Exception as e:
            last_err = e
    raise RuntimeError(f"make_chunk_payloads invocation failed: {last_err}")
def _call_build_edges_for_note(parsed, chunk_payloads: List[Dict], note_scope_refs: bool) -> List[Dict]:
    """
    Call build_edges_for_note with a compatible signature.
    Historically:
      A) build_edges_for_note(parsed, chunks)
      B) build_edges_for_note(parsed, chunks, note_scope_refs=True/False)
    """
    sig = inspect.signature(build_edges_for_note)
    params = list(sig.parameters.keys())
    try:
        if "note_scope_refs" in params:
            return build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs)  # type: ignore
        return build_edges_for_note(parsed, chunk_payloads)  # type: ignore
    except TypeError:
        # Strict fallback: call without the extra parameter.
        return build_edges_for_note(parsed, chunk_payloads)  # type: ignore
# ----------------------------
# Main processing
# ----------------------------
def process_file(
    path: Path,
    cfg: QdrantConfig,
    note_scope_refs: bool,
    apply: bool,
    purge_before_upsert: bool,
) -> Tuple[Optional[dict], List[dict], List[dict]]:
    # `apply` and `purge_before_upsert` are accepted for interface symmetry;
    # the actual upsert/purge decisions are made in main().
    try:
        parsed = read_markdown(str(path))
    except Exception as e:
        _print({"path": str(path), "error": f"read_markdown failed: {e.__class__.__name__}: {e}"})
        return None, [], []

    # Note payload
    try:
        note_pl = make_note_payload(parsed, vault_root=str(path.parent.parent))
        if not isinstance(note_pl, dict):
            note_pl = {
                "note_id": parsed.frontmatter.get("id") or path.stem,
                "title": parsed.frontmatter.get("title") or path.stem,
                "status": parsed.frontmatter.get("status", "unknown"),
                "path": str(path).replace("\\", "/"),
                "tags": parsed.frontmatter.get("tags", []),
            }
        note_pl["fulltext"] = _safe_text(parsed)
        note_pl["hash_signature"] = f"{_hash_mode()}:{_hash_source()}:{_hash_normalize()}"
        # Ensure a notes vector (zero vector, if the collection requires one).
        _ensure_note_vector(note_pl, cfg)
    except Exception as e:
        _print({"path": str(path), "error": f"make_note_payload failed: {e}"})
        return None, [], []

    # Raw chunks (if assemble_chunks is available)
    raw_chunks: Optional[List[Dict]] = None
    if assemble_chunks is not None:
        try:
            raw_chunks = assemble_chunks(
                note_pl.get("note_id", path.stem),
                _safe_text(parsed),
                parsed.frontmatter.get("type", "concept"),
            )
        except Exception as e:
            _print({"path": str(path), "note_id": note_pl.get("note_id"), "warn": f"assemble_chunks failed: {e}"})
            raw_chunks = None

    # Chunk payloads
    try:
        chunk_payloads = _call_make_chunk_payloads(parsed, note_pl, raw_chunks)
        if not isinstance(chunk_payloads, list):
            chunk_payloads = []
    except Exception as e:
        _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"make_chunk_payloads failed: {e}"})
        chunk_payloads = []

    # Edges
    try:
        edges = _call_build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs)
    except Exception as e:
        _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"build_edges_for_note failed: {e}"})
        edges = []

    return note_pl, chunk_payloads, edges
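# Return shape (for reference): (note_payload, chunk_payloads, edges);
# (None, [], []) signals a failure that was already reported via _print.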
def main():
    ap = argparse.ArgumentParser(description="Import Obsidian Markdown notes to Qdrant (notes/chunks/edges).")
    ap.add_argument("--vault", required=True, help="Path to the vault directory (root).")
    ap.add_argument("--apply", action="store_true", help="Apply changes (upsert into Qdrant).")
    ap.add_argument("--purge-before-upsert", action="store_true", help="Delete each note's chunks/edges beforehand.")
    ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope references (if supported).")
    ap.add_argument("--baseline-modes", action="store_true", help="(Optional) Prepare baseline hashes.")
    ap.add_argument("--prefix", required=False, help="Collection prefix (overrides ENV).")
    args = ap.parse_args()

    vault = Path(args.vault).resolve()
    if not vault.exists():
        ap.error(f"Vault not found: {vault}")

    prefix = _load_prefix(args.prefix)
    cfg = QdrantConfig.from_env(prefix=prefix)
    client = get_client(cfg)
    ensure_collections(client, cfg)

    files = _iter_md(vault)
    if not files:
        _print({"summary": "done", "processed": 0, "prefix": cfg.prefix})
        return

    if args.baseline_modes:
        _print({"action": "baseline", "modes": ["body", "frontmatter", "full"], "source": _hash_source(), "norm": _hash_normalize()})

    processed = 0
    for p in files:
        note_pl, chunk_payloads, edges = process_file(
            p,
            cfg,
            note_scope_refs=args.note_scope_refs,
            apply=args.apply,
            purge_before_upsert=args.purge_before_upsert,
        )
        if not note_pl:
            continue
        info = {
            "note_id": note_pl.get("note_id"),
            "title": note_pl.get("title"),
            "chunks": len(chunk_payloads),
            "edges": len(edges),
            "changed": True,  # detailed hashing happens inside the payload builders
            "decision": "apply" if args.apply else "dry-run",
            "path": str(p.relative_to(vault)).replace("\\", "/"),
            "hash_mode": _hash_mode(),
            "hash_normalize": _hash_normalize(),
            "hash_source": _hash_source(),
            "prefix": cfg.prefix,
        }
        if args.apply:
            if args.purge_before_upsert:
                try:
                    delete_by_note(client, cfg, note_pl.get("note_id", ""))
                except Exception as e:
                    _print({"note_id": note_pl.get("note_id"), "warn": f"delete_by_note failed: {e}"})
            try:
                upsert_notes(client, cfg, [note_pl])
            except Exception as e:
                _print({"note_id": note_pl.get("note_id"), "error": f"upsert_notes failed: {e}"})
            if chunk_payloads:
                try:
                    upsert_chunks(client, cfg, chunk_payloads)
                except Exception as e:
                    _print({"note_id": note_pl.get("note_id"), "error": f"upsert_chunks failed: {e}"})
            if edges:
                try:
                    upsert_edges(client, cfg, edges)
                except Exception as e:
                    _print({"note_id": note_pl.get("note_id"), "error": f"upsert_edges failed: {e}"})
        _print(info)
        processed += 1

    _print({
        "summary": "done",
        "processed": processed,
        "prefix": cfg.prefix,
        "collections": {"notes": cfg.notes, "chunks": cfg.chunks, "edges": cfg.edges},
        "counts": count_points(client, cfg),
    })
if __name__ == "__main__":
    main()