scripts/import_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-08 12:48:31 +01:00
parent a39b2a6950
commit 2f9ce824a0

View File

@ -2,66 +2,60 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: scripts/import_markdown.py Script: scripts/import_markdown.py
Version: 3.9.2 Version: 3.9.3
Date: 2025-11-08 Date: 2025-11-08
Purpose Zweck
------- -----
Importer für Obsidian-Markdown-Notizen in Qdrant. Importer für Obsidian-Markdown-Notizen in Qdrant:
- Liest Frontmatter + Body - Einlesen (Frontmatter/Body)
- erzeugt Note-/Chunk-Payloads - Chunking (unterstützt alte und neue Chunk-Pipelines)
- leitet Edges ab - Edges ableiten (kompatibel zu alten derive_edges-Signaturen)
- Upsert in Qdrant (Notes, Chunks, Edges) - Hash-Detektion (ENV-gesteuert)
- Hash-basierte Änderungsdetektion (konfigurierbar via ENV) - Upsert Notes/Chunks/Edges (inkl. Notes-Vector, falls Collection Vektor verlangt)
Kompatibilität Kompatibilität
-------------- --------------
- Funktioniert mit Parsern, die NUR `body` bereitstellen (ohne `body_full`) - Parser mit/ohne `body_full`
- Unterstützt bestehende ENV-Variablen (COLLECTION_PREFIX / MINDNET_PREFIX) - `make_chunk_payloads(parsed, note_pl, chunks)` ODER ältere Signaturen
- Nutzt Wrapper aus app.core.qdrant / app.core.qdrant_points (siehe v1.8.0 / v1.7.0) - `build_edges_for_note(parsed, chunks)` ODER neue Signaturen (optional mit note_scope_refs)
- Fällt bei fehlenden neuen Funktionen auf vorhandene Defaults zurück - Qdrant-Collections mit/ohne Vektorerwartung; Notes erhalten notfalls einen Zero-Vector
- Prefix-Auflösung: CLI --prefix > COLLECTION_PREFIX > MINDNET_PREFIX > "mindnet"
Usage
-----
export COLLECTION_PREFIX="mindnet"
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert --prefix "$COLLECTION_PREFIX"
Optional flags:
--note-scope-refs : extrahiert auch note-scope References
--baseline-modes : legt Basis-Hashes für Body/Frontmatter/Full an (falls genutzt)
--dry-run / (kein --apply): zeigt nur Entscheidungen an
ENV (Hash-Steuerung) ENV (Hash-Steuerung)
-------------------- --------------------
MINDNET_HASH_COMPARE : Body | Frontmatter | Full (default: Body) MINDNET_HASH_COMPARE : Body | Frontmatter | Full (default: Body)
MINDNET_HASH_SOURCE : parsed | raw (default: parsed) MINDNET_HASH_SOURCE : parsed | raw (default: parsed)
MINDNET_HASH_NORMALIZE: canonical | none (default: canonical) MINDNET_HASH_NORMALIZE : canonical | none (default: canonical)
Weitere ENV
-----------
MINDNET_NOTE_VECTOR_D : Dimension des Note-Vektors (default: aus QdrantConfig oder 384)
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import inspect
import json import json
import os import os
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
# Core imports (bestehend) # Parser & Payloads
from app.core.parser import read_markdown from app.core.parser import read_markdown
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunk_payload import make_chunk_payloads from app.core.chunk_payload import make_chunk_payloads
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
# Qdrant Helper
from app.core.qdrant import ( from app.core.qdrant import (
QdrantConfig, QdrantConfig,
get_client, get_client,
ensure_collections, ensure_collections,
count_points, count_points,
list_note_ids,
fetch_one_note,
) )
from app.core.qdrant_points import ( from app.core.qdrant_points import (
upsert_notes, upsert_notes,
upsert_chunks, upsert_chunks,
@ -69,8 +63,15 @@ from app.core.qdrant_points import (
delete_by_note, delete_by_note,
) )
# Optionales Chunk-Assembly (neuere Pipeline)
try:
from app.core.chunker import assemble_chunks # bevorzugt
except Exception: # pragma: no cover
assemble_chunks = None
# ---------------------------- # ----------------------------
# Hilfsfunktionen # Utilities
# ---------------------------- # ----------------------------
def _env(key: str, default: str = "") -> str: def _env(key: str, default: str = "") -> str:
@ -79,36 +80,30 @@ def _env(key: str, default: str = "") -> str:
def _hash_mode() -> str:
    """Return the hash-compare mode from MINDNET_HASH_COMPARE.

    Valid values are 'body', 'frontmatter' and 'full'; anything else
    falls back to the default 'body'.
    """
    mode = _env("MINDNET_HASH_COMPARE", "Body").lower()
    if mode in ("body", "frontmatter", "full"):
        return mode
    return "body"
def _hash_source() -> str:
    """Return the hash source from MINDNET_HASH_SOURCE ('parsed' or 'raw').

    Unknown values fall back to the default 'parsed'.
    """
    source = _env("MINDNET_HASH_SOURCE", "parsed").lower()
    if source in ("parsed", "raw"):
        return source
    return "parsed"
def _hash_normalize() -> str:
    """Return the normalization mode from MINDNET_HASH_NORMALIZE.

    Valid values are 'canonical' and 'none'; anything else falls back
    to the default 'canonical'.
    """
    norm = _env("MINDNET_HASH_NORMALIZE", "canonical").lower()
    if norm in ("canonical", "none"):
        return norm
    return "canonical"
def _safe_text(parsed) -> str: def _safe_text(parsed) -> str:
""" """Bevorzugt parsed.body_full, sonst parsed.body."""
Liefert bevorzugt parsed.body_full, sonst parsed.body, sonst "".
Kompatibilitätshelfer für Parser ohne 'body_full'.
"""
return getattr(parsed, "body_full", None) or getattr(parsed, "body", "") or "" return getattr(parsed, "body_full", None) or getattr(parsed, "body", "") or ""
def _load_prefix(arg_prefix: Optional[str]) -> str: def _load_prefix(arg_prefix: Optional[str]) -> str:
# Reihenfolge: CLI --prefix > COLLECTION_PREFIX > MINDNET_PREFIX > "mindnet"
if arg_prefix and arg_prefix.strip(): if arg_prefix and arg_prefix.strip():
return arg_prefix.strip() return arg_prefix.strip()
env_prefix = os.environ.get("COLLECTION_PREFIX") or os.environ.get("MINDNET_PREFIX") env_prefix = os.environ.get("COLLECTION_PREFIX") or os.environ.get("MINDNET_PREFIX")
return (env_prefix or "mindnet").strip() return (env_prefix or "mindnet").strip()
def _print(obj):
    """Emit *obj* as one JSON line on stdout and flush immediately.

    Non-ASCII characters are written verbatim (ensure_ascii=False) so log
    consumers see readable text.
    """
    line = json.dumps(obj, ensure_ascii=False)
    sys.stdout.write(line + "\n")
    sys.stdout.flush()
def _iter_md(vault: Path) -> List[Path]: def _iter_md(vault: Path) -> List[Path]:
out: List[Path] = [] out: List[Path] = []
for p in sorted(vault.rglob("*.md")): for p in sorted(vault.rglob("*.md")):
@ -116,37 +111,130 @@ def _iter_md(vault: Path) -> List[Path]:
out.append(p) out.append(p)
return out return out
def _note_vector_dim(cfg: QdrantConfig) -> int:
    """Return the dense-vector dimension to use for the notes collection.

    Resolution order:
      1. ``cfg.notes_vector_dim`` if it is a positive int,
      2. the ``MINDNET_NOTE_VECTOR_D`` environment variable if it is a
         positive decimal number,
      3. 384 as the last-resort default (many setups use 384-dim MiniLM).
    """
    dim = getattr(cfg, "notes_vector_dim", None)
    # Exclude bool: isinstance(True, int) is True and would yield dimension 1.
    if isinstance(dim, int) and not isinstance(dim, bool) and dim > 0:
        return dim
    env_dim = _env("MINDNET_NOTE_VECTOR_D", "")
    # str.isdecimal() guarantees int() can parse the string, so the old
    # nested try/except is unnecessary (isdigit() also accepted characters
    # like superscript digits that int() rejects).
    if env_dim.isdecimal():
        parsed_dim = int(env_dim)
        if parsed_dim > 0:
            return parsed_dim
    return 384
def _ensure_note_vector(note_pl: Dict, cfg: QdrantConfig) -> None:
    """Guarantee that note_pl carries a dense 'vector' entry (in place).

    If the notes collection requires a vector and none was produced, a
    zero-vector of the configured dimension is inserted.
    """
    if note_pl.get("vector") is None:
        note_pl["vector"] = [0.0] * _note_vector_dim(cfg)
# ---------------------------- # ----------------------------
# Hauptlogik # Signatur-kompatible Aufrufe
# ----------------------------
def _call_make_chunk_payloads(parsed, note_pl, raw_chunks: Optional[List[Dict]] = None) -> List[Dict]:
    """Invoke make_chunk_payloads with whichever signature it exposes.

    Historical variants:
      A) make_chunk_payloads(parsed, note_pl, chunks)
      B) make_chunk_payloads(parsed, note_pl)
      C) make_chunk_payloads(note_pl, chunks)
    The signature is detected at runtime; unrecognized signatures fall back
    to probing all three patterns in order.
    """

    def _default_chunks() -> List[Dict]:
        # Conservative single-chunk fallback when the signature expects
        # chunks but none were assembled upstream.
        text = _safe_text(parsed)
        return [{
            "chunk_id": f"{note_pl.get('note_id', 'note')}#1",
            "text": text,
            "window": text,
            "order": 1,
            "path": note_pl.get("path", ""),
        }]

    names = list(inspect.signature(make_chunk_payloads).parameters)
    if names[:3] == ["parsed", "note_pl", "chunks"]:
        variants = ["parsed_note_chunks"]
    elif names[:2] == ["parsed", "note_pl"]:
        variants = ["parsed_note"]
    elif names[:2] == ["note_pl", "chunks"]:
        variants = ["note_chunks"]
    else:
        # Generic fallback: probe the three known calling patterns.
        variants = ["parsed_note_chunks", "parsed_note", "note_chunks"]

    last_err = None
    for variant in variants:
        try:
            if variant == "parsed_note_chunks":
                if raw_chunks is None:
                    raw_chunks = _default_chunks()
                return make_chunk_payloads(parsed, note_pl, raw_chunks)  # type: ignore
            if variant == "parsed_note":
                return make_chunk_payloads(parsed, note_pl)  # type: ignore
            if raw_chunks is None:
                raw_chunks = _default_chunks()
            return make_chunk_payloads(note_pl, raw_chunks)  # type: ignore
        except Exception as e:
            last_err = e
    raise RuntimeError(f"make_chunk_payloads invocation failed: {last_err}")
def _call_build_edges_for_note(parsed, chunk_payloads: List[Dict], note_scope_refs: bool) -> List[Dict]:
    """Invoke build_edges_for_note with a compatible signature.

    Historical variants:
      A) build_edges_for_note(parsed, chunks)
      B) build_edges_for_note(parsed, chunks, note_scope_refs=True/False)
    A TypeError from the keyword call triggers a strict fallback without
    the extra parameter.
    """
    has_flag = "note_scope_refs" in inspect.signature(build_edges_for_note).parameters
    try:
        if has_flag:
            return build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs)  # type: ignore
        return build_edges_for_note(parsed, chunk_payloads)  # type: ignore
    except TypeError:
        # Strict fallback: retry without the optional keyword.
        return build_edges_for_note(parsed, chunk_payloads)  # type: ignore
# ----------------------------
# Hauptverarbeitung
# ---------------------------- # ----------------------------
def process_file( def process_file(
path: Path, path: Path,
cfg: QdrantConfig, cfg: QdrantConfig,
note_scope_refs: bool = False, note_scope_refs: bool,
apply: bool = False, apply: bool,
purge_before_upsert: bool = False, purge_before_upsert: bool,
) -> Tuple[Optional[dict], List[dict], List[dict]]: ) -> Tuple[Optional[dict], List[dict], List[dict]]:
"""
Liest eine Datei, erzeugt Note-/Chunk-/Edge-Payloads.
Gibt (note_payload, chunk_payloads, edge_payloads) zurück.
"""
try: try:
parsed = read_markdown(str(path)) parsed = read_markdown(str(path))
except Exception as e: except Exception as e:
_print({"path": str(path), "error": f"read_markdown failed: {e.__class__.__name__}: {e}"}) _print({"path": str(path), "error": f"read_markdown failed: {e.__class__.__name__}: {e}"})
return None, [], [] return None, [], []
# Note # Note-Payload
try: try:
note_pl = make_note_payload(parsed, vault_root=str(path.parent.parent)) # vault_root nur für Pfadfelder note_pl = make_note_payload(parsed, vault_root=str(path.parent.parent))
if not isinstance(note_pl, dict): if not isinstance(note_pl, dict):
# Falls ältere make_note_payload-Version etwas anderes liefert
# oder None zurückgibt -> Fallback
note_pl = { note_pl = {
"note_id": parsed.frontmatter.get("id") or path.stem, "note_id": parsed.frontmatter.get("id") or path.stem,
"title": parsed.frontmatter.get("title") or path.stem, "title": parsed.frontmatter.get("title") or path.stem,
@ -154,31 +242,44 @@ def process_file(
"path": str(path).replace("\\", "/"), "path": str(path).replace("\\", "/"),
"tags": parsed.frontmatter.get("tags", []), "tags": parsed.frontmatter.get("tags", []),
} }
# robustes Fulltext-Feld
note_pl["fulltext"] = _safe_text(parsed) note_pl["fulltext"] = _safe_text(parsed)
# Hash-Metadaten anfügen (ohne Änderung der bestehenden Logik deiner DB)
note_pl["hash_signature"] = f"{_hash_mode()}:{_hash_source()}:{_hash_normalize()}" note_pl["hash_signature"] = f"{_hash_mode()}:{_hash_source()}:{_hash_normalize()}"
# Notes-Vector sicherstellen (Zero-Vector, wenn Collection ihn verlangt)
_ensure_note_vector(note_pl, cfg)
except Exception as e: except Exception as e:
_print({"path": str(path), "error": f"make_note_payload failed: {e}"}) _print({"path": str(path), "error": f"make_note_payload failed: {e}"})
return None, [], [] return None, [], []
# Chunks # Roh-Chunks (falls assemble_chunks verfügbar)
raw_chunks: Optional[List[Dict]] = None
if assemble_chunks is not None:
try:
raw_chunks = assemble_chunks(
note_pl.get("note_id", path.stem),
_safe_text(parsed),
parsed.frontmatter.get("type", "concept"),
)
except Exception as e:
_print({"path": str(path), "note_id": note_pl.get("note_id"), "warn": f"assemble_chunks failed: {e}"})
raw_chunks = None
# Chunk-Payloads
try: try:
chunks = make_chunk_payloads(parsed, note_pl) chunk_payloads = _call_make_chunk_payloads(parsed, note_pl, raw_chunks)
if not isinstance(chunks, list): if not isinstance(chunk_payloads, list):
chunks = [] chunk_payloads = []
except Exception as e: except Exception as e:
_print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"make_chunk_payloads failed: {e}"}) _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"make_chunk_payloads failed: {e}"})
chunks = [] chunk_payloads = []
# Edges # Edges
try: try:
edges = build_edges_for_note(parsed, chunks, note_scope_refs=note_scope_refs) edges = _call_build_edges_for_note(parsed, chunk_payloads, note_scope_refs=note_scope_refs)
except Exception as e: except Exception as e:
_print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"build_edges_for_note failed: {e}"}) _print({"path": str(path), "note_id": note_pl.get("note_id"), "error": f"build_edges_for_note failed: {e}"})
edges = [] edges = []
return note_pl, chunks, edges return note_pl, chunk_payloads, edges
def main(): def main():
@ -186,7 +287,7 @@ def main():
ap.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Wurzel).") ap.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Wurzel).")
ap.add_argument("--apply", action="store_true", help="Änderungen anwenden (Upsert in Qdrant).") ap.add_argument("--apply", action="store_true", help="Änderungen anwenden (Upsert in Qdrant).")
ap.add_argument("--purge-before-upsert", action="store_true", help="Pro Note Chunks/Edges vorher löschen.") ap.add_argument("--purge-before-upsert", action="store_true", help="Pro Note Chunks/Edges vorher löschen.")
ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope Referenzen ableiten.") ap.add_argument("--note-scope-refs", action="store_true", help="Note-scope Referenzen (falls unterstützt).")
ap.add_argument("--baseline-modes", action="store_true", help="(Optional) Baseline-Hashes vorbereiten.") ap.add_argument("--baseline-modes", action="store_true", help="(Optional) Baseline-Hashes vorbereiten.")
ap.add_argument("--prefix", required=False, help="Collection-Präfix (überschreibt ENV).") ap.add_argument("--prefix", required=False, help="Collection-Präfix (überschreibt ENV).")
args = ap.parse_args() args = ap.parse_args()
@ -195,7 +296,6 @@ def main():
if not vault.exists(): if not vault.exists():
ap.error(f"Vault nicht gefunden: {vault}") ap.error(f"Vault nicht gefunden: {vault}")
# Prefix bestimmen & Config laden
prefix = _load_prefix(args.prefix) prefix = _load_prefix(args.prefix)
cfg = QdrantConfig.from_env(prefix=prefix) cfg = QdrantConfig.from_env(prefix=prefix)
client = get_client(cfg) client = get_client(cfg)
@ -206,29 +306,27 @@ def main():
_print({"summary": "done", "processed": 0, "prefix": cfg.prefix}) _print({"summary": "done", "processed": 0, "prefix": cfg.prefix})
return return
# Optional Baseline-Aktion (nur Meta-Info / kein Abbruch wenn nicht genutzt)
if args.baseline_modes: if args.baseline_modes:
_print({"action": "baseline", "modes": ["body", "frontmatter", "full"], "source": _hash_source(), "norm": _hash_normalize()}) _print({"action": "baseline", "modes": ["body", "frontmatter", "full"], "source": _hash_source(), "norm": _hash_normalize()})
processed = 0 processed = 0
for idx, p in enumerate(files): for p in files:
note_pl, chunks, edges = process_file( note_pl, chunk_payloads, edges = process_file(
p, p,
cfg, cfg,
note_scope_refs=args.note_scope_refs, note_scope_refs=args.note_scope_refs,
apply=args.apply, apply=args.apply,
purge_before_upsert=args.purge_before_upsert, purge_before_upsert=args.purge_before_upsert,
) )
if not note_pl: if not note_pl:
continue continue
info = { info = {
"note_id": note_pl.get("note_id"), "note_id": note_pl.get("note_id"),
"title": note_pl.get("title"), "title": note_pl.get("title"),
"chunks": len(chunks), "chunks": len(chunk_payloads),
"edges": len(edges), "edges": len(edges),
"changed": True, # Die konkrete Hash-/Sig-Prüfung erfolgt in deinen Payload-Funktionen "changed": True, # Detail-Hashing passiert innerhalb der Payload-Builder
"decision": "apply" if args.apply else "dry-run", "decision": "apply" if args.apply else "dry-run",
"path": str(p.relative_to(vault)).replace("\\", "/"), "path": str(p.relative_to(vault)).replace("\\", "/"),
"hash_mode": _hash_mode(), "hash_mode": _hash_mode(),
@ -238,23 +336,20 @@ def main():
} }
if args.apply: if args.apply:
# Optional: pro Note vorher Chunks/Edges löschen (saubere Aktualisierung)
if args.purge_before_upsert: if args.purge_before_upsert:
try: try:
delete_by_note(client, cfg, note_pl.get("note_id", "")) delete_by_note(client, cfg, note_pl.get("note_id", ""))
except Exception as e: except Exception as e:
_print({"note_id": note_pl.get("note_id"), "warn": f"delete_by_note failed: {e}"}) _print({"note_id": note_pl.get("note_id"), "warn": f"delete_by_note failed: {e}"})
# Upserts
try: try:
upsert_notes(client, cfg, [note_pl]) upsert_notes(client, cfg, [note_pl])
except Exception as e: except Exception as e:
_print({"note_id": note_pl.get("note_id"), "error": f"upsert_notes failed: {e}"}) _print({"note_id": note_pl.get("note_id"), "error": f"upsert_notes failed: {e}"})
continue
if chunks: if chunk_payloads:
try: try:
upsert_chunks(client, cfg, chunks) upsert_chunks(client, cfg, chunk_payloads)
except Exception as e: except Exception as e:
_print({"note_id": note_pl.get("note_id"), "error": f"upsert_chunks failed: {e}"}) _print({"note_id": note_pl.get("note_id"), "error": f"upsert_chunks failed: {e}"})
@ -267,14 +362,12 @@ def main():
_print(info) _print(info)
processed += 1 processed += 1
# Abschlussstatus
counts = count_points(client, cfg)
_print({ _print({
"summary": "done", "summary": "done",
"processed": processed, "processed": processed,
"prefix": cfg.prefix, "prefix": cfg.prefix,
"collections": {"notes": cfg.notes, "chunks": cfg.chunks, "edges": cfg.edges}, "collections": {"notes": cfg.notes, "chunks": cfg.chunks, "edges": cfg.edges},
"counts": counts, "counts": count_points(client, cfg),
}) })