mindnet/scripts/import_markdown.py
Lars 53591b6f27
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
scripts/import_markdown.py aktualisiert
2025-11-08 14:25:31 +01:00

237 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
import_markdown.py v3.9.0
Zweck:
- Idempotenter Import von Markdown-Notizen (Obsidian-Vault) in Qdrant:
* Notes, Chunks, Edges
* Hash-/Baseline-Mechanik (unverändert, falls schon vorhanden)
* UTF-8 robust (mit Fallback auf cp1252, Logging)
* Optional: note_scope_refs
- NEU: Type-Registry wird gelesen und an Chunk-/Edge-Erzeugung gereicht,
ohne bestehende Funktionalität zu brechen.
Kompatibilität:
- Nutzt vorhandene parser-, qdrant- und points-Hilfsfunktionen mit
unveränderten Namen/Signaturen.
- Erwartete Funktionen (nicht geändert):
* app.core.parser.read_markdown(path) -> ParsedNote(frontmatter, body, title, ...)
* app.core.chunker.chunk_markdown(body, note_type) -> List[Chunk]
* app.core.chunk_payload.make_chunk_payloads(chunks, note_id, note_title, note_type, note_path, ...)
* app.core.derive_edges.build_edges_for_note(...)
* app.core.qdrant_points.{ensure_collections_for_prefix, upsert_notes, upsert_chunks, upsert_edges, delete_by_filter}
* app.core.qdrant.get_client(), QdrantConfig.from_env()
- Hashing/Signature/Compare-Varianten bleiben unangetastet (werden nur verwendet, wenn vorhanden).
Aufrufbeispiele:
python3 -m scripts.import_markdown --vault ./test_vault
python3 -m scripts.import_markdown --vault ./test_vault --apply
python3 -m scripts.import_markdown --vault ./test_vault --apply --purge-before-upsert
python3 -m scripts.import_markdown --vault ./vault --apply --prefix "$COLLECTION_PREFIX" --note-scope-refs
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional, Tuple
# Parser / Chunker / Payload / Edges (bestehende Module)
from app.core.parser import read_markdown # type: ignore
from app.core.chunker import chunk_markdown # type: ignore
from app.core.chunk_payload import make_chunk_payloads # type: ignore
from app.core.derive_edges import build_edges_for_note # type: ignore
# Qdrant-Zugriff (bestehende Helfer, Signaturen beibehalten)
from app.core.qdrant import QdrantConfig, get_client # type: ignore
from app.core.qdrant_points import ( # type: ignore
ensure_collections_for_prefix,
upsert_notes,
upsert_chunks,
upsert_edges,
delete_by_filter,
)
# Optional: Registry (kein harter Fehler wenn nicht vorhanden)
# Optional: type registry (no hard failure if the module is not present).
# Catch only ImportError: a genuinely broken type_registry (syntax error,
# failing module-level code) should surface instead of being silently
# replaced by the default profile.
try:
    from app.core.type_registry import resolve_chunk_profile
except ImportError:
    def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
        """Fallback used when app.core.type_registry is unavailable.

        Ignores *note_type* and always returns *default_profile*.
        """
        return default_profile
# --- CLI ---
def _cli() -> argparse.Namespace:
p = argparse.ArgumentParser("import_markdown.py")
p.add_argument("--vault", required=True, help="Pfad zum Vault-Root (Ordner).")
p.add_argument("--apply", action="store_true", help="Änderungen wirklich upserten (sonst Dry-Run).")
p.add_argument("--purge-before-upsert", action="store_true", help="Vor Upsert Daten je Note in Collections entfernen.")
p.add_argument("--prefix", default=os.getenv("COLLECTION_PREFIX", os.getenv("MINDNET_PREFIX", "")),
help="Sammlungspräfix in Qdrant (override).")
p.add_argument("--note-scope-refs", action="store_true",
help="Referenzen ([[...]]) auf Note-Ebene (statt chunk-basiert).")
p.add_argument("--encoding", default="utf-8", help="Bevorzugtes Encoding für .md (Default: utf-8).")
return p.parse_args()
# --- Hilfsfunktionen ---
def _iter_md_files(root: str) -> List[str]:
md_paths: List[str] = []
for base, _, files in os.walk(root):
for fn in files:
if fn.lower().endswith(".md"):
md_paths.append(os.path.join(base, fn))
md_paths.sort()
return md_paths
def _rel_path(root: str, path: str) -> str:
return os.path.relpath(path, root).replace("\\", "/")
def _safe_read_markdown(path: str, prefer_encoding: str = "utf-8") -> Tuple[Optional[Any], Optional[str]]:
    """Read a markdown note via the project parser, never raising.

    Returns ``(ParsedNote, used_encoding)`` on success and ``(None, None)``
    on any failure. File decoding is owned entirely by ``read_markdown``;
    *prefer_encoding* is only reported back to the caller.

    Bug fixed vs. earlier version: the previous "cp1252 fallback" simply
    re-invoked the identical ``read_markdown(path)`` call (there is no way
    to pass an encoding through that API), so it could never succeed —
    yet it logged that cp1252 had been used. We now log the real decode
    error instead of a misleading fallback claim, and the generic failure
    path logs instead of swallowing silently.
    """
    try:
        parsed = read_markdown(path)
        return parsed, prefer_encoding
    except UnicodeDecodeError as exc:
        # The parser could not decode the file. An encoding-tolerant read
        # must be implemented inside parser.py itself; here we can only
        # report the problem and skip the note.
        print(json.dumps({"path": path, "error": "unicode_decode_error", "detail": str(exc)}))
        return None, None
    except Exception as exc:
        # Best-effort import loop: log and skip this note rather than
        # aborting the whole run.
        print(json.dumps({"path": path, "error": "read_markdown_failed", "detail": str(exc)}))
        return None, None
# --- Main ---
def main() -> None:
    """Import all markdown notes from a vault into Qdrant.

    Dry-run by default: every note is parsed, chunked and logged as JSON,
    but nothing is written unless --apply is given. With
    --purge-before-upsert, each note's existing points are deleted from
    all three collections before upserting.
    """
    args = _cli()
    vault = os.path.abspath(args.vault)
    apply = args.apply  # NOTE: shadows the long-removed Py2 builtin; harmless here
    purge = args.purge_before_upsert
    prefix = (args.prefix or "").strip()
    note_scope_refs = args.note_scope_refs
    # Ensure Qdrant client and the prefix's collections exist up front.
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    collections = ensure_collections_for_prefix(client=client, prefix=prefix)
    md_files = _iter_md_files(vault)
    processed = 0
    for path in md_files:
        rel = _rel_path(vault, path)
        parsed, used_enc = _safe_read_markdown(path, prefer_encoding=args.encoding)
        # Skip unreadable notes and notes without frontmatter
        # (an empty frontmatter dict counts as missing here).
        if parsed is None or not getattr(parsed, "frontmatter", None):
            print(json.dumps({"path": path, "error": "read_markdown failed"}))
            continue
        fm = dict(parsed.frontmatter or {})
        # note_id: explicit frontmatter 'id' wins; otherwise fall back to the file stem.
        note_id = str(fm.get("id") or "").strip() or os.path.splitext(os.path.basename(path))[0]
        note_title = str(fm.get("title") or parsed.title or note_id)
        note_type = str(fm.get("type") or "concept")
        # Chunking (registry profile -> chunk_payload derives 'window' from the profile).
        body = getattr(parsed, "body", "") or ""
        chunks = chunk_markdown(body, note_type)
        chunk_profile = resolve_chunk_profile(note_type)
        chunk_payloads = make_chunk_payloads(
            chunks=chunks,
            note_id=note_id,
            note_title=note_title,
            note_type=note_type,
            note_path=rel,
            chunk_profile=chunk_profile,
            # window_overwrite=None  # could be wired to an env var if ever needed
        )
        # Build edges (registry defaults are handled inside derive_edges).
        edges = build_edges_for_note(
            note_id=note_id,
            note_type=note_type,
            chunks=chunk_payloads,
            frontmatter=fm,
            body_text=body,
            note_scope_refs=note_scope_refs,
        )
        # Note payload (no vector; embeddings are produced elsewhere in the pipeline).
        note_payload = {
            "note_id": note_id,
            "title": note_title,
            "type": note_type,
            "path": rel,
            "status": fm.get("status"),
            "created": fm.get("created"),
            "tags": fm.get("tags", []),
            # Optionally persist retriever_weight from the registry — possible,
            # but not required for WP-03 (can be added later):
            # "retriever_weight": get_retriever_weight_for_type(note_type),
        }
        # Per-note JSON log, emitted before any upsert (also in dry-run mode).
        print(json.dumps({
            "note_id": note_id,
            "title": note_title,
            "chunks": len(chunk_payloads),
            "edges": len(edges),
            "changed": True,  # hash/baseline comparison remains in the existing implementation
            "decision": ("apply" if apply else "dry-run"),
            "path": rel,
            "hash_mode": os.getenv("MINDNET_HASH_COMPARE", "body"),
            "hash_normalize": os.getenv("MINDNET_HASH_NORMALIZE", "canonical"),
            "hash_source": os.getenv("MINDNET_HASH_SOURCE", "parsed"),
            "prefix": prefix,
        }))
        if not apply:
            processed += 1
            continue
        # Optional: purge this note's existing data per collection before upserting.
        if purge:
            # delete_by_filter typically expects a {key: value} filter; one call per collection.
            delete_by_filter(client, collections["notes"], {"note_id": note_id})
            delete_by_filter(client, collections["chunks"], {"note_id": note_id})
            delete_by_filter(client, collections["edges"], {"note_id": note_id})
        # Upserts.
        # Important: the upsert_* helpers typically expect 'points' carrying
        # point_id/uuid etc.; we use the existing utilities unchanged and do
        # not alter the ID strategy.
        upsert_notes(client, collections["notes"], [ {"id": note_id, "payload": note_payload} ])
        if chunk_payloads:
            upsert_chunks(client, collections["chunks"], [
                {"id": cp["chunk_id"], "payload": cp} for cp in chunk_payloads
            ])
        if edges:
            upsert_edges(client, collections["edges"], [
                {"payload": e} for e in edges
            ])
        processed += 1
    # Final summary log after all notes have been handled.
    print(json.dumps({
        "summary": "done",
        "processed": processed,
        "prefix": prefix,
        "collections": collections,
        "counts": {
            "notes": 0,  # optional: could be filled via count_points
            "chunks": 0,
            "edges": 0
        }
    }))
# Script entry point (also reachable via `python3 -m scripts.import_markdown`).
if __name__ == "__main__":
    main()