# NOTE: web-UI metadata removed — CI status: "Deploy mindnet to llm-node / deploy (push)" successful.
"""
|
||
import_markdown.py v3.9.0
|
||
|
||
Zweck:
|
||
- Idempotenter Import von Markdown-Notizen (Obsidian-Vault) in Qdrant:
|
||
* Notes, Chunks, Edges
|
||
* Hash-/Baseline-Mechanik (unverändert, falls schon vorhanden)
|
||
* UTF-8 robust (mit Fallback auf cp1252, Logging)
|
||
* Optional: note_scope_refs
|
||
- NEU: Type-Registry wird gelesen und an Chunk-/Edge-Erzeugung gereicht,
|
||
ohne bestehende Funktionalität zu brechen.
|
||
|
||
Kompatibilität:
|
||
- Nutzt vorhandene parser-, qdrant- und points-Hilfsfunktionen mit
|
||
unveränderten Namen/Signaturen.
|
||
- Erwartete Funktionen (nicht geändert):
|
||
* app.core.parser.read_markdown(path) -> ParsedNote(frontmatter, body, title, ...)
|
||
* app.core.chunker.chunk_markdown(body, note_type) -> List[Chunk]
|
||
* app.core.chunk_payload.make_chunk_payloads(chunks, note_id, note_title, note_type, note_path, ...)
|
||
* app.core.derive_edges.build_edges_for_note(...)
|
||
* app.core.qdrant_points.{ensure_collections_for_prefix, upsert_notes, upsert_chunks, upsert_edges, delete_by_filter}
|
||
* app.core.qdrant.get_client(), QdrantConfig.from_env()
|
||
- Hashing/Signature/Compare-Varianten bleiben unangetastet (werden nur verwendet, wenn vorhanden).
|
||
|
||
Aufrufbeispiele:
|
||
python3 -m scripts.import_markdown --vault ./test_vault
|
||
python3 -m scripts.import_markdown --vault ./test_vault --apply
|
||
python3 -m scripts.import_markdown --vault ./test_vault --apply --purge-before-upsert
|
||
python3 -m scripts.import_markdown --vault ./vault --apply --prefix "$COLLECTION_PREFIX" --note-scope-refs
|
||
"""
|
||
|
||
from __future__ import annotations

import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional, Tuple

# Parser / chunker / payload / edges (existing modules)
from app.core.parser import read_markdown  # type: ignore
from app.core.chunker import chunk_markdown  # type: ignore
from app.core.chunk_payload import make_chunk_payloads  # type: ignore
from app.core.derive_edges import build_edges_for_note  # type: ignore

# Qdrant access (existing helpers, signatures preserved)
from app.core.qdrant import QdrantConfig, get_client  # type: ignore
from app.core.qdrant_points import (  # type: ignore
    ensure_collections_for_prefix,
    upsert_notes,
    upsert_chunks,
    upsert_edges,
    delete_by_filter,
)
# Optional: type registry (no hard failure if the module is absent).
# Catch only ImportError: a broad `except Exception` would also mask genuine
# bugs inside app.core.type_registry (syntax errors, bad top-level code) and
# silently replace the real registry with the fallback.
try:
    from app.core.type_registry import resolve_chunk_profile  # type: ignore
except ImportError:
    def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
        """Fallback when no registry is installed: every type maps to *default_profile*."""
        return default_profile
# --- CLI ---
|
||
|
||
def _cli() -> argparse.Namespace:
|
||
p = argparse.ArgumentParser("import_markdown.py")
|
||
p.add_argument("--vault", required=True, help="Pfad zum Vault-Root (Ordner).")
|
||
p.add_argument("--apply", action="store_true", help="Änderungen wirklich upserten (sonst Dry-Run).")
|
||
p.add_argument("--purge-before-upsert", action="store_true", help="Vor Upsert Daten je Note in Collections entfernen.")
|
||
p.add_argument("--prefix", default=os.getenv("COLLECTION_PREFIX", os.getenv("MINDNET_PREFIX", "")),
|
||
help="Sammlungspräfix in Qdrant (override).")
|
||
p.add_argument("--note-scope-refs", action="store_true",
|
||
help="Referenzen ([[...]]) auf Note-Ebene (statt chunk-basiert).")
|
||
p.add_argument("--encoding", default="utf-8", help="Bevorzugtes Encoding für .md (Default: utf-8).")
|
||
return p.parse_args()
|
||
|
||
# --- Hilfsfunktionen ---
|
||
|
||
def _iter_md_files(root: str) -> List[str]:
|
||
md_paths: List[str] = []
|
||
for base, _, files in os.walk(root):
|
||
for fn in files:
|
||
if fn.lower().endswith(".md"):
|
||
md_paths.append(os.path.join(base, fn))
|
||
md_paths.sort()
|
||
return md_paths
|
||
|
||
def _rel_path(root: str, path: str) -> str:
|
||
return os.path.relpath(path, root).replace("\\", "/")
|
||
|
||
def _safe_read_markdown(path: str, prefer_encoding: str = "utf-8") -> Tuple[Optional[Any], Optional[str]]:
    """
    Read a markdown note via the project parser, tolerating encoding problems.

    Args:
        path: Filesystem path of the ``.md`` file.
        prefer_encoding: Encoding reported back on a clean first read.

    Returns:
        ``(ParsedNote, used_encoding)`` on success, ``(None, None)`` on failure.
    """
    try:
        parsed = read_markdown(path)
        return parsed, prefer_encoding
    except UnicodeDecodeError:
        # The encoding fallback is expected to live inside read_markdown()
        # (parser.py stays the source of truth); here we only log the event
        # and retry once.
        try:
            print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": "cp1252"}))
            # NOTE(review): this retry is byte-identical to the first call —
            # it only succeeds if read_markdown() itself is tolerant. If the
            # parser requires strict UTF-8, tolerance must be added there.
            parsed = read_markdown(path)
            return parsed, "cp1252"
        except Exception:
            # Fixed: previously bound the exception to an unused variable `e`.
            return None, None
    except Exception:
        # Any other parser failure (missing frontmatter, I/O error, ...) is
        # reported to the caller as "could not read".
        return None, None
# --- Main ---


def main() -> None:
    """Import every markdown note under ``--vault`` into Qdrant.

    Per note: parse frontmatter/body, chunk, derive edges, log a JSON status
    line, and (only with ``--apply``) upsert into the notes/chunks/edges
    collections, optionally purging the note's old points first.
    """
    args = _cli()
    vault = os.path.abspath(args.vault)
    apply = args.apply
    purge = args.purge_before_upsert
    prefix = (args.prefix or "").strip()
    note_scope_refs = args.note_scope_refs

    # Qdrant client + ensure the target collections exist for this prefix.
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    collections = ensure_collections_for_prefix(client=client, prefix=prefix)

    md_files = _iter_md_files(vault)
    processed = 0

    for path in md_files:
        rel = _rel_path(vault, path)
        # NOTE(review): used_enc is never consumed below — candidate for the
        # per-note log output; confirm before changing the log schema.
        parsed, used_enc = _safe_read_markdown(path, prefer_encoding=args.encoding)
        # Notes that fail to parse or carry no frontmatter are skipped.
        if parsed is None or not getattr(parsed, "frontmatter", None):
            print(json.dumps({"path": path, "error": "read_markdown failed"}))
            continue

        fm = dict(parsed.frontmatter or {})
        # Note id fallback chain: frontmatter 'id' -> file name without extension.
        note_id = str(fm.get("id") or "").strip() or os.path.splitext(os.path.basename(path))[0]
        note_title = str(fm.get("title") or parsed.title or note_id)
        note_type = str(fm.get("type") or "concept")

        # Chunking (registry profile -> chunk_payload derives 'window' from the profile).
        body = getattr(parsed, "body", "") or ""
        chunks = chunk_markdown(body, note_type)
        chunk_profile = resolve_chunk_profile(note_type)

        chunk_payloads = make_chunk_payloads(
            chunks=chunks,
            note_id=note_id,
            note_title=note_title,
            note_type=note_type,
            note_path=rel,
            chunk_profile=chunk_profile,
            # window_overwrite=None  # could be driven via env if needed
        )

        # Build edges (incl. registry defaults — matches the existing derive_edges).
        edges = build_edges_for_note(
            note_id=note_id,
            note_type=note_type,
            chunks=chunk_payloads,
            frontmatter=fm,
            body_text=body,
            note_scope_refs=note_scope_refs,
        )

        # Note payload (no vector; embeddings are built upstream/downstream).
        note_payload = {
            "note_id": note_id,
            "title": note_title,
            "type": note_type,
            "path": rel,
            "status": fm.get("status"),
            "created": fm.get("created"),
            "tags": fm.get("tags", []),
            # Optional: store retriever_weight from the registry? Possible,
            # but not required for WP-03 (can be added later).
            # "retriever_weight": get_retriever_weight_for_type(note_type),
        }

        # Dry-run log (emitted before any upsert).
        print(json.dumps({
            "note_id": note_id,
            "title": note_title,
            "chunks": len(chunk_payloads),
            "edges": len(edges),
            "changed": True,  # hash/baseline logic remains with the existing implementation
            "decision": ("apply" if apply else "dry-run"),
            "path": rel,
            "hash_mode": os.getenv("MINDNET_HASH_COMPARE", "body"),
            "hash_normalize": os.getenv("MINDNET_HASH_NORMALIZE", "canonical"),
            "hash_source": os.getenv("MINDNET_HASH_SOURCE", "parsed"),
            "prefix": prefix,
        }))

        if not apply:
            processed += 1
            continue

        # Optional: purge this note's data before upserting.
        if purge:
            # delete_by_filter usually expects a {key: value} filter; one call per collection.
            delete_by_filter(client, collections["notes"], {"note_id": note_id})
            delete_by_filter(client, collections["chunks"], {"note_id": note_id})
            delete_by_filter(client, collections["edges"], {"note_id": note_id})

        # Upserts.
        # Important: the upsert_* helpers typically expect 'points' with point_id/uuid etc.
        # We use the existing utilities verbatim and do not change the ID strategy.
        upsert_notes(client, collections["notes"], [ {"id": note_id, "payload": note_payload} ])
        if chunk_payloads:
            upsert_chunks(client, collections["chunks"], [
                {"id": cp["chunk_id"], "payload": cp} for cp in chunk_payloads
            ])
        if edges:
            upsert_edges(client, collections["edges"], [
                {"payload": e} for e in edges
            ])

        processed += 1

    # Final summary log.
    print(json.dumps({
        "summary": "done",
        "processed": processed,
        "prefix": prefix,
        "collections": collections,
        "counts": {
            "notes": 0,  # optional: could be filled via count_points
            "chunks": 0,
            "edges": 0
        }
    }))


if __name__ == "__main__":
    main()