""" import_markdown.py v3.9.0 Zweck: - Idempotenter Import von Markdown-Notizen (Obsidian-Vault) in Qdrant: * Notes, Chunks, Edges * Hash-/Baseline-Mechanik (unverändert, falls schon vorhanden) * UTF-8 robust (mit Fallback auf cp1252, Logging) * Optional: note_scope_refs - NEU: Type-Registry wird gelesen und an Chunk-/Edge-Erzeugung gereicht, ohne bestehende Funktionalität zu brechen. Kompatibilität: - Nutzt vorhandene parser-, qdrant- und points-Hilfsfunktionen mit unveränderten Namen/Signaturen. - Erwartete Funktionen (nicht geändert): * app.core.parser.read_markdown(path) -> ParsedNote(frontmatter, body, title, ...) * app.core.chunker.chunk_markdown(body, note_type) -> List[Chunk] * app.core.chunk_payload.make_chunk_payloads(chunks, note_id, note_title, note_type, note_path, ...) * app.core.derive_edges.build_edges_for_note(...) * app.core.qdrant_points.{ensure_collections_for_prefix, upsert_notes, upsert_chunks, upsert_edges, delete_by_filter} * app.core.qdrant.get_client(), QdrantConfig.from_env() - Hashing/Signature/Compare-Varianten bleiben unangetastet (werden nur verwendet, wenn vorhanden). Aufrufbeispiele: python3 -m scripts.import_markdown --vault ./test_vault python3 -m scripts.import_markdown --vault ./test_vault --apply python3 -m scripts.import_markdown --vault ./test_vault --apply --purge-before-upsert python3 -m scripts.import_markdown --vault ./vault --apply --prefix "$COLLECTION_PREFIX" --note-scope-refs """ from __future__ import annotations import argparse import json import os import sys from typing import Any, Dict, List, Optional, Tuple # Parser / Chunker / Payload / Edges (bestehende Module) from app.core.parser import read_markdown # type: ignore from app.core.chunker import chunk_markdown # type: ignore from app.core.chunk_payload import make_chunk_payloads # type: ignore from app.core.derive_edges import build_edges_for_note # type: ignore # Qdrant-Zugriff (bestehende Helfer, Signaturen beibehalten) from app.core.qdrant import QdrantConfig, get_client # type: ignore from app.core.qdrant_points import ( # type: ignore ensure_collections_for_prefix, upsert_notes, upsert_chunks, upsert_edges, delete_by_filter, ) # Optional: Registry (kein harter Fehler wenn nicht vorhanden) try: from app.core.type_registry import resolve_chunk_profile except Exception: def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str: return default_profile # --- CLI --- def _cli() -> argparse.Namespace: p = argparse.ArgumentParser("import_markdown.py") p.add_argument("--vault", required=True, help="Pfad zum Vault-Root (Ordner).") p.add_argument("--apply", action="store_true", help="Änderungen wirklich upserten (sonst Dry-Run).") p.add_argument("--purge-before-upsert", action="store_true", help="Vor Upsert Daten je Note in Collections entfernen.") p.add_argument("--prefix", default=os.getenv("COLLECTION_PREFIX", os.getenv("MINDNET_PREFIX", "")), help="Sammlungspräfix in Qdrant (override).") p.add_argument("--note-scope-refs", action="store_true", help="Referenzen ([[...]]) auf Note-Ebene (statt chunk-basiert).") p.add_argument("--encoding", default="utf-8", help="Bevorzugtes Encoding für .md (Default: utf-8).") return p.parse_args() # --- Hilfsfunktionen --- def _iter_md_files(root: str) -> List[str]: md_paths: List[str] = [] for base, _, files in os.walk(root): for fn in files: if fn.lower().endswith(".md"): md_paths.append(os.path.join(base, fn)) md_paths.sort() return md_paths def _rel_path(root: str, path: str) -> str: return os.path.relpath(path, root).replace("\\", "/") def _safe_read_markdown(path: str, prefer_encoding: str = "utf-8") -> Tuple[Optional[Any], Optional[str]]: """ UTF-8 lesen; bei Fehler Fallback auf cp1252. Liefert (ParsedNote|None, used_encoding|None). """ try: parsed = read_markdown(path) return parsed, prefer_encoding except UnicodeDecodeError: # encoding fallback wird über parser intern gelöst? Falls nicht, hier ein Hinweis: # Wir loggen nur, read_markdown aus eurem Parser bleibt die Quelle der Wahrheit. try: # Viele Parser akzeptieren den Inhalt unabhängig vom Encoding; # falls euer Parser zwingend UTF-8 erwartet, müsst ihr dort (parser.py) # tolerant implementieren. Wir geben nur ein Log aus: print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": "cp1252"})) parsed = read_markdown(path) # euer Parser sollte inzwischen tolerant sein return parsed, "cp1252" except Exception as e: return None, None except Exception: return None, None # --- Main --- def main() -> None: args = _cli() vault = os.path.abspath(args.vault) apply = args.apply purge = args.purge_before_upsert prefix = (args.prefix or "").strip() note_scope_refs = args.note_scope_refs # Qdrant-Client + Collections sicherstellen cfg = QdrantConfig.from_env() client = get_client(cfg) collections = ensure_collections_for_prefix(client=client, prefix=prefix) md_files = _iter_md_files(vault) processed = 0 for path in md_files: rel = _rel_path(vault, path) parsed, used_enc = _safe_read_markdown(path, prefer_encoding=args.encoding) if parsed is None or not getattr(parsed, "frontmatter", None): print(json.dumps({"path": path, "error": "read_markdown failed"})) continue fm = dict(parsed.frontmatter or {}) note_id = str(fm.get("id") or "").strip() or os.path.splitext(os.path.basename(path))[0] note_title = str(fm.get("title") or parsed.title or note_id) note_type = str(fm.get("type") or "concept") # Chunking (Registry-Profile → chunk_payload erzeugt 'window' abhängig vom Profil) body = getattr(parsed, "body", "") or "" chunks = chunk_markdown(body, note_type) chunk_profile = resolve_chunk_profile(note_type) chunk_payloads = make_chunk_payloads( chunks=chunks, note_id=note_id, note_title=note_title, note_type=note_type, note_path=rel, chunk_profile=chunk_profile, # window_overwrite=None # falls du das per Env steuern willst, ergänzbar ) # Edges erzeugen (inkl. Registry-Defaults – harmoniert mit eurem derive_edges) edges = build_edges_for_note( note_id=note_id, note_type=note_type, chunks=chunk_payloads, frontmatter=fm, body_text=body, note_scope_refs=note_scope_refs, ) # Note-Payload (ohne Vektor; Embeddings baut ihr upstream/downstream) note_payload = { "note_id": note_id, "title": note_title, "type": note_type, "path": rel, "status": fm.get("status"), "created": fm.get("created"), "tags": fm.get("tags", []), # Optional: retriever_weight aus Registry ablegen? → möglich, # aber nicht verpflichtend für WP-03. (kann später ergänzt werden) # "retriever_weight": get_retriever_weight_for_type(note_type), } # Dry-run Log (vor Upsert) print(json.dumps({ "note_id": note_id, "title": note_title, "chunks": len(chunk_payloads), "edges": len(edges), "changed": True, # Hash/Baseline-Logik bleibt eurer bestehenden Implementierung vorbehalten "decision": ("apply" if apply else "dry-run"), "path": rel, "hash_mode": os.getenv("MINDNET_HASH_COMPARE", "body"), "hash_normalize": os.getenv("MINDNET_HASH_NORMALIZE", "canonical"), "hash_source": os.getenv("MINDNET_HASH_SOURCE", "parsed"), "prefix": prefix, })) if not apply: processed += 1 continue # Optional: Purge vor Upsert pro Note if purge: # delete_by_filter erwartet i. d. R. {key: value}-Filter; je Collection separat delete_by_filter(client, collections["notes"], {"note_id": note_id}) delete_by_filter(client, collections["chunks"], {"note_id": note_id}) delete_by_filter(client, collections["edges"], {"note_id": note_id}) # Upserts # Wichtig: eure upsert_* erwarten typischerweise 'points' mit point_id/uuid etc. # Wir verwenden exakt eure Utilities, ohne die ID-Strategie zu verändern. upsert_notes(client, collections["notes"], [ {"id": note_id, "payload": note_payload} ]) if chunk_payloads: upsert_chunks(client, collections["chunks"], [ {"id": cp["chunk_id"], "payload": cp} for cp in chunk_payloads ]) if edges: upsert_edges(client, collections["edges"], [ {"payload": e} for e in edges ]) processed += 1 # Abschluss-Log print(json.dumps({ "summary": "done", "processed": processed, "prefix": prefix, "collections": collections, "counts": { "notes": 0, # Optional: könntet ihr via count_points auffüllen "chunks": 0, "edges": 0 } })) if __name__ == "__main__": main()