#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ FILE: scripts/setup_mindnet_collections.py VERSION: 2.1.0 (2025-12-15) STATUS: Active COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b) Zweck: ------- Richtet die Qdrant-Collections für das mindnet-Projekt ein. Legt Collections und Payload-Indizes an (idempotent). Funktionsweise: --------------- 1. Prüft Qdrant-Verfügbarkeit (optional /ready Endpoint) 2. Legt drei Collections an: - {prefix}_chunks: Semantische Suche über Text-Chunks (Vektor: dim/Cosine) - {prefix}_notes: Metadaten pro Notiz (Vektor: dim/Cosine) - {prefix}_edges: Link-Kanten (Dummy-Vektor size=1, Filter über Payload) 3. Richtet Payload-Indizes ein: - Keyword-Indizes für häufige Filter-Felder - Text-Index für Volltextsuche (chunks.text) Ergebnis-Interpretation: ------------------------ - Ausgabe: Status pro Collection ([+] angelegt, [=] existiert bereits) - Abschluss: JSON-Liste aller vorhandenen Collections - Exit-Code 0: Erfolgreich - Exit-Code 1: Fehler (z.B. Qdrant nicht erreichbar) Verwendung: ----------- - Initial-Setup einer neuen mindnet-Instanz - Nach Qdrant-Reset oder Migration - Validierung der Collection-Struktur Hinweise: --------- - Idempotent: Überspringt existierende Collections - Nutzt HTTP-API direkt (kein qdrant-client) - Legt nur Basis-Indizes an (erweiterte Indizes via ensure_payload_indexes) Aufruf: ------- python3 -m scripts.setup_mindnet_collections --qdrant-url http://127.0.0.1:6333 --prefix mindnet --dim 768 python3 -m scripts.setup_mindnet_collections --prefix mindnet_dev --dim 768 --distance Cosine Parameter: ---------- --qdrant-url URL Qdrant-URL (Default: http://127.0.0.1:6333) --prefix TEXT Collection-Präfix (Default: mindnet) --dim INT Vektor-Dimension (Default: 384, empfohlen: 768 für nomic) --distance METRIC Cosine | Euclid | Dot (Default: Cosine) Änderungen: ----------- v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung - Dokumentation aktualisiert v1.0.0: Initial Release """ from __future__ import annotations 
import argparse
import json
import sys
from dataclasses import dataclass
from typing import Any, Dict, List

import requests


@dataclass
class QdrantHTTP:
    """Thin wrapper around the Qdrant REST endpoints used by this script.

    Attributes:
        base_url: Root URL of the Qdrant instance. A trailing slash is
            tolerated and stripped when request URLs are built.
    """

    base_url: str

    def _url(self, path: str) -> str:
        """Join ``base_url`` and ``path`` (``path`` must start with ``/``)."""
        return self.base_url.rstrip("/") + path

    def rq(self, method: str, path: str, **kwargs: Any) -> requests.Response:
        """Send an HTTP request and return the response.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"PUT"``.
            path: Path below ``base_url``, starting with ``/``.
            **kwargs: Forwarded to ``requests.request``. ``timeout`` defaults
                to 20 seconds but may be overridden by the caller.

        Raises:
            RuntimeError: If the response status is not a success status.
        """
        url = self._url(path)
        # Applied as a default so an explicit timeout from the caller does not
        # collide with a hard-coded keyword argument.
        kwargs.setdefault("timeout", 20)
        r = requests.request(method, url, **kwargs)
        if not r.ok:
            raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
        return r

    def collection_exists(self, name: str) -> bool:
        """Return True if a collection called ``name`` exists.

        The request is issued directly instead of through ``rq``: ``rq``
        raises on every non-success status, but a 404 here is the expected
        answer for a collection that has not been created yet. Any successful
        response counts as "exists", independent of the status reported in the
        body, because that status describes an existing collection.

        Raises:
            RuntimeError: On any non-success status other than 404.
        """
        url = self._url(f"/collections/{name}")
        r = requests.request("GET", url, timeout=20)
        if r.status_code == 404:
            return False
        if not r.ok:
            raise RuntimeError(f"GET {url} -> {r.status_code} {r.text}")
        return True

    def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
        """Create collection ``name`` unless it already exists.

        Args:
            name: Collection name.
            size: Vector size sent in the collection's ``vectors`` config.
            distance: Distance metric name sent in the ``vectors`` config.
        """
        if self.collection_exists(name):
            print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.")
            return
        payload = {"vectors": {"size": size, "distance": distance}}
        self.rq("PUT", f"/collections/{name}", json=payload)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

    def create_keyword_index(self, collection: str, field: str) -> None:
        """Request a ``keyword`` payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": "keyword"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index keyword on {collection}.{field}")

    def create_text_index(self, collection: str, field: str = "text") -> None:
        """Request a ``text`` payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": {"type": "text"}}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index text on {collection}.{field}")

    def list_collections(self) -> List[Dict[str, Any]]:
        """Return the ``result.collections`` list of ``GET /collections``.

        Returns an empty list when the response lacks that key.
        """
        r = self.rq("GET", "/collections")
        return r.json().get("result", {}).get("collections", [])


def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
    """Create the three mindnet collections and their payload indexes.

    Args:
        q: Client used for all requests.
        prefix: Collection name prefix; the collections are named
            ``{prefix}_chunks``, ``{prefix}_notes`` and ``{prefix}_edges``.
        dim: Vector size for the chunks and notes collections.
        distance: Distance metric used for all three collections.

    Raises:
        RuntimeError: Propagated from ``q`` when a request fails.
    """
    chunks = f"{prefix}_chunks"
    notes = f"{prefix}_notes"
    edges = f"{prefix}_edges"

    # 1) Create collections
    q.create_collection(chunks, size=dim, distance=distance)
    q.create_collection(notes, size=dim, distance=distance)
    q.create_collection(edges, size=1, distance=distance)  # dummy vector

    # 2) Define indexes
    # chunks: frequent filter fields plus full text
    for f in ["note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links"]:
        q.create_keyword_index(chunks, f)
    q.create_text_index(chunks, "text")  # full-text search on the text field

    # notes: note metadata
    for f in ["note_id", "title", "path", "Typ", "Status", "tags", "Rolle"]:
        q.create_keyword_index(notes, f)

    # edges: graph links (filter-only)
    for f in [
        "src_note_id",
        "dst_note_id",
        "src_chunk_id",
        "dst_chunk_id",
        "link_text",
        "relation",
    ]:
        q.create_keyword_index(edges, f)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of this script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default="http://127.0.0.1:6333", help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (z.B. 384 für MiniLM)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    return ap.parse_args()


def main() -> int:
    """Run the setup and return the process exit code (0 success, 1 error)."""
    args = parse_args()
    q = QdrantHTTP(args.qdrant_url)
    try:
        # Readiness probe is best-effort: a failure is reported as a warning
        # and the setup is attempted anyway.
        try:
            r = q.rq("GET", "/ready")
            if r.text.strip():
                print(f"[ready] {r.text.strip()}")
        except Exception as e:
            print(f"[warn] /ready nicht erreichbar oder kein Text: {e}")

        setup_mindnet_collections(q, prefix=args.prefix, dim=args.dim, distance=args.distance)

        cols = q.list_collections()
        print("\n[Info] Collections vorhanden:")
        print(json.dumps(cols, indent=2, ensure_ascii=False))
        return 0
    except Exception as e:
        # Top-level boundary: report the failure on stderr and signal it via
        # the exit code.
        print(f"[ERROR] {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())