#!/usr/bin/env python3
r"""
Set up the Qdrant collections for the mindnet project.

Collections (names are built as ``<prefix>_<suffix>``):
- <prefix>_chunks : semantic search over Markdown text chunks (vector: dim/distance)
- <prefix>_notes  : one point per note (metadata, optional title embedding)
- <prefix>_edges  : explicit link edges (dummy vector size=1; filtering via payload)

Properties:
- Idempotent: a collection is only created if it does not exist yet
- Creates payload indexes (keyword/text) for the commonly filtered fields
- No module-level state; the Qdrant URL is passed in explicitly

Example:
    python3 setup_mindnet_collections.py \
        --qdrant-url http://127.0.0.1:6333 \
        --prefix mindnet \
        --dim 384 \
        --distance Cosine
"""

import argparse
import json
import sys
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence

if TYPE_CHECKING:  # only needed for annotations; the runtime import lives in rq()
    import requests


# Payload fields that receive a keyword index, per collection.
_CHUNK_KEYWORD_FIELDS = (
    "note_id",
    "Status",
    "Typ",
    "title",
    "path",
    "tags",
    "Rolle",
    "links",
)
_NOTE_KEYWORD_FIELDS = (
    "note_id",
    "title",
    "path",
    "Typ",
    "Status",
    "tags",
    "Rolle",
)
_EDGE_KEYWORD_FIELDS = (
    "src_note_id",
    "dst_note_id",
    "src_chunk_id",
    "dst_chunk_id",
    "link_text",
    "relation",
)


class QdrantHTTPError(RuntimeError):
    """Raised by :meth:`QdrantHTTP.rq` when the server answers with a non-2xx status.

    Subclasses ``RuntimeError`` so existing ``except RuntimeError`` handlers
    keep working; ``status_code`` lets callers tell e.g. 404 from 500.
    """

    def __init__(self, message: str, status_code: int) -> None:
        super().__init__(message)
        self.status_code = status_code


@dataclass
class QdrantHTTP:
    """Minimal HTTP client for the Qdrant REST endpoints used by this script."""

    base_url: str
    timeout: float = 20.0  # seconds, applied to every request unless overridden

    def _url(self, path: str) -> str:
        """Join ``base_url`` and ``path`` without producing a double slash."""
        return self.base_url.rstrip("/") + path

    def rq(self, method: str, path: str, **kwargs: Any) -> "requests.Response":
        """Send one HTTP request and return the response.

        Extra keyword arguments are forwarded to ``requests.request``.

        Raises:
            QdrantHTTPError: if the response status is not OK.
        """
        # Imported here so argument parsing and stubbed clients work even
        # when ``requests`` is not installed.
        import requests

        url = self._url(path)
        kwargs.setdefault("timeout", self.timeout)
        r = requests.request(method, url, **kwargs)
        if not r.ok:
            raise QdrantHTTPError(
                f"{method} {url} -> {r.status_code} {r.text}", r.status_code
            )
        return r

    def collection_exists(self, name: str) -> bool:
        """Return True if collection ``name`` exists, False if the server says 404.

        Any successful response counts as "exists" regardless of the reported
        collection status; errors other than 404 are propagated.
        """
        try:
            self.rq("GET", f"/collections/{name}")
        except QdrantHTTPError as err:
            if err.status_code == 404:
                return False
            raise
        return True

    def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
        """Create collection ``name`` with a single vector config, unless it exists."""
        if self.collection_exists(name):
            print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.")
            return
        payload = {"vectors": {"size": size, "distance": distance}}
        self.rq("PUT", f"/collections/{name}", json=payload)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

    def create_keyword_index(self, collection: str, field: str) -> None:
        """Request a ``keyword`` payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": "keyword"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index keyword on {collection}.{field}")

    def create_text_index(self, collection: str, field: str = "text") -> None:
        """Request a full-text payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": {"type": "text"}}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index text on {collection}.{field}")

    def list_collections(self) -> List[Dict[str, Any]]:
        """Return the ``result.collections`` list from ``GET /collections``."""
        r = self.rq("GET", "/collections")
        return r.json().get("result", {}).get("collections", [])


def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
    """Create the three ``<prefix>_*`` collections and their payload indexes.

    Args:
        q: client used for all requests.
        prefix: collection name prefix.
        dim: vector size for the chunks and notes collections.
        distance: distance metric passed to every collection.
    """
    chunks = f"{prefix}_chunks"
    notes = f"{prefix}_notes"
    edges = f"{prefix}_edges"

    # 1) Create collections (edges only carries a size-1 dummy vector).
    q.create_collection(chunks, size=dim, distance=distance)
    q.create_collection(notes, size=dim, distance=distance)
    q.create_collection(edges, size=1, distance=distance)

    # 2) Payload indexes.
    # chunks: frequent filters plus full-text search on the text field
    for field in _CHUNK_KEYWORD_FIELDS:
        q.create_keyword_index(chunks, field)
    q.create_text_index(chunks, "text")

    # notes: note metadata
    for field in _NOTE_KEYWORD_FIELDS:
        q.create_keyword_index(notes, field)

    # edges: graph edges (filter-only)
    for field in _EDGE_KEYWORD_FIELDS:
        q.create_keyword_index(edges, field)


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line options; ``argv=None`` falls back to ``sys.argv[1:]``."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default="http://127.0.0.1:6333", help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (z.B. 384 für MiniLM)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    return ap.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the setup and return a process exit code (0 on success, 1 on error)."""
    args = parse_args(argv)
    q = QdrantHTTP(args.qdrant_url)

    try:
        # Best-effort readiness probe: a failure is reported but never aborts
        # the run. NOTE(review): Qdrant documents /readyz (not /ready) as its
        # readiness endpoint — confirm against the deployed server version.
        try:
            ready = q.rq("GET", "/readyz")
        except Exception as e:
            print(f"[warn] /readyz nicht erreichbar oder kein Text: {e}")
        else:
            ready_text = ready.text.strip()
            if ready_text:
                print(f"[ready] {ready_text}")

        setup_mindnet_collections(q, prefix=args.prefix, dim=args.dim, distance=args.distance)

        cols = q.list_collections()
        print("\n[Info] Collections vorhanden:")
        print(json.dumps(cols, indent=2, ensure_ascii=False))
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"[ERROR] {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())