mindnet/scripts/setup_mindnet_collections.py
Lars e9532e8878
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
script_Überprüfung und Kommentarheader
2025-12-28 10:40:28 +01:00

187 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/setup_mindnet_collections.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP-14/WP-15b)
Zweck:
-------
Richtet die Qdrant-Collections für das mindnet-Projekt ein.
Legt Collections und Payload-Indizes an (idempotent).
Funktionsweise:
---------------
1. Prüft Qdrant-Verfügbarkeit (optional /ready Endpoint)
2. Legt drei Collections an:
- {prefix}_chunks: Semantische Suche über Text-Chunks (Vektor: dim/Cosine)
- {prefix}_notes: Metadaten pro Notiz (Vektor: dim/Cosine)
- {prefix}_edges: Link-Kanten (Dummy-Vektor size=1, Filter über Payload)
3. Richtet Payload-Indizes ein:
- Keyword-Indizes für häufige Filter-Felder
- Text-Index für Volltextsuche (chunks.text)
Ergebnis-Interpretation:
------------------------
- Ausgabe: Status pro Collection ([+] angelegt, [=] existiert bereits)
- Abschluss: JSON-Liste aller vorhandenen Collections
- Exit-Code 0: Erfolgreich
- Exit-Code 1: Fehler (z.B. Qdrant nicht erreichbar)
Verwendung:
-----------
- Initial-Setup einer neuen mindnet-Instanz
- Nach Qdrant-Reset oder Migration
- Validierung der Collection-Struktur
Hinweise:
---------
- Idempotent: Überspringt existierende Collections
- Nutzt HTTP-API direkt (kein qdrant-client)
- Legt nur Basis-Indizes an (erweiterte Indizes via ensure_payload_indexes)
Aufruf:
-------
python3 -m scripts.setup_mindnet_collections --qdrant-url http://127.0.0.1:6333 --prefix mindnet --dim 768
python3 -m scripts.setup_mindnet_collections --prefix mindnet_dev --dim 768 --distance Cosine
Parameter:
----------
--qdrant-url URL Qdrant-URL (Default: http://127.0.0.1:6333)
--prefix TEXT Collection-Präfix (Default: mindnet)
--dim INT Vektor-Dimension (Default: 384, empfohlen: 768 für nomic)
--distance METRIC Cosine | Euclid | Dot (Default: Cosine)
Änderungen:
-----------
v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung
- Dokumentation aktualisiert
v1.0.0: Initial Release
"""
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import dataclass
from typing import Any, Dict
import requests
@dataclass
class QdrantHTTP:
    """Minimal Qdrant REST client built directly on ``requests``.

    Only the calls needed to set up collections and payload indexes are
    provided. Every failed request (non-OK HTTP status) is reported as a
    ``RuntimeError`` carrying method, URL, status code and response text.
    """

    # Root URL of the Qdrant instance; a trailing slash is tolerated.
    base_url: str
    # Per-request timeout in seconds, used unless the caller passes ``timeout``.
    timeout: float = 20

    def _url(self, path: str) -> str:
        """Join ``base_url`` and ``path`` without doubling the slash."""
        return self.base_url.rstrip("/") + path

    def _send(self, method: str, path: str, **kwargs: Any) -> "requests.Response":
        """Send one request and return the response without checking its status."""
        kwargs.setdefault("timeout", self.timeout)
        return requests.request(method, self._url(path), **kwargs)

    def _error(self, method: str, path: str, response: "requests.Response") -> RuntimeError:
        """Build the ``RuntimeError`` used for every failed request."""
        return RuntimeError(
            f"{method} {self._url(path)} -> {response.status_code} {response.text}"
        )

    def rq(self, method: str, path: str, **kwargs: Any) -> "requests.Response":
        """Send a request and return the response.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"PUT"``.
            path: Path appended to ``base_url`` (starting with ``/``).
            **kwargs: Forwarded to ``requests.request``; ``timeout`` defaults
                to ``self.timeout`` when not given.

        Raises:
            RuntimeError: If the response status is not OK.
        """
        response = self._send(method, path, **kwargs)
        if not response.ok:
            raise self._error(method, path, response)
        return response

    def collection_exists(self, name: str) -> bool:
        """Return whether the collection ``name`` exists.

        A 404 answer means "does not exist" and yields ``False``; it must not
        go through ``rq``, which would turn it into an error and make it
        impossible to create a collection that is not there yet. Any other
        successful answer counts as existing, whatever health status the
        collection reports.

        Raises:
            RuntimeError: For any failed request other than 404.
        """
        path = f"/collections/{name}"
        response = self._send("GET", path)
        if response.status_code == 404:
            return False
        if not response.ok:
            raise self._error("GET", path, response)
        return True

    def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
        """Create collection ``name`` unless it already exists.

        Args:
            name: Collection name.
            size: Vector size sent in the ``vectors`` configuration.
            distance: Distance metric sent in the ``vectors`` configuration.

        Raises:
            RuntimeError: If the existence check or the creation request fails.
        """
        if self.collection_exists(name):
            print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
            return
        payload = {"vectors": {"size": size, "distance": distance}}
        self.rq("PUT", f"/collections/{name}", json=payload)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

    def create_keyword_index(self, collection: str, field: str) -> None:
        """Request a ``keyword`` payload index on ``collection.field``.

        Raises:
            RuntimeError: If the request fails.
        """
        payload = {"field_name": field, "field_schema": "keyword"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index keyword on {collection}.{field}")

    def create_text_index(self, collection: str, field: str = "text") -> None:
        """Request a ``text`` payload index on ``collection.field``.

        Raises:
            RuntimeError: If the request fails.
        """
        payload = {"field_name": field, "field_schema": {"type": "text"}}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index text on {collection}.{field}")

    def list_collections(self) -> list[Dict[str, Any]]:
        """Return the ``result.collections`` entry of ``GET /collections``.

        The return value is a list (empty when the entry is missing).

        Raises:
            RuntimeError: If the request fails.
        """
        response = self.rq("GET", "/collections")
        return response.json().get("result", {}).get("collections", [])
def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
    """Create the mindnet collections and their payload indexes via ``q``.

    Three collections named ``{prefix}_chunks``, ``{prefix}_notes`` and
    ``{prefix}_edges`` are requested, followed by keyword indexes on each and
    a text index on the ``text`` field of the chunks collection.

    Args:
        q: Client used for all requests.
        prefix: Prefix for the three collection names.
        dim: Vector size for the chunks and notes collections.
        distance: Distance metric passed to every collection.
    """
    chunks_name, notes_name, edges_name = (
        f"{prefix}_{suffix}" for suffix in ("chunks", "notes", "edges")
    )

    # 1) Collections. Edges are filtered by payload only, so they carry a
    #    dummy vector of size 1.
    vector_sizes = ((chunks_name, dim), (notes_name, dim), (edges_name, 1))
    for collection, vector_size in vector_sizes:
        q.create_collection(collection, size=vector_size, distance=distance)

    # 2) Indexes.
    # Chunks: frequently used filter fields plus full-text search.
    chunk_keywords = ("note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links")
    for field in chunk_keywords:
        q.create_keyword_index(chunks_name, field)
    q.create_text_index(chunks_name, "text")

    # Notes: note metadata. Edges: graph links (filter-only).
    remaining_keywords = (
        (notes_name, ("note_id", "title", "path", "Typ", "Status", "tags", "Rolle")),
        (
            edges_name,
            (
                "src_note_id",
                "dst_note_id",
                "src_chunk_id",
                "dst_chunk_id",
                "link_text",
                "relation",
            ),
        ),
    )
    for collection, fields in remaining_keywords:
        for field in fields:
            q.create_keyword_index(collection, field)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the setup script.

    Returns:
        Namespace with ``qdrant_url``, ``prefix``, ``dim`` (int) and
        ``distance`` (one of ``Cosine``, ``Euclid``, ``Dot``).
    """
    parser = argparse.ArgumentParser()
    # One entry per option: flag followed by its add_argument() settings.
    option_specs = (
        ("--qdrant-url", {"default": "http://127.0.0.1:6333", "help": "z.B. http://127.0.0.1:6333"}),
        ("--prefix", {"default": "mindnet", "help": "Collection-Präfix (default: mindnet)"}),
        ("--dim", {"type": int, "default": 384, "help": "Embedding-Dimension (z.B. 384 für MiniLM)"}),
        (
            "--distance",
            {"default": "Cosine", "choices": ["Cosine", "Euclid", "Dot"], "help": "Distanzmetrik"},
        ),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
def _report_readiness(q: QdrantHTTP) -> None:
    """Probe ``/ready`` and print the answer; failures only produce a warning."""
    # NOTE(review): Qdrant's documentation lists /readyz (plus /healthz and
    # /livez) as health endpoints - confirm that /ready exists on the target
    # version, otherwise this probe always ends in the warning branch.
    try:
        answer = q.rq("GET", "/ready").text.strip()
        if answer:
            print(f"[ready] {answer}")
    except Exception as probe_error:
        print(f"[warn] /ready nicht erreichbar oder kein Text: {probe_error}")


def main() -> int:
    """Run the collection setup and return the process exit code.

    Returns:
        0 after the collections were set up and listed, 1 if any step raised
        (the error is printed to stderr).
    """
    options = parse_args()
    client = QdrantHTTP(options.qdrant_url)
    try:
        # Readiness is optional: errors are reported but do not stop the setup.
        _report_readiness(client)
        setup_mindnet_collections(
            client,
            prefix=options.prefix,
            dim=options.dim,
            distance=options.distance,
        )
        existing = client.list_collections()
        print("\n[Info] Collections vorhanden:")
        print(json.dumps(existing, indent=2, ensure_ascii=False))
    except Exception as error:
        print(f"[ERROR] {error}", file=sys.stderr)
        return 1
    return 0
# Script entry point: the return value of main() becomes the exit code.
if __name__ == "__main__":
    sys.exit(main())