mindnet/scripts/setup_mindnet_collections.py
Lars 1c484e1ca0
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 1s
scripts/setup_mindnet_collections.py aktualisiert
2025-09-01 17:27:07 +02:00

147 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Richtet die Qdrant-Collections für das mindnet-Projekt ein.
Collections:
- mindnet_chunks : semantische Suche über Markdown-Text-Chunks (Vektor: dim/Cosine)
- mindnet_notes : 1 Punkt pro Notiz (Metadaten, optional Titel-Embedding)
- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
Eigenschaften:
- Idempotent: legt nur an, wenn eine Collection noch nicht existiert
- Legt sinnvolle Payload-Indizes an (keyword/text)
- Ohne "global"-Seiteneffekte; Qdrant-URL wird sauber übergeben
Aufrufbeispiel:
python3 setup_mindnet_collections.py \
--qdrant-url http://127.0.0.1:6333 \
--prefix mindnet \
--dim 384 \
--distance Cosine
"""
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import dataclass
from typing import Any, Dict
import requests
@dataclass
class QdrantHTTP:
    """Minimal HTTP client for the Qdrant REST endpoints used by this script.

    Attributes:
        base_url: Root URL of the Qdrant instance, e.g. ``http://127.0.0.1:6333``.
            A trailing slash is tolerated.
    """

    base_url: str

    def _url(self, path: str) -> str:
        """Return the absolute URL for ``path`` (which must start with ``/``)."""
        return self.base_url.rstrip("/") + path

    def _send(self, method: str, path: str, **kwargs) -> requests.Response:
        """Send one request and return the response without checking its status.

        ``kwargs`` are forwarded to ``requests.request``. A 20 second timeout is
        applied unless the caller supplies its own ``timeout``.
        """
        kwargs.setdefault("timeout", 20)
        return requests.request(method, self._url(path), **kwargs)

    def rq(self, method: str, path: str, **kwargs) -> requests.Response:
        """Send a request and return the response.

        Raises:
            RuntimeError: if the response status is not successful (``r.ok`` is
                false). The message contains method, URL, status and body.
        """
        r = self._send(method, path, **kwargs)
        if not r.ok:
            raise RuntimeError(f"{method} {self._url(path)} -> {r.status_code} {r.text}")
        return r

    def collection_exists(self, name: str) -> bool:
        """Return True if the collection ``name`` exists, False on HTTP 404.

        The health status of the collection is deliberately ignored: a
        collection that exists but is not "green" still exists and must not be
        created a second time.

        Raises:
            RuntimeError: for any unsuccessful response other than 404.
        """
        path = f"/collections/{name}"
        r = self._send("GET", path)
        if r.status_code == 404:
            # Going through rq() here would raise, so a missing collection
            # could never be reported as "does not exist".
            return False
        if not r.ok:
            raise RuntimeError(f"GET {self._url(path)} -> {r.status_code} {r.text}")
        return True

    def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
        """Create the collection ``name`` unless it already exists.

        Args:
            name: Collection name.
            size: Vector dimension.
            distance: Distance metric name sent to Qdrant.
        """
        if self.collection_exists(name):
            print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
            return
        payload = {"vectors": {"size": size, "distance": distance}}
        self.rq("PUT", f"/collections/{name}", json=payload)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

    def create_keyword_index(self, collection: str, field: str) -> None:
        """Request a ``keyword`` payload index on ``collection``.``field``."""
        payload = {"field_name": field, "field_schema": "keyword"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index keyword on {collection}.{field}")

    def create_text_index(self, collection: str, field: str = "text") -> None:
        """Request a full-text payload index on ``collection``.``field``."""
        payload = {"field_name": field, "field_schema": {"type": "text"}}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index text on {collection}.{field}")

    def list_collections(self) -> list[Dict[str, Any]]:
        """Return the ``result.collections`` entry of ``GET /collections``.

        Falls back to an empty list when the response lacks that entry.
        """
        r = self.rq("GET", "/collections")
        return r.json().get("result", {}).get("collections", [])
def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
    """Create the three ``<prefix>_*`` collections and their payload indexes.

    Args:
        q: Client used for all collection and index requests.
        prefix: Name prefix; collections are ``<prefix>_chunks``,
            ``<prefix>_notes`` and ``<prefix>_edges``.
        dim: Vector size for the chunks and notes collections.
        distance: Distance metric passed to every collection.
    """
    chunks, notes, edges = (f"{prefix}_{suffix}" for suffix in ("chunks", "notes", "edges"))

    # 1) Collections first; edges only carries a dummy vector of size 1.
    for collection, vector_size in ((chunks, dim), (notes, dim), (edges, 1)):
        q.create_collection(collection, size=vector_size, distance=distance)

    # 2) Payload indexes.
    # Chunks: frequently filtered fields, then full-text search on the text field.
    chunk_keywords = ("note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links")
    for field in chunk_keywords:
        q.create_keyword_index(chunks, field)
    q.create_text_index(chunks, "text")

    # Notes carry note metadata; edges are filter-only graph links.
    remaining = (
        (notes, ("note_id", "title", "path", "Typ", "Status", "tags", "Rolle")),
        (
            edges,
            (
                "src_note_id",
                "dst_note_id",
                "src_chunk_id",
                "dst_chunk_id",
                "link_text",
                "relation",
            ),
        ),
    )
    for collection, fields in remaining:
        for field in fields:
            q.create_keyword_index(collection, field)
def parse_args() -> argparse.Namespace:
    """Parse the command line and return the resulting namespace.

    Options: ``--qdrant-url``, ``--prefix``, ``--dim`` (int) and ``--distance``
    (one of Cosine, Euclid, Dot); every option has a default.
    """
    option_specs = (
        ("--qdrant-url", {"default": "http://127.0.0.1:6333", "help": "z.B. http://127.0.0.1:6333"}),
        ("--prefix", {"default": "mindnet", "help": "Collection-Präfix (default: mindnet)"}),
        ("--dim", {"type": int, "default": 384, "help": "Embedding-Dimension (z.B. 384 für MiniLM)"}),
        (
            "--distance",
            {"default": "Cosine", "choices": ["Cosine", "Euclid", "Dot"], "help": "Distanzmetrik"},
        ),
    )
    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main() -> int:
    """Run the collection setup and return a process exit code.

    Returns:
        0 on success, 1 if any step after the readiness probe raised; the
        error is printed to stderr.
    """
    args = parse_args()
    q = QdrantHTTP(args.qdrant_url)
    try:
        # Best-effort readiness probe: a failure only produces a warning.
        # Qdrant serves its readiness check at /readyz; /ready is not a Qdrant
        # endpoint, so probing it always ended in the warning branch.
        ready_path = "/readyz"
        try:
            r = q.rq("GET", ready_path)
        except Exception as e:  # deliberately broad: the probe must never abort setup
            print(f"[warn] {ready_path} nicht erreichbar oder kein Text: {e}")
        else:
            ready_text = r.text.strip()
            if ready_text:
                print(f"[ready] {ready_text}")

        setup_mindnet_collections(q, prefix=args.prefix, dim=args.dim, distance=args.distance)

        cols = q.list_collections()
        print("\n[Info] Collections vorhanden:")
        print(json.dumps(cols, indent=2, ensure_ascii=False))
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"[ERROR] {e}", file=sys.stderr)
        return 1
# Script entry point: hand main()'s return value to the interpreter as exit status.
if __name__ == "__main__":
    sys.exit(main())