mindnet/scripts/setup_mindnet_collections.py
Lars e19a5e1f5b
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
scripts/setup_mindnet_collections.py aktualisiert
2025-09-01 17:21:10 +02:00

98 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Richtet die Qdrant-Collections für das mindnet-Projekt ein (V2).
- mindnet_chunks : semantische Suche über Text-Chunks (384/Cosine)
- mindnet_notes : 1 Punkt pro Notiz (optional Titel-Embedding)
- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
Idempotent: legt nur an, wenn nicht vorhanden.
"""
import os
import sys
import json
import argparse
import requests
# Base URL of the Qdrant HTTP API. The QDRANT_URL environment variable wins;
# otherwise the local default address is used.
DEFAULT_QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
def rq(method: str, path: str, **kwargs) -> requests.Response:
    """Send one HTTP request to the Qdrant server and return the response.

    Args:
        method: HTTP verb handed to ``requests.request``.
        path: Path appended to ``DEFAULT_QDRANT_URL`` (trailing slashes of
            the base URL are stripped first).
        **kwargs: Forwarded unchanged to ``requests.request``.

    Returns:
        The response object when ``response.ok`` is true.

    Raises:
        RuntimeError: When the response is not OK; the message carries the
            method, URL, status code and response body.
    """
    # The base URL is read at call time, so a later rebinding of the
    # module-level name is picked up.
    base = DEFAULT_QDRANT_URL.rstrip("/")
    target = base + path
    response = requests.request(method, target, timeout=15, **kwargs)
    if response.ok:
        return response
    raise RuntimeError(f"{method} {target} -> {response.status_code} {response.text}")
def collection_exists(name: str) -> bool:
    """Return True when a collection called *name* is present on the server.

    The check reads the ``GET /collections`` listing instead of probing
    ``GET /collections/{name}``. ``rq`` raises ``RuntimeError`` for every
    non-OK response, so probing a missing collection would abort the script
    instead of yielding ``False``. Membership in the listing also does not
    depend on the collection's health status, so an existing collection is
    never mistaken for a missing one.

    Args:
        name: Exact collection name to look for.

    Returns:
        True if the listing contains an entry whose ``name`` equals *name*,
        otherwise False.

    Raises:
        RuntimeError: Propagated from ``rq`` when the listing request fails.
    """
    listing = rq("GET", "/collections").json()
    # Tolerate a missing or null "result"/"collections" member.
    collections = (listing.get("result") or {}).get("collections") or []
    return any(entry.get("name") == name for entry in collections)
def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
    """Create the collection *name* unless ``collection_exists`` reports it.

    Args:
        name: Collection name.
        size: Vector size sent in the ``vectors`` configuration.
        distance: Distance metric sent in the ``vectors`` configuration.

    Prints one status line: ``[+]`` after creating, ``[=]`` when skipping.
    """
    if not collection_exists(name):
        body = {"vectors": {"size": size, "distance": distance}}
        rq("PUT", f"/collections/{name}", json=body)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
        return
    print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
def create_keyword_index(collection: str, field: str) -> None:
    """Request a ``keyword`` payload index on *field* of *collection*.

    Sends ``PUT /collections/{collection}/index`` through ``rq`` and prints a
    confirmation line afterwards.
    """
    endpoint = f"/collections/{collection}/index"
    rq("PUT", endpoint, json={"field_name": field, "field_schema": "keyword"})
    print(f"[+] Index keyword on {collection}.{field}")
def create_text_index(collection: str, field: str = "text") -> None:
    """Request a ``text`` payload index on *field* of *collection*.

    Sends ``PUT /collections/{collection}/index`` through ``rq`` with a schema
    of ``{"type": "text"}`` and prints a confirmation line afterwards.
    """
    endpoint = f"/collections/{collection}/index"
    schema = {"type": "text"}
    rq("PUT", endpoint, json={"field_name": field, "field_schema": schema})
    print(f"[+] Index text on {collection}.{field}")
def main():
    """Parse the command line, create the three collections and their indexes.

    Steps:
      1. Create ``<prefix>_chunks`` and ``<prefix>_notes`` with ``--dim``
         sized vectors and ``<prefix>_edges`` with a size-1 dummy vector.
      2. Create the keyword/text payload indexes on each collection.
      3. Print the server's collection listing as JSON.

    Raises:
        RuntimeError: Propagated from ``rq`` when a request is answered with
            a non-OK status.
    """
    # The helper functions resolve the module-level ``rq``, which reads
    # DEFAULT_QDRANT_URL on every call. Rebinding that global is what makes
    # --qdrant-url apply to every request; a local variable or a nested
    # ``rq`` would only affect calls made directly inside this function.
    global DEFAULT_QDRANT_URL

    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    args = ap.parse_args()

    DEFAULT_QDRANT_URL = args.qdrant_url

    chunks = f"{args.prefix}_chunks"
    notes = f"{args.prefix}_notes"
    edges = f"{args.prefix}_edges"

    # 1) Create the collections.
    create_collection(chunks, size=args.dim, distance=args.distance)
    create_collection(notes, size=args.dim, distance=args.distance)
    create_collection(edges, size=1, distance=args.distance)  # dummy vector

    # 2) Create the payload indexes (same fields and order as before).
    for f in ("note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links"):
        create_keyword_index(chunks, f)
    create_text_index(chunks, "text")

    for f in ("note_id", "title", "path", "Typ", "Status", "tags", "Rolle"):
        create_keyword_index(notes, f)

    for f in ("src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"):
        create_keyword_index(edges, f)

    # 3) Print an overview of the collections on the server.
    r = rq("GET", "/collections")
    print("\n[Info] Collections vorhanden:")
    print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))