mindnet/scripts/setup_mindnet_collections.py
Lars 12043b7752
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
scripts/setup_mindnet_collections.py aktualisiert
2025-09-01 15:00:21 +02:00

97 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Set up the Qdrant collections for the mindnet project.
- mindnet_chunks : semantic search over text chunks (384/Cosine)
- mindnet_notes : 1 point per note (optional title embedding)
- mindnet_edges : explicit link edges (dummy vector size=1; filtering via payload)
Idempotent: collections are only created if they do not exist yet.
"""
import os
import sys
import json
import argparse
import requests
# Base URL of the Qdrant REST API. Taken from the QDRANT_URL environment
# variable when set, otherwise the local default; main() overwrites it with
# the value of the --qdrant-url argument.
DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
def rq(method: str, path: str, **kwargs) -> requests.Response:
    """Send one HTTP request to the Qdrant REST API and return the response.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"PUT"``.
        path: API path appended to the base URL, e.g. ``"/collections"``.
        **kwargs: Forwarded unchanged to ``requests.request`` (``json=``,
            ``params=``, ...). A ``timeout`` given here replaces the
            15 second default.

    Returns:
        The response of a request that finished with a non-error status.

    Raises:
        RuntimeError: If the server answers with an error status; the message
            carries method, URL, status code and response body.
    """
    # Apply the default via setdefault: passing timeout=15 explicitly next to
    # **kwargs raises TypeError as soon as a caller supplies its own timeout.
    kwargs.setdefault("timeout", 15)
    # The module-level URL is read at call time so an override made by main()
    # is honoured; a trailing slash is dropped to avoid "//" in the final URL.
    url = DEFAULT_QDRANT_URL.rstrip("/") + path
    r = requests.request(method, url, **kwargs)
    if not r.ok:
        raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
    return r
def collection_exists(name: str) -> bool:
    """Report whether a Qdrant collection called *name* exists.

    Args:
        name: Name of the collection to look up.

    Returns:
        ``True`` if the server knows the collection, ``False`` if it answers
        the lookup with HTTP 404.

    Raises:
        RuntimeError: If the server answers with any other error status.
    """
    url = DEFAULT_QDRANT_URL.rstrip("/") + f"/collections/{name}"
    # Query directly instead of going through rq(): rq() turns every error
    # status into an exception, but 404 is the expected answer for a missing
    # collection and must map to False so that the collection can be created.
    r = requests.get(url, timeout=15)
    if r.status_code == 404:
        return False
    if not r.ok:
        raise RuntimeError(f"GET {url} -> {r.status_code} {r.text}")
    # Any successful answer means the collection exists, whatever its current
    # health status is; a collection that is not "green" (e.g. still being
    # optimized) must not be reported as missing.
    return True
def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
    """Create the collection *name* unless it is already present.

    Args:
        name: Name of the collection.
        size: Vector size sent in the collection's vector configuration.
        distance: Distance metric sent in the vector configuration.
    """
    if not collection_exists(name):
        vector_config = {"size": size, "distance": distance}
        rq("PUT", f"/collections/{name}", json={"vectors": vector_config})
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
    else:
        # Already there: report it and leave the collection untouched.
        print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
def create_keyword_index(collection: str, field: str) -> None:
    """Request a ``keyword`` payload index on *field* of *collection*.

    Args:
        collection: Name of the collection that receives the index.
        field: Payload field name to index.
    """
    rq(
        "PUT",
        f"/collections/{collection}/index",
        json={"field_name": field, "field_schema": "keyword"},
    )
    print(f"[+] Index keyword on {collection}.{field}")
def create_text_index(collection: str, field: str = "text") -> None:
    """Request a ``text`` payload index on *field* of *collection*.

    Args:
        collection: Name of the collection that receives the index.
        field: Payload field name to index; defaults to ``"text"``.
    """
    index_request = {"field_name": field, "field_schema": {"type": "text"}}
    endpoint = f"/collections/{collection}/index"
    rq("PUT", endpoint, json=index_request)
    print(f"[+] Index text on {collection}.{field}")
def main():
    """Parse the command line, then create the collections and their indexes.

    Creates ``<prefix>_chunks``, ``<prefix>_notes`` and ``<prefix>_edges``,
    requests the payload indexes on each of them and finally prints the list
    of collections returned by the server.
    """
    # The declaration has to precede every use of the name in this function
    # (it is read below as the argparse default); Python rejects a `global`
    # statement that follows a use of the name with a SyntaxError.
    global DEFAULT_QDRANT_URL

    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    args = ap.parse_args()

    # Override the module-level Qdrant URL so all later requests use the
    # value given on the command line.
    DEFAULT_QDRANT_URL = args.qdrant_url

    chunks = f"{args.prefix}_chunks"
    notes = f"{args.prefix}_notes"
    edges = f"{args.prefix}_edges"

    # 1) Create the collections.
    create_collection(chunks, size=args.dim, distance=args.distance)
    create_collection(notes, size=args.dim, distance=args.distance)
    create_collection(edges, size=1, distance=args.distance)  # dummy vector

    # 2) Payload indexes, requested in the same order as before.
    for field in ("note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links"):
        create_keyword_index(chunks, field)
    create_text_index(chunks, "text")

    for field in ("note_id", "title", "path", "Typ", "Status", "tags", "Rolle"):
        create_keyword_index(notes, field)

    for field in ("src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"):
        create_keyword_index(edges, field)

    # 3) Report which collections the server lists now.
    r = rq("GET", "/collections")
    print("\n[Info] Collections vorhanden:")
    print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))
if __name__ == "__main__":
    # Script entry point: any exception escaping main() is reported as a
    # single line on stderr and turned into exit status 1.
    try:
        main()
    except Exception as exc:
        sys.stderr.write(f"[ERROR] {exc}\n")
        sys.exit(1)