mindnet/scripts/setup_mindnet_collections.py
Lars a7f3fc5784
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
scripts/setup_mindnet_collections.py aktualisiert
2025-09-01 14:56:07 +02:00

99 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Erzeugt Qdrant-Collections für das mindnet-Projekt:
- mindnet_chunks : semantische Suche über Markdown-Text-Chunks (384/Cosine)
- mindnet_notes : 1 Punkt pro Notiz (Metadaten, optional Titel-Embedding)
- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
Idempotent: legt nur an, wenn nicht vorhanden. Legt sinnvolle Payload-Indizes an.
"""
import os
import sys
import json
import argparse
import requests
# Base URL of the Qdrant REST API. The QDRANT_URL environment variable wins
# over the local default; main() rebinds this name from --qdrant-url.
DEFAULT_QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
def rq(method: str, path: str, **kwargs) -> requests.Response:
    """Send one HTTP request to the Qdrant REST API and return the response.

    Args:
        method: HTTP verb, e.g. "GET" or "PUT".
        path: Request path appended verbatim to DEFAULT_QDRANT_URL (whose
            trailing slashes are stripped first), e.g. "/collections".
        **kwargs: Forwarded to ``requests.request`` (e.g. ``json=...``).
            A ``timeout`` given here overrides the 15 second default.

    Returns:
        The response object; its status is guaranteed to be OK (< 400).

    Raises:
        RuntimeError: If the server answers with an error status. The message
            contains method, URL, status code and response body.
    """
    # setdefault instead of a hard-coded keyword: passing timeout= both
    # explicitly and via **kwargs would raise TypeError (duplicate keyword).
    kwargs.setdefault("timeout", 15)
    url = DEFAULT_QDRANT_URL.rstrip("/") + path
    response = requests.request(method, url, **kwargs)
    if not response.ok:
        raise RuntimeError(f"{method} {url} -> {response.status_code} {response.text}")
    return response
def collection_exists(name: str) -> bool:
    """Return whether the Qdrant collection *name* already exists.

    Args:
        name: Collection name to look up.

    Returns:
        True if ``GET /collections/{name}`` succeeds, False if the server
        answers 404 (collection unknown).

    Raises:
        RuntimeError: For any other error status, using the same message
            format as ``rq``.
    """
    url = DEFAULT_QDRANT_URL.rstrip("/") + f"/collections/{name}"
    # The request is made directly instead of through rq(): rq() raises on
    # every non-2xx status, which would turn the expected "not found" answer
    # into an exception and make first-time creation impossible.
    response = requests.request("GET", url, timeout=15)
    if response.status_code == 404:
        return False
    if not response.ok:
        raise RuntimeError(f"GET {url} -> {response.status_code} {response.text}")
    # Any successful answer means the collection exists, regardless of its
    # reported health status (it need not be "green" to be present).
    return True
def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
    """Create collection *name* unless it is already present.

    Args:
        name: Collection name.
        size: Vector dimension stored in the collection.
        distance: Distance metric name sent to Qdrant.

    Prints one status line either way; errors from the HTTP helper propagate.
    """
    if not collection_exists(name):
        vector_params = {"size": size, "distance": distance}
        rq("PUT", f"/collections/{name}", json={"vectors": vector_params})
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
    else:
        print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
def create_keyword_index(collection: str, field: str) -> None:
    """Request a keyword payload index on *field* of *collection* and log it."""
    endpoint = f"/collections/{collection}/index"
    rq("PUT", endpoint, json={"field_name": field, "field_schema": "keyword"})
    print(f"[+] Index keyword on {collection}.{field}")
def create_text_index(collection: str, field: str = "text") -> None:
    """Request a full-text payload index on *field* of *collection* and log it."""
    endpoint = f"/collections/{collection}/index"
    index_spec = {"field_name": field, "field_schema": {"type": "text"}}
    rq("PUT", endpoint, json=index_spec)
    print(f"[+] Index text on {collection}.{field}")
def main():
    """Create the project's Qdrant collections and payload indexes.

    Parses the command line (--qdrant-url, --prefix, --dim, --distance),
    creates the ``<prefix>_chunks``, ``<prefix>_notes`` and ``<prefix>_edges``
    collections, requests their payload indexes, and finally prints the list
    of collections reported by the server.

    Raises:
        RuntimeError: Propagated from the HTTP helpers on any error status.
    """
    # Must come before the first use of the name in this function: a `global`
    # statement placed after a use is a SyntaxError in Python 3, which would
    # prevent the whole module from compiling.
    global DEFAULT_QDRANT_URL

    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    args = ap.parse_args()

    # The HTTP helpers read the module-level URL, so the CLI value is
    # published there before any request is made.
    DEFAULT_QDRANT_URL = args.qdrant_url

    chunks = f"{args.prefix}_chunks"
    notes = f"{args.prefix}_notes"
    edges = f"{args.prefix}_edges"

    # 1) Create the collections.
    create_collection(chunks, size=args.dim, distance=args.distance)
    create_collection(notes, size=args.dim, distance=args.distance)
    create_collection(edges, size=1, distance=args.distance)  # dummy vector

    # 2) Define the payload indexes.
    # Chunks: frequently filtered fields plus full text.
    chunk_fields = (
        "note_id", "Status", "Typ", "title", "path",
        "tags", "Rolle", "links",
    )
    for field in chunk_fields:
        create_keyword_index(chunks, field)
    create_text_index(chunks, "text")  # word / phrase search

    # Notes: note metadata.
    note_fields = (
        "note_id", "title", "path", "Typ", "Status",
        "tags", "Rolle",
    )
    for field in note_fields:
        create_keyword_index(notes, field)

    # Edges: graph links (filter-only).
    edge_fields = (
        "src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id",
        "link_text", "relation",
    )
    for field in edge_fields:
        create_keyword_index(edges, field)

    # 3) Report which collections the server now has.
    r = rq("GET", "/collections")
    print("\n[Info] Collections vorhanden:")
    print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))
if __name__ == "__main__":
    # Script entry point: report any failure on stderr and exit with status 1.
    try:
        main()
    except Exception as exc:
        sys.stderr.write(f"[ERROR] {exc}\n")
        raise SystemExit(1)