scripts/setup_mindnet_collections.py aktualisiert

2025-09-01 14:56:07 +02:00 · 2025-09-01 14:56:07 +02:00 · a7f3fc5784
commit a7f3fc5784
parent a5260a2aad
1 changed files with 50 additions and 52 deletions
--- a/scripts/setup_mindnet_collections.py
+++ b/scripts/setup_mindnet_collections.py
@@ -1,13 +1,12 @@
 #!/usr/bin/env python3
 """
-Richtet Qdrant-Collections für dein Mindnet-Projekt ein (idempotent).
+Erzeugt Qdrant-Collections für das mindnet-Projekt:

-Erzeugt:
- mindnet_chunks  (size=384, distance=Cosine)  -> semantische Suche über Text-Chunks
- mindnet_notes   (size=384, distance=Cosine)  -> Notizebene / Facettierung
- mindnet_edges   (size=1,   distance=Cosine)  -> explizite Links (Dummy-Vektor; Filter via Payload)
+- mindnet_chunks : semantische Suche über Markdown-Text-Chunks (384/Cosine)
+- mindnet_notes  : 1 Punkt pro Notiz (Metadaten, optional Titel-Embedding)
+- mindnet_edges  : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)

-Legt sinnvolle Payload-Indizes an (keyword/text).
+Idempotent: legt nur an, wenn nicht vorhanden. Legt sinnvolle Payload-Indizes an.
 """

 import os
@@ -16,81 +15,80 @@ import json
 import argparse
 import requests

-QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
+DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")

-def api(method: str, path: str, **kwargs) -> requests.Response:
-    url = QDRANT_URL.rstrip("/") + path
+def rq(method: str, path: str, **kwargs) -> requests.Response:
+    url = DEFAULT_QDRANT_URL.rstrip("/") + path
    r = requests.request(method, url, timeout=15, **kwargs)
    if not r.ok:
        raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
    return r

-def exists(collection: str) -> bool:
-    r = api("GET", f"/collections/{collection}")
-    j = r.json()
-    return j.get("result", {}).get("status") == "green"
+def collection_exists(name: str) -> bool:
+    r = rq("GET", f"/collections/{name}")
+    data = r.json()
+    return data.get("result", {}).get("status") == "green"

-def create_collection(collection: str, size: int, distance: str) -> None:
-    if exists(collection):
-        print(f"[=] {collection} existiert bereits.")
+def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
+    if collection_exists(name):
+        print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.")
        return
    payload = {"vectors": {"size": size, "distance": distance}}
-    api("PUT", f"/collections/{collection}", json=payload)
-    print(f"[+] Collection {collection} angelegt (size={size}, distance={distance}).")
+    rq("PUT", f"/collections/{name}", json=payload)
+    print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

-def keyword_index(collection: str, field: str) -> None:
-    api("PUT", f"/collections/{collection}/index",
-        json={"field_name": field, "field_schema": "keyword"})
-    print(f"[+] keyword-Index: {collection}.{field}")
+def create_keyword_index(collection: str, field: str) -> None:
+    payload = {"field_name": field, "field_schema": "keyword"}
+    rq("PUT", f"/collections/{collection}/index", json=payload)
+    print(f"[+] Index keyword on {collection}.{field}")

-def text_index(collection: str, field: str = "text") -> None:
-    api("PUT", f"/collections/{collection}/index",
-        json={"field_name": field, "field_schema": {"type": "text"}})
-    print(f"[+] text-Index: {collection}.{field}")
+def create_text_index(collection: str, field: str = "text") -> None:
+    payload = {"field_name": field, "field_schema": {"type": "text"}}
+    rq("PUT", f"/collections/{collection}/index", json=payload)
+    print(f"[+] Index text on {collection}.{field}")

 def main():
-    p = argparse.ArgumentParser()
-    p.add_argument("--qdrant-url", default=QDRANT_URL, help="z.B. http://127.0.0.1:6333")
-    p.add_argument("--prefix", default="mindnet", help="Präfix für Collections")
-    p.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (MiniLM: 384)")
-    p.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"],
-                   help="Distanzmetrik")
-    args = p.parse_args()
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
+    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
+    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
+    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
+    args = ap.parse_args()

-    global QDRANT_URL
-    QDRANT_URL = args.qdrant_url
+    global DEFAULT_QDRANT_URL
+    DEFAULT_QDRANT_URL = args.qdrant_url

    chunks = f"{args.prefix}_chunks"
    notes  = f"{args.prefix}_notes"
    edges  = f"{args.prefix}_edges"

-    # 1) Collections
-    create_collection(chunks, args.dim, args.distance)
-    create_collection(notes,  args.dim, args.distance)
-    create_collection(edges,  1,         args.distance)  # Dummy-Vektor
+    # 1) Collections anlegen
+    create_collection(chunks, size=args.dim, distance=args.distance)
+    create_collection(notes,  size=args.dim, distance=args.distance)
+    create_collection(edges,  size=1,       distance=args.distance)  # Dummy-Vektor

-    # 2) Indizes
-    # mindnet_chunks
+    # 2) Indizes definieren
+    # mindnet_chunks: häufige Filter + Volltext
    for f in ["note_id", "Status", "Typ", "title", "path"]:
-        keyword_index(chunks, f)
+        create_keyword_index(chunks, f)
    for f in ["tags", "Rolle", "links"]:
-        keyword_index(chunks, f)
-    text_index(chunks, "text")
+        create_keyword_index(chunks, f)
+    create_text_index(chunks, "text")  # Wort-/Phrasensuche

-    # mindnet_notes
+    # mindnet_notes: Metadaten der Notizen
    for f in ["note_id", "title", "path", "Typ", "Status"]:
-        keyword_index(notes, f)
+        create_keyword_index(notes, f)
    for f in ["tags", "Rolle"]:
-        keyword_index(notes, f)
+        create_keyword_index(notes, f)

-    # mindnet_edges
+    # mindnet_edges: Graph/Kanten (Filter-only)
    for f in ["src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"]:
-        keyword_index(edges, f)
+        create_keyword_index(edges, f)

-    # 3) Übersicht
-    coll = api("GET", "/collections").json().get("result", {}).get("collections", [])
+    # 3) Ausgabe
+    r = rq("GET", "/collections")
    print("\n[Info] Collections vorhanden:")
-    print(json.dumps(coll, indent=2, ensure_ascii=False))
+    print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))

 if __name__ == "__main__":
    try: