diff --git a/scripts/setup_mindnet_collections.py b/scripts/setup_mindnet_collections.py
index a17f3a2..3d5dbaa 100644
--- a/scripts/setup_mindnet_collections.py
+++ b/scripts/setup_mindnet_collections.py
@@ -1,13 +1,12 @@
 #!/usr/bin/env python3
 """
-Richtet Qdrant-Collections für dein Mindnet-Projekt ein (idempotent).
+Erzeugt Qdrant-Collections für das mindnet-Projekt:
 
-Erzeugt:
-- mindnet_chunks (size=384, distance=Cosine) -> semantische Suche über Text-Chunks
-- mindnet_notes (size=384, distance=Cosine) -> Notizebene / Facettierung
-- mindnet_edges (size=1, distance=Cosine) -> explizite Links (Dummy-Vektor; Filter via Payload)
+- mindnet_chunks : semantische Suche über Markdown-Text-Chunks (384/Cosine)
+- mindnet_notes : 1 Punkt pro Notiz (Metadaten, optional Titel-Embedding)
+- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
 
-Legt sinnvolle Payload-Indizes an (keyword/text).
+Idempotent: legt nur an, wenn nicht vorhanden. Legt sinnvolle Payload-Indizes an.
""" import os @@ -16,81 +15,80 @@ import json import argparse import requests -QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333") +DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333") -def api(method: str, path: str, **kwargs) -> requests.Response: - url = QDRANT_URL.rstrip("/") + path +def rq(method: str, path: str, **kwargs) -> requests.Response: + url = DEFAULT_QDRANT_URL.rstrip("/") + path r = requests.request(method, url, timeout=15, **kwargs) if not r.ok: raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}") return r -def exists(collection: str) -> bool: - r = api("GET", f"/collections/{collection}") - j = r.json() - return j.get("result", {}).get("status") == "green" +def collection_exists(name: str) -> bool: + r = rq("GET", f"/collections/{name}") + data = r.json() + return data.get("result", {}).get("status") == "green" -def create_collection(collection: str, size: int, distance: str) -> None: - if exists(collection): - print(f"[=] {collection} existiert bereits.") +def create_collection(name: str, size: int, distance: str = "Cosine") -> None: + if collection_exists(name): + print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.") return payload = {"vectors": {"size": size, "distance": distance}} - api("PUT", f"/collections/{collection}", json=payload) - print(f"[+] Collection {collection} angelegt (size={size}, distance={distance}).") + rq("PUT", f"/collections/{name}", json=payload) + print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).") -def keyword_index(collection: str, field: str) -> None: - api("PUT", f"/collections/{collection}/index", - json={"field_name": field, "field_schema": "keyword"}) - print(f"[+] keyword-Index: {collection}.{field}") +def create_keyword_index(collection: str, field: str) -> None: + payload = {"field_name": field, "field_schema": "keyword"} + rq("PUT", f"/collections/{collection}/index", json=payload) + print(f"[+] Index keyword on 
{collection}.{field}") -def text_index(collection: str, field: str = "text") -> None: - api("PUT", f"/collections/{collection}/index", - json={"field_name": field, "field_schema": {"type": "text"}}) - print(f"[+] text-Index: {collection}.{field}") +def create_text_index(collection: str, field: str = "text") -> None: + payload = {"field_name": field, "field_schema": {"type": "text"}} + rq("PUT", f"/collections/{collection}/index", json=payload) + print(f"[+] Index text on {collection}.{field}") def main(): - p = argparse.ArgumentParser() - p.add_argument("--qdrant-url", default=QDRANT_URL, help="z.B. http://127.0.0.1:6333") - p.add_argument("--prefix", default="mindnet", help="Präfix für Collections") - p.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (MiniLM: 384)") - p.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], - help="Distanzmetrik") - args = p.parse_args() + ap = argparse.ArgumentParser() + ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. 
http://127.0.0.1:6333") + ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)") + ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)") + ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik") + args = ap.parse_args() - global QDRANT_URL - QDRANT_URL = args.qdrant_url + global DEFAULT_QDRANT_URL + DEFAULT_QDRANT_URL = args.qdrant_url chunks = f"{args.prefix}_chunks" notes = f"{args.prefix}_notes" edges = f"{args.prefix}_edges" - # 1) Collections - create_collection(chunks, args.dim, args.distance) - create_collection(notes, args.dim, args.distance) - create_collection(edges, 1, args.distance) # Dummy-Vektor + # 1) Collections anlegen + create_collection(chunks, size=args.dim, distance=args.distance) + create_collection(notes, size=args.dim, distance=args.distance) + create_collection(edges, size=1, distance=args.distance) # Dummy-Vektor - # 2) Indizes - # mindnet_chunks + # 2) Indizes definieren + # mindnet_chunks: häufige Filter + Volltext for f in ["note_id", "Status", "Typ", "title", "path"]: - keyword_index(chunks, f) + create_keyword_index(chunks, f) for f in ["tags", "Rolle", "links"]: - keyword_index(chunks, f) - text_index(chunks, "text") + create_keyword_index(chunks, f) + create_text_index(chunks, "text") # Wort-/Phrasensuche - # mindnet_notes + # mindnet_notes: Metadaten der Notizen for f in ["note_id", "title", "path", "Typ", "Status"]: - keyword_index(notes, f) + create_keyword_index(notes, f) for f in ["tags", "Rolle"]: - keyword_index(notes, f) + create_keyword_index(notes, f) - # mindnet_edges + # mindnet_edges: Graph/Kanten (Filter-only) for f in ["src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"]: - keyword_index(edges, f) + create_keyword_index(edges, f) - # 3) Übersicht - coll = api("GET", "/collections").json().get("result", {}).get("collections", []) + # 3) Ausgabe + r = 
rq("GET", "/collections") print("\n[Info] Collections vorhanden:") - print(json.dumps(coll, indent=2, ensure_ascii=False)) + print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False)) if __name__ == "__main__": try: