scripts/setup_mindnet_collections.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-09-01 14:56:07 +02:00
parent a5260a2aad
commit a7f3fc5784

View File

@ -1,13 +1,12 @@
#!/usr/bin/env python3
"""
Create the Qdrant collections for the mindnet project (idempotent):

- mindnet_chunks : semantic search over markdown text chunks (size=384, Cosine)
- mindnet_notes  : one point per note (metadata, optional title embedding)
- mindnet_edges  : explicit link edges (dummy vector size=1; filtering via payload)

Also creates sensible payload indexes (keyword/text). Only creates what is
not already present.
"""
import os
import json
import argparse

import requests

# Default Qdrant endpoint; can be overridden per call via --qdrant-url (see main()).
DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
def rq(method: str, path: str, timeout: float = 15, **kwargs) -> requests.Response:
    """Issue an HTTP request against the configured Qdrant instance.

    Parameters:
        method  -- HTTP verb ("GET", "PUT", ...)
        path    -- URL path appended to DEFAULT_QDRANT_URL (e.g. "/collections")
        timeout -- request timeout in seconds (was hard-coded before; passing
                   timeout= through **kwargs used to raise a duplicate-keyword
                   TypeError)
        kwargs  -- forwarded to requests.request (e.g. json=payload)

    Returns the Response on success.
    Raises RuntimeError on any non-OK (>= 400) status, embedding method, URL,
    status code and body text in the message.
    """
    url = DEFAULT_QDRANT_URL.rstrip("/") + path
    r = requests.request(method, url, timeout=timeout, **kwargs)
    if not r.ok:
        raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
    return r
def collection_exists(name: str) -> bool:
    """Return True if collection `name` exists and reports status "green".

    Qdrant answers 404 for an unknown collection, which rq() turns into a
    RuntimeError. Previously that exception propagated, so the script crashed
    on a fresh instance instead of creating the collection — treat any request
    failure as "does not exist".
    """
    try:
        r = rq("GET", f"/collections/{name}")
    except RuntimeError:
        return False
    data = r.json()
    return data.get("result", {}).get("status") == "green"
def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
    """Create collection `name` unless it already exists (idempotent).

    Parameters:
        name     -- collection name
        size     -- vector dimensionality
        distance -- distance metric ("Cosine", "Euclid" or "Dot")
    """
    if collection_exists(name):
        # NOTE(review): the original message lost a dash to an encoding glitch
        # ("bereits  überspringe"); restored as an em dash here.
        print(f"[=] Collection '{name}' existiert bereits — überspringe Anlage.")
        return
    payload = {"vectors": {"size": size, "distance": distance}}
    rq("PUT", f"/collections/{name}", json=payload)
    print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
def create_keyword_index(collection: str, field: str) -> None:
    """Add a keyword payload index on `field` of `collection`."""
    rq(
        "PUT",
        f"/collections/{collection}/index",
        json={"field_name": field, "field_schema": "keyword"},
    )
    print(f"[+] Index keyword on {collection}.{field}")
def create_text_index(collection: str, field: str = "text") -> None:
    """Add a full-text payload index on `field` of `collection` (default: "text")."""
    rq(
        "PUT",
        f"/collections/{collection}/index",
        json={"field_name": field, "field_schema": {"type": "text"}},
    )
    print(f"[+] Index text on {collection}.{field}")
def main():
    """CLI entry point: create the mindnet collections and payload indexes.

    Parses --qdrant-url/--prefix/--dim/--distance, creates the three
    collections idempotently, defines payload indexes, then prints the
    resulting collection list.
    """
    # BUGFIX: `global` must precede ANY use of the name in this scope. The
    # previous placement — after `ap.add_argument(default=DEFAULT_QDRANT_URL)`
    # had already read the name — is a SyntaxError ("name 'DEFAULT_QDRANT_URL'
    # is used prior to global declaration").
    global DEFAULT_QDRANT_URL

    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    args = ap.parse_args()

    DEFAULT_QDRANT_URL = args.qdrant_url

    chunks = f"{args.prefix}_chunks"
    notes = f"{args.prefix}_notes"
    edges = f"{args.prefix}_edges"

    # 1) Create collections (idempotent).
    create_collection(chunks, size=args.dim, distance=args.distance)
    create_collection(notes, size=args.dim, distance=args.distance)
    create_collection(edges, size=1, distance=args.distance)  # dummy vector, filter-only

    # 2) Payload indexes.
    # mindnet_chunks: frequent filter fields + full-text search.
    # (The two original loops over scalar and list fields both created plain
    # keyword indexes, so they are merged into one pass; call order unchanged.)
    for field in ["note_id", "Status", "Typ", "title", "path", "tags", "Rolle", "links"]:
        create_keyword_index(chunks, field)
    create_text_index(chunks, "text")  # word/phrase search

    # mindnet_notes: note metadata.
    for field in ["note_id", "title", "path", "Typ", "Status", "tags", "Rolle"]:
        create_keyword_index(notes, field)

    # mindnet_edges: graph edges (filter-only, no real vectors).
    for field in ["src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"]:
        create_keyword_index(edges, field)

    # 3) Report what exists now.
    r = rq("GET", "/collections")
    print("\n[Info] Collections vorhanden:")
    print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))
if __name__ == "__main__": if __name__ == "__main__":
try: try: