scripts/setup_mindnet_collections.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 1s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 1s
This commit is contained in:
parent
e19a5e1f5b
commit
1c484e1ca0
|
|
@ -1,97 +1,146 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Richtet die Qdrant-Collections für das mindnet-Projekt ein (V2).
|
Richtet die Qdrant-Collections für das mindnet-Projekt ein.
|
||||||
|
|
||||||
- mindnet_chunks : semantische Suche über Text-Chunks (384/Cosine)
|
Collections:
|
||||||
- mindnet_notes : 1 Punkt pro Notiz (optional Titel-Embedding)
|
- mindnet_chunks : semantische Suche über Markdown-Text-Chunks (Vektor: dim/Cosine)
|
||||||
|
- mindnet_notes : 1 Punkt pro Notiz (Metadaten, optional Titel-Embedding)
|
||||||
- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
|
- mindnet_edges : explizite Link-Kanten (Dummy-Vektor size=1; Filter über Payload)
|
||||||
|
|
||||||
Idempotent: legt nur an, wenn nicht vorhanden.
|
Eigenschaften:
|
||||||
|
- Idempotent: legt nur an, wenn eine Collection noch nicht existiert
|
||||||
|
- Legt sinnvolle Payload-Indizes an (keyword/text)
|
||||||
|
- Ohne "global"-Seiteneffekte; Qdrant-URL wird sauber übergeben
|
||||||
|
|
||||||
|
Aufrufbeispiel:
|
||||||
|
python3 setup_mindnet_collections.py \
|
||||||
|
--qdrant-url http://127.0.0.1:6333 \
|
||||||
|
--prefix mindnet \
|
||||||
|
--dim 384 \
|
||||||
|
--distance Cosine
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
from __future__ import annotations
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
DEFAULT_QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
|
|
||||||
|
|
||||||
def rq(method: str, path: str, **kwargs) -> requests.Response:
|
@dataclass
|
||||||
url = DEFAULT_QDRANT_URL.rstrip("/") + path
|
class QdrantHTTP:
|
||||||
r = requests.request(method, url, timeout=15, **kwargs)
|
base_url: str
|
||||||
if not r.ok:
|
|
||||||
raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
|
|
||||||
return r
|
|
||||||
|
|
||||||
def collection_exists(name: str) -> bool:
|
def _url(self, path: str) -> str:
|
||||||
r = rq("GET", f"/collections/{name}")
|
return self.base_url.rstrip("/") + path
|
||||||
data = r.json()
|
|
||||||
return data.get("result", {}).get("status") == "green"
|
|
||||||
|
|
||||||
def create_collection(name: str, size: int, distance: str = "Cosine") -> None:
|
def rq(self, method: str, path: str, **kwargs) -> requests.Response:
|
||||||
if collection_exists(name):
|
url = self._url(path)
|
||||||
print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.")
|
r = requests.request(method, url, timeout=20, **kwargs)
|
||||||
return
|
|
||||||
payload = {"vectors": {"size": size, "distance": distance}}
|
|
||||||
rq("PUT", f"/collections/{name}", json=payload)
|
|
||||||
print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
|
|
||||||
|
|
||||||
def create_keyword_index(collection: str, field: str) -> None:
|
|
||||||
payload = {"field_name": field, "field_schema": "keyword"}
|
|
||||||
rq("PUT", f"/collections/{collection}/index", json=payload)
|
|
||||||
print(f"[+] Index keyword on {collection}.{field}")
|
|
||||||
|
|
||||||
def create_text_index(collection: str, field: str = "text") -> None:
|
|
||||||
payload = {"field_name": field, "field_schema": {"type": "text"}}
|
|
||||||
rq("PUT", f"/collections/{collection}/index", json=payload)
|
|
||||||
print(f"[+] Index text on {collection}.{field}")
|
|
||||||
|
|
||||||
def main():
|
|
||||||
ap = argparse.ArgumentParser()
|
|
||||||
ap.add_argument("--qdrant-url", default=DEFAULT_QDRANT_URL, help="z.B. http://127.0.0.1:6333")
|
|
||||||
ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
|
|
||||||
ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (384 für all-MiniLM-L6-v2)")
|
|
||||||
ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
|
|
||||||
args = ap.parse_args()
|
|
||||||
|
|
||||||
# Hier brauchen wir KEIN global, wir überschreiben einfach die Variable lokal
|
|
||||||
qdrant_url = args.qdrant_url
|
|
||||||
|
|
||||||
# Hilfsfunktion neu binden
|
|
||||||
def rq(method: str, path: str, **kwargs) -> requests.Response:
|
|
||||||
url = qdrant_url.rstrip("/") + path
|
|
||||||
r = requests.request(method, url, timeout=15, **kwargs)
|
|
||||||
if not r.ok:
|
if not r.ok:
|
||||||
raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
|
raise RuntimeError(f"{method} {url} -> {r.status_code} {r.text}")
|
||||||
return r
|
return r
|
||||||
|
|
||||||
# Ab hier wie gehabt
|
def collection_exists(self, name: str) -> bool:
|
||||||
chunks = f"{args.prefix}_chunks"
|
r = self.rq("GET", f"/collections/{name}")
|
||||||
notes = f"{args.prefix}_notes"
|
data = r.json()
|
||||||
edges = f"{args.prefix}_edges"
|
return data.get("result", {}).get("status") == "green"
|
||||||
|
|
||||||
|
def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
|
||||||
|
if self.collection_exists(name):
|
||||||
|
print(f"[=] Collection '{name}' existiert bereits – überspringe Anlage.")
|
||||||
|
return
|
||||||
|
payload = {"vectors": {"size": size, "distance": distance}}
|
||||||
|
self.rq("PUT", f"/collections/{name}", json=payload)
|
||||||
|
print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")
|
||||||
|
|
||||||
|
def create_keyword_index(self, collection: str, field: str) -> None:
|
||||||
|
payload = {"field_name": field, "field_schema": "keyword"}
|
||||||
|
self.rq("PUT", f"/collections/{collection}/index", json=payload)
|
||||||
|
print(f"[+] Index keyword on {collection}.{field}")
|
||||||
|
|
||||||
|
def create_text_index(self, collection: str, field: str = "text") -> None:
|
||||||
|
payload = {"field_name": field, "field_schema": {"type": "text"}}
|
||||||
|
self.rq("PUT", f"/collections/{collection}/index", json=payload)
|
||||||
|
print(f"[+] Index text on {collection}.{field}")
|
||||||
|
|
||||||
|
def list_collections(self) -> Dict[str, Any]:
|
||||||
|
r = self.rq("GET", "/collections")
|
||||||
|
return r.json().get("result", {}).get("collections", [])
|
||||||
|
|
||||||
|
|
||||||
|
def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
|
||||||
|
chunks = f"{prefix}_chunks"
|
||||||
|
notes = f"{prefix}_notes"
|
||||||
|
edges = f"{prefix}_edges"
|
||||||
|
|
||||||
# 1) Collections anlegen
|
# 1) Collections anlegen
|
||||||
create_collection(chunks, size=args.dim, distance=args.distance)
|
q.create_collection(chunks, size=dim, distance=distance)
|
||||||
create_collection(notes, size=args.dim, distance=args.distance)
|
q.create_collection(notes, size=dim, distance=distance)
|
||||||
create_collection(edges, size=1, distance=args.distance) # Dummy-Vektor
|
q.create_collection(edges, size=1, distance=distance) # Dummy-Vektor
|
||||||
|
|
||||||
# 2) Indizes setzen
|
# 2) Indizes definieren
|
||||||
|
# mindnet_chunks: häufige Filter + Volltext
|
||||||
for f in ["note_id", "Status", "Typ", "title", "path"]:
|
for f in ["note_id", "Status", "Typ", "title", "path"]:
|
||||||
create_keyword_index(chunks, f)
|
q.create_keyword_index(chunks, f)
|
||||||
for f in ["tags", "Rolle", "links"]:
|
for f in ["tags", "Rolle", "links"]:
|
||||||
create_keyword_index(chunks, f)
|
q.create_keyword_index(chunks, f)
|
||||||
create_text_index(chunks, "text")
|
q.create_text_index(chunks, "text") # Volltextsuche auf dem Textfeld
|
||||||
|
|
||||||
|
# mindnet_notes: Metadaten der Notizen
|
||||||
for f in ["note_id", "title", "path", "Typ", "Status"]:
|
for f in ["note_id", "title", "path", "Typ", "Status"]:
|
||||||
create_keyword_index(notes, f)
|
q.create_keyword_index(notes, f)
|
||||||
for f in ["tags", "Rolle"]:
|
for f in ["tags", "Rolle"]:
|
||||||
create_keyword_index(notes, f)
|
q.create_keyword_index(notes, f)
|
||||||
|
|
||||||
for f in ["src_note_id", "dst_note_id", "src_chunk_id", "dst_chunk_id", "link_text", "relation"]:
|
# mindnet_edges: Graph/Kanten (Filter-only)
|
||||||
create_keyword_index(edges, f)
|
for f in [
|
||||||
|
"src_note_id",
|
||||||
|
"dst_note_id",
|
||||||
|
"src_chunk_id",
|
||||||
|
"dst_chunk_id",
|
||||||
|
"link_text",
|
||||||
|
"relation",
|
||||||
|
]:
|
||||||
|
q.create_keyword_index(edges, f)
|
||||||
|
|
||||||
# 3) Übersicht ausgeben
|
|
||||||
r = rq("GET", "/collections")
|
def parse_args() -> argparse.Namespace:
|
||||||
print("\n[Info] Collections vorhanden:")
|
ap = argparse.ArgumentParser()
|
||||||
print(json.dumps(r.json().get("result", {}).get("collections", []), indent=2, ensure_ascii=False))
|
ap.add_argument("--qdrant-url", default="http://127.0.0.1:6333", help="z.B. http://127.0.0.1:6333")
|
||||||
|
ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
|
||||||
|
ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (z.B. 384 für MiniLM)")
|
||||||
|
ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
|
||||||
|
return ap.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
args = parse_args()
|
||||||
|
q = QdrantHTTP(args.qdrant_url)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Readiness (optional, ignoriert Fehler)
|
||||||
|
try:
|
||||||
|
r = q.rq("GET", "/ready")
|
||||||
|
if r.text.strip():
|
||||||
|
print(f"[ready] {r.text.strip()}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[warn] /ready nicht erreichbar oder kein Text: {e}")
|
||||||
|
|
||||||
|
setup_mindnet_collections(q, prefix=args.prefix, dim=args.dim, distance=args.distance)
|
||||||
|
|
||||||
|
cols = q.list_collections()
|
||||||
|
print("\n[Info] Collections vorhanden:")
|
||||||
|
print(json.dumps(cols, indent=2, ensure_ascii=False))
|
||||||
|
return 0
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[ERROR] {e}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user