diff --git a/app/core/qdrant.py b/app/core/qdrant.py index ffb52a3..ea34d48 100644 --- a/app/core/qdrant.py +++ b/app/core/qdrant.py @@ -1,85 +1,120 @@ +# app/core/qdrant.py from __future__ import annotations -from dataclasses import dataclass -from typing import Tuple + import os +from dataclasses import dataclass +from typing import Optional from qdrant_client import QdrantClient from qdrant_client.http import models as rest -DEFAULT_DIM = int(os.getenv("VECTOR_DIM", "384")) + +# ------------------------------- +# Konfiguration +# ------------------------------- @dataclass class QdrantConfig: url: str - api_key: str | None = None - prefix: str = "mindnet" - dim: int = DEFAULT_DIM + api_key: Optional[str] + prefix: str + dim: int -def _collection_names(prefix: str) -> Tuple[str, str, str]: - """ - Liefert die standardisierten Collection-Namen für Notes, Chunks und Edges. - """ - return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + @staticmethod + def from_env() -> "QdrantConfig": + # URL (bevorzugt) oder Host/Port + url = os.getenv("QDRANT_URL") + if not url: + host = os.getenv("QDRANT_HOST", "127.0.0.1") + port = int(os.getenv("QDRANT_PORT", "6333")) + url = f"http://{host}:{port}" + api_key = os.getenv("QDRANT_API_KEY") or None + + # Collection-Prefix und Vektor-Dimension + prefix = os.getenv("COLLECTION_PREFIX", "mindnet") + dim = int(os.getenv("VECTOR_DIM", "384")) # MiniLM-384 by default + + return QdrantConfig(url=url, api_key=api_key, prefix=prefix, dim=dim) + + +# ------------------------------- +# Client / Setup +# ------------------------------- def get_client(cfg: QdrantConfig) -> QdrantClient: """ Erstellt einen QdrantClient basierend auf der Config. """ - return QdrantClient(url=cfg.url, api_key=cfg.api_key or None, prefer_grpc=False) + return QdrantClient(url=cfg.url, api_key=cfg.api_key) -def ensure_collections(cfg: QdrantConfig) -> Tuple[str, str, str]: + +def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None: """ - Idempotent: legt {prefix}_notes, {prefix}_chunks, {prefix}_edges an, - falls sie fehlen, und erzeugt sinnvolle Payload-Indizes. + Stellt sicher, dass die drei Collections existieren: + - {prefix}_notes : Vektor-Dim = dim (COSINE) + - {prefix}_chunks : Vektor-Dim = dim (COSINE) + - {prefix}_edges : Vektor-Dim = 1 (DOT) <-- Dummy-Vektor, damit der Python-Client kein 'vector' zwingt + Falls {prefix}_edges bereits vektorlos existiert, wird sie gelöscht und mit 1D neu erstellt. """ - client = get_client(cfg) - notes, chunks, edges = _collection_names(cfg.prefix) + notes = f"{prefix}_notes" + chunks = f"{prefix}_chunks" + edges = f"{prefix}_edges" - # Vektorkonfiguration - note_vec = rest.VectorParams(size=cfg.dim, distance=rest.Distance.COSINE) - chunk_vec = rest.VectorParams(size=cfg.dim, distance=rest.Distance.COSINE) - edge_vec = rest.VectorParams(size=1, distance=rest.Distance.COSINE) # Dummy für edges - - def _create_if_missing(name: str, vparam: rest.VectorParams): - try: - info = client.get_collection(name) - if info and info.status == rest.CollectionStatus.GREEN: - return - except Exception: - pass + # Notes + if not client.collection_exists(notes): client.create_collection( - collection_name=name, - vectors_config=rest.VectorsConfig(params=vparam), - optimizers_config=rest.OptimizersConfigDiff(indexing_threshold=20000), - on_disk_payload=True, + collection_name=notes, + vectors_config=rest.VectorParams(size=dim, distance=rest.Distance.COSINE), ) - _create_if_missing(notes, note_vec) - _create_if_missing(chunks, chunk_vec) - _create_if_missing(edges, edge_vec) + # Chunks + if not client.collection_exists(chunks): + client.create_collection( + collection_name=chunks, + vectors_config=rest.VectorParams(size=dim, distance=rest.Distance.COSINE), + ) - # Payload-Indizes - def _ensure_index(name: str, field: str, kind: rest.PayloadSchemaType): + # Edges: 1D Dummy-Vektor (Workaround) + recreate_edges = False + if client.collection_exists(edges): try: - client.create_payload_index( - collection_name=name, - field_name=field, - field_schema=rest.PayloadSchemaParams(schema=kind), - ) + info = client.get_collection(edges) + # Prüfen, ob Vektor-Konfig existiert + vectors_cfg = getattr(getattr(info.result, "config", None), "params", None) + # Neuere Clients: info.result.config.params.vectors kann VectorParams oder dict sein + has_vectors = getattr(vectors_cfg, "vectors", None) is not None + if not has_vectors: + recreate_edges = True except Exception: - # Index existiert evtl. schon → ignorieren - pass + # Wenn Metadaten nicht lesbar → sicherheitshalber neu anlegen + recreate_edges = True + else: + # Existiert noch nicht → wird gleich erstellt + pass - # Notes-Collection - for f in ("note_id", "type", "status", "project", "area", "path", "tags"): - _ensure_index(notes, f, rest.PayloadSchemaType.KEYWORD) + if recreate_edges and client.collection_exists(edges): + client.delete_collection(edges) - # Chunks-Collection - for f in ("note_id", "type", "tags", "section_title", "section_path", "path", "chunk_index"): - _ensure_index(chunks, f, rest.PayloadSchemaType.KEYWORD) + if not client.collection_exists(edges): + client.create_collection( + collection_name=edges, + vectors_config=rest.VectorParams(size=1, distance=rest.Distance.DOT), + ) - # Edges-Collection - for f in ("src_id", "dst_id", "edge_type", "scope"): - _ensure_index(edges, f, rest.PayloadSchemaType.KEYWORD) - return notes, chunks, edges +# ------------------------------- +# (Optionale) Utility-Funktionen +# ------------------------------- + +def collection_names(prefix: str) -> tuple[str, str, str]: + """Hilfsfunktion, falls du die Namen an einer Stelle brauchst.""" + return (f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges") + + +def wipe_collections(client: QdrantClient, prefix: str) -> None: + """ + Löscht alle drei Collections (nur nutzen, wenn du bewusst neu aufsetzen willst). + """ + for name in collection_names(prefix): + if client.collection_exists(name): + client.delete_collection(name)