From 3d44de9d8757fadb1c65b44ff9bceaab13618927 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 8 Nov 2025 11:06:57 +0100 Subject: [PATCH] app/core/qdrant.py aktualisiert --- app/core/qdrant.py | 160 +++++++++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 64 deletions(-) diff --git a/app/core/qdrant.py b/app/core/qdrant.py index 2bf7766..5f67f03 100644 --- a/app/core/qdrant.py +++ b/app/core/qdrant.py @@ -2,42 +2,49 @@ # -*- coding: utf-8 -*- """ Name: app/core/qdrant.py -Version: v1.6.0 (2025-11-08) +Version: v1.7.0 (2025-11-08) -Kurzbeschreibung: +Kurzbeschreibung Qdrant-Client & Collection-Setup für mindnet. - Stellt sicher, dass {prefix}_notes / {prefix}_chunks / {prefix}_edges existieren. - - Edges-Collection nutzt 1D Dummy-Vektor. - - ensure_payload_indexes(...) legt sinnvolle Payload-Indizes an. + - Edges-Collection nutzt 1D Dummy-Vektor (kein Such-Usecase). + - Legt sinnvolle Payload-Indizes an. + - Liefert zähl-/list-/fetch-Helfer, die von Importer/Exporter/Tests genutzt werden. -NEU / Änderungen: +Änderungsverlauf (Relevantes) v1.5.0: * ensure_collections_for_prefix(...) → Wrapper für legacy-Importer * count_points(client, collection) → stabile Zählfunktion (mit Fallback) * get_counts_for_prefix(...) → Summary über alle drei Collections - * truncate_collections(...) → löscht *alle Punkte* in den Collections + * truncate_collections(...) → alle Punkte löschen (Collections bleiben) v1.6.0: - * list_note_ids(client, notes_collection) → liefert alle payload.note_id-Werte - (wird von import_markdown.py v3.9.0 erwartet) + * list_note_ids(client, notes_collection) → alle payload.note_id (unique) + v1.7.0: + * fetch_one_note(client, notes_collection, note_id, with_vectors=False) + → von import_markdown v3.9.0 erwartet; liefert (point_id, payload, vector?) -Aufruf: +Öffentliche API from app.core.qdrant import ( QdrantConfig, get_client, ensure_collections, ensure_payload_indexes, - ensure_collections_for_prefix, count_points, - collection_names, get_counts_for_prefix, truncate_collections, - list_note_ids, + ensure_collections_for_prefix, collection_names, + count_points, get_counts_for_prefix, truncate_collections, + list_note_ids, fetch_one_note, ) """ from __future__ import annotations + import os from dataclasses import dataclass -from typing import Optional, Tuple, Dict, List +from typing import Optional, Tuple, Dict, List, Any from qdrant_client import QdrantClient from qdrant_client.http import models as rest +# --------------------------------------------------------- +# Konfiguration +# --------------------------------------------------------- @dataclass class QdrantConfig: url: str @@ -62,9 +69,9 @@ def get_client(cfg: QdrantConfig) -> QdrantClient: return QdrantClient(url=cfg.url, api_key=cfg.api_key) -# ------------------------------- +# --------------------------------------------------------- # Collection-Erstellung -# ------------------------------- +# --------------------------------------------------------- def _create_notes(client: QdrantClient, name: str, dim: int) -> None: if not client.collection_exists(name): client.create_collection( @@ -85,7 +92,7 @@ def _create_edges(client: QdrantClient, name: str) -> None: if not client.collection_exists(name): client.create_collection( collection_name=name, - vectors_config=rest.VectorParams(size=1, distance=rest.Distance.DOT), # 1D-Dummy + vectors_config=rest.VectorParams(size=1, distance=rest.Distance.DOT), # 1D Dummy ) @@ -98,6 +105,7 @@ def ensure_collections(client: QdrantClient, prefix: str, dim: int, destructive: _create_chunks(client, chunks, dim) if client.collection_exists(edges): + # Robustheit: Prüfen, ob eine VectorConfig existiert; falls nicht → optional neu erstellen try: info = client.get_collection(edges) vectors_cfg = getattr(getattr(info.result, "config", None), "params", None) @@ -118,18 +126,14 @@ def collection_names(prefix: str) -> Tuple[str, str, str]: return (f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges") -# ------------------------------- -# Payload-Indexing -# ------------------------------- -def _safe_create_index(client: QdrantClient, col: str, field: str, schema: rest.PayloadSchemaType): +# --------------------------------------------------------- +# Payload-Indizes +# --------------------------------------------------------- +def _safe_create_index(client: QdrantClient, col: str, field: str, schema: rest.PayloadSchemaType) -> None: try: - client.create_payload_index( - collection_name=col, - field_name=field, - field_schema=schema, - ) + client.create_payload_index(collection_name=col, field_name=field, field_schema=schema) except Exception: - # bereits vorhanden oder nicht unterstütztes Schema → ignorieren + # bereits vorhanden oder Schema nicht unterstützt → ignorieren pass @@ -146,13 +150,14 @@ def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None: _safe_create_index(client, edges, f, rest.PayloadSchemaType.KEYWORD) -# ------------------------------- -# NEU: Abwärtskompatible Helfer -# ------------------------------- -def ensure_collections_for_prefix(client: QdrantClient, prefix: str, dim: int, destructive: bool = False) -> Tuple[str, str, str]: +# --------------------------------------------------------- +# Zähl-/Listen-/Maintenance-Helfer +# --------------------------------------------------------- +def ensure_collections_for_prefix( + client: QdrantClient, prefix: str, dim: int, destructive: bool = False +) -> Tuple[str, str, str]: """ - Legacy-Wrapper, damit ältere Skripte (Importer bis v3.7.x) funktionieren. - Gibt die Collection-Namen zurück. + Legacy-Wrapper (Kompatibilität zu älteren Skripten). """ ensure_collections(client, prefix, dim, destructive=destructive) ensure_payload_indexes(client, prefix) @@ -161,9 +166,9 @@ def ensure_collections_for_prefix(client: QdrantClient, prefix: str, dim: int, d def count_points(client: QdrantClient, collection: str) -> int: """ - Zähle Punkte in einer Collection robust: - 1) bevorzugt client.count(..., exact=True) - 2) Fallback: Scrollen ohne Filter und mitzählen + Zähle Punkte robust: + 1) bevorzugt count(exact=True) + 2) Fallback via Scroll """ try: res = client.count(collection_name=collection, count_filter=None, exact=True) @@ -175,7 +180,6 @@ def count_points(client: QdrantClient, collection: str) -> int: except Exception: pass - # Fallback via Scroll total = 0 next_page = None while True: @@ -203,52 +207,31 @@ def get_counts_for_prefix(client: QdrantClient, prefix: str) -> Dict[str, int]: def truncate_collections(client: QdrantClient, prefix: str) -> None: """ - Löscht *alle Punkte* (nicht die Collections selber) für {prefix}. - Entspricht funktional einem "truncate" in deinem Reset-Skript. + Löscht alle Punkte (Collections bleiben bestehen). """ for col in collection_names(prefix): try: client.delete( collection_name=col, - points_selector=rest.FilterSelector( - filter=rest.Filter(must=[]) # leeres Filter => alle Punkte - ), + points_selector=rest.FilterSelector(filter=rest.Filter(must=[])), wait=True, ) except Exception: - # Fallback: Collection ggf. leer/nicht vorhanden → ignorieren pass -# ------------------------------- -# NEU v1.6.0: list_note_ids -# ------------------------------- def list_note_ids(client: QdrantClient, notes_collection: str, limit: int = 100000) -> List[str]: """ - Liefert alle payload.note_id aus der angegebenen Notes-Collection. - - Wird von import_markdown.py (>= v3.9.0) verwendet, z.B. für Baseline-/Idempotenz-Checks. - - Robust gegen fehlende Felder: ignoriert Punkte ohne 'note_id'. - - Args: - client: QdrantClient - notes_collection: Name der Notes-Collection (z.B. 'mindnet_notes') - limit: harte Obergrenze für die Anzahl der zurückzugebenden IDs - - Returns: - Liste der note_id-Strings (ohne Duplikate, Reihenfolge nicht garantiert). + Liste aller payload.note_id (unique) aus der Notes-Collection. """ out: List[str] = [] seen = set() next_page = None fetched = 0 - - flt = None # kein Filter → alle Punkte - while True: - # scroll_filter in neueren Clients; ältere akzeptieren 'scroll_filter' oder 'filter' points, next_page = client.scroll( collection_name=notes_collection, - scroll_filter=flt, + scroll_filter=None, limit=min(512, max(1, limit - fetched)), with_payload=True, with_vectors=False, @@ -256,7 +239,6 @@ def list_note_ids(client: QdrantClient, notes_collection: str, limit: int = 1000 ) if not points: break - for p in points: pl = p.payload or {} nid = pl.get("note_id") @@ -266,8 +248,58 @@ def list_note_ids(client: QdrantClient, notes_collection: str, limit: int = 1000 fetched += 1 if fetched >= limit: return out - if next_page is None: break - return out + + +# --------------------------------------------------------- +# Fetch-Helfer (NEU für Importer v3.9.0) +# --------------------------------------------------------- +def _match_value(value: Any): + """ + Qdrant HTTP-Models haben je nach Version unterschiedliche Konstruktoren. + Wir versuchen zuerst MatchValue(value=...), dann MatchValue(...) als Fallback. + """ + try: + return rest.MatchValue(value=value) + except TypeError: + return rest.MatchValue(value) # ältere Signatur + + +def fetch_one_note( + client: QdrantClient, + notes_collection: str, + note_id: str, + with_vectors: bool = False, +) -> Optional[Tuple[str, Dict[str, Any], Optional[Any]]]: + """ + Liefert genau eine Note anhand payload.note_id. + Rückgabe: + (point_id, payload_dict, vector_or_None) oder None, falls nicht gefunden. + + Bruchsicher ggü. unterschiedlichen Client-Versionen. + """ + cond = rest.FieldCondition(key="note_id", match=_match_value(note_id)) + flt = rest.Filter(must=[cond]) + + points, _ = client.scroll( + collection_name=notes_collection, + scroll_filter=flt, + limit=1, + with_payload=True, + with_vectors=with_vectors, + ) + if not points: + return None + + p = points[0] + pid = str(getattr(p, "id", "")) if getattr(p, "id", None) is not None else "" + payload = p.payload or {} + vec = None + if with_vectors: + # Vektoren-Struktur ist je nach Clientversion leicht anders + vec = getattr(p, "vector", None) + if vec is None: + vec = payload.get("_vector") # selten als Payload-Schatten + return (pid, payload, vec)