diff --git a/scripts/backfill_capability_facets.py b/scripts/backfill_capability_facets.py index 291b185..bd94dec 100644 --- a/scripts/backfill_capability_facets.py +++ b/scripts/backfill_capability_facets.py @@ -1,22 +1,12 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Idempotentes Backfill-Skript für Capability-Facetten in Qdrant. - -- Kompatibel mit qdrant-client 1.15.x: **kein** WithPayloadSelector-Import nötig -- Liest alle Punkte der Collection mit Payload (scroll, with_payload=True) -- Schreibt folgende Felder pro Point nach: - * capability_keys - * capability_ge1 .. capability_ge5 - * capability_eq1 .. capability_eq5 - -Hinweis: Das Skript setzt KEINE Vektoren neu, es aktualisiert nur Payload-Felder. +Backfill Capability-Facetten in Qdrant – v1.2 +Fix: beendet korrekt, wenn `next_page_offset` (offset) None ist. """ - import os -from typing import Dict, Any, List, Tuple, Optional +from typing import Dict, Any, List from qdrant_client import QdrantClient -from qdrant_client.models import Filter # nur für API-Kompatibilität; wird hier leer genutzt COLL = os.getenv("EXERCISE_COLLECTION", "exercises") QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost") @@ -26,7 +16,6 @@ BATCH = int(os.getenv("BACKFILL_BATCH", "256")) def _facet_capabilities(caps: Dict[str, Any]) -> Dict[str, List[str]]: caps = caps or {} - def names_where(pred) -> List[str]: out = [] for k, v in caps.items(): @@ -38,20 +27,16 @@ def _facet_capabilities(caps: Dict[str, Any]) -> Dict[str, List[str]]: s = str(k).strip() if s: out.append(s) - # stabil sortieren return sorted({s for s in out}, key=str.casefold) all_keys = sorted({str(k).strip() for k in caps.keys() if str(k).strip()}, key=str.casefold) - return { "capability_keys": all_keys, - # >= N "capability_ge1": names_where(lambda lv: lv >= 1), "capability_ge2": names_where(lambda lv: lv >= 2), "capability_ge3": names_where(lambda lv: lv >= 3), "capability_ge4": names_where(lambda lv: lv >= 4), "capability_ge5": names_where(lambda lv: lv >= 5), - # == N "capability_eq1": names_where(lambda lv: lv == 1), "capability_eq2": names_where(lambda lv: lv == 2), "capability_eq3": names_where(lambda lv: lv == 3), @@ -62,47 +47,48 @@ def _facet_capabilities(caps: Dict[str, Any]) -> Dict[str, List[str]]: def main() -> None: client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT) - - # Sanity: Collection muss existieren info = client.get_collection(COLL) print(f"[Backfill] Collection '{COLL}' ok – vectors={info.config.params.vectors}") - updated = 0 + updated_total = 0 offset = None page = 0 while True: - page += 1 - points, offset = client.scroll( + points, next_offset = client.scroll( collection_name=COLL, - scroll_filter=None, # alles + scroll_filter=None, offset=offset, limit=BATCH, with_payload=True, ) + page += 1 if not points: + print("[Backfill] no more points – done") break + updated_page = 0 for pt in points: pld = pt.payload or {} caps = pld.get("capabilities") or {} facets = _facet_capabilities(caps) - # Nur schreiben, wenn sich etwas ändert oder Felder fehlen - need = False - for k, v in facets.items(): - if pld.get(k) != v: - need = True - break + # nur setzen, wenn sich etwas ändert + need = any(pld.get(k) != v for k, v in facets.items()) if not need: continue - - # set_payload: pro Punkt separat (per-Point Payload) client.set_payload(collection_name=COLL, points=[pt.id], payload=facets) - updated += 1 - print(f"[Backfill] page={page} processed={len(points)} updated_total={updated}") + updated_total += 1 + updated_page += 1 - print(f"[Backfill] done. total_updated={updated}") + print(f"[Backfill] page={page} processed={len(points)} updated_page={updated_page} updated_total={updated_total}") + + # Ende erreicht? Dann nach dieser Seite aussteigen. + if next_offset is None: + break + offset = next_offset + + print(f"[Backfill] done. total_updated={updated_total}") if __name__ == "__main__":