#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Spot-check type-registry fields stored in Qdrant.

The script prints one JSON object per line:

* ``notes_sample`` - a few points from ``<prefix>_notes`` with their
  ``type``, ``retriever_weight`` and ``chunk_profile`` payload fields.
* ``chunk_samples`` - up to five points from ``<prefix>_chunks`` whose
  payload ``note_id`` equals the chosen note (``--note-id``, or the first
  sampled note that carries a ``note_id``).

Connection settings come from ``--host``/``--port`` or the environment
variables ``QDRANT_HOST``/``QDRANT_PORT``; the collection prefix comes from
``--prefix``, ``COLLECTION_PREFIX`` or ``MINDNET_PREFIX`` (default
``mindnet``).
"""
import os
import json
import argparse
import random
from typing import Any, Dict, List, Tuple, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for annotations; the runtime import happens in main() so a
    # missing package can be reported as JSON instead of a traceback.
    from qdrant_client import QdrantClient

# Maximum number of chunk points printed for the target note.
_CHUNK_SAMPLE_LIMIT = 5


def _get_prefix(explicit: Optional[str]) -> str:
    """Return the collection prefix to use.

    Candidates are tried in order: *explicit*, ``$COLLECTION_PREFIX``,
    ``$MINDNET_PREFIX``. Each candidate is stripped before it is tested, so
    a whitespace-only value falls through to the next one instead of
    yielding an empty prefix. Falls back to ``"mindnet"``.
    """
    candidates = (
        explicit,
        os.environ.get("COLLECTION_PREFIX"),
        os.environ.get("MINDNET_PREFIX"),
    )
    for candidate in candidates:
        cleaned = (candidate or "").strip()
        if cleaned:
            return cleaned
    return "mindnet"


def _names(prefix: str) -> Dict[str, str]:
    """Map the logical names notes/chunks/edges to ``<prefix>_<name>``."""
    return {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }


def _scroll(
    client: "QdrantClient",
    collection: str,
    *,
    limit=5,
    with_payload=True,
    scroll_filter=None,
) -> Tuple[List[Any], Any]:
    """Fetch one page of points from *collection* via ``client.scroll``.

    Args:
        client: Object exposing a ``scroll(**kwargs)`` method.
        collection: Name of the collection to read.
        limit: Maximum number of points to return.
        with_payload: Whether payloads are returned with the points.
        scroll_filter: Optional filter forwarded under the ``scroll_filter``
            keyword. When ``None`` the keyword is omitted entirely, so the
            unfiltered call is identical to the previous implementation.

    Returns:
        Whatever ``client.scroll`` returns; callers unpack it as
        ``(points, next_offset)``.
    """
    kwargs: Dict[str, Any] = {
        "collection_name": collection,
        "limit": limit,
        "with_payload": with_payload,
    }
    if scroll_filter is not None:
        kwargs["scroll_filter"] = scroll_filter
    return client.scroll(**kwargs)


def _note_filter(note_id: Any) -> Any:
    """Build a filter matching points whose payload ``note_id`` equals *note_id*."""
    # Imported lazily for the same reason as QdrantClient in main().
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    return Filter(must=[FieldCondition(key="note_id", match=MatchValue(value=note_id))])


def _note_summary(point_id: Any, payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Reduce a note point to the fields printed under ``notes_sample``."""
    pl = payload or {}
    return {
        "point_id": str(point_id),
        "note_id": pl.get("note_id"),
        "id": pl.get("id"),
        "title": pl.get("title"),
        "type": pl.get("type"),
        "retriever_weight": pl.get("retriever_weight"),
        "chunk_profile": pl.get("chunk_profile"),
    }


def _chunk_summary(point_id: Any, payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Reduce a chunk point to the fields printed under ``chunk_samples``."""
    pl = payload or {}
    return {
        "id": str(point_id),
        "chunk_id": pl.get("chunk_id"),
        "note_id": pl.get("note_id"),
        "type": pl.get("type"),
        "retriever_weight": pl.get("retriever_weight"),
        # Missing/None window or text counts as length 0.
        "window_len": len(pl.get("window") or ""),
        "text_len": len(pl.get("text") or ""),
    }


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    ap = argparse.ArgumentParser(
        description="Check type-registry fields (type, retriever_weight) in Qdrant"
    )
    ap.add_argument("--host", default=os.environ.get("QDRANT_HOST", "127.0.0.1"))
    # type=int converts both the CLI value and the string default, so
    # `--port 6334` is an int and a malformed QDRANT_PORT yields a normal
    # argparse usage error.
    ap.add_argument("--port", type=int, default=os.environ.get("QDRANT_PORT", "6333"))
    ap.add_argument("--prefix", default=None)
    ap.add_argument("--limit", type=int, default=5)
    ap.add_argument("--note-id", default=None)
    return ap


def main():
    """Print a notes sample and the chunks of one note as JSON lines.

    Scroll failures are reported as a JSON ``error`` object and end the run
    early. A missing ``qdrant_client`` package is reported the same way and
    exits with status 1.
    """
    # Parse first so that --help works even without qdrant_client installed.
    args = _build_parser().parse_args()

    try:
        from qdrant_client import QdrantClient
    except ImportError as e:
        print(json.dumps({"error": f"qdrant_client import failed: {type(e).__name__}: {e}"}))
        raise SystemExit(1)

    prefix = _get_prefix(args.prefix)
    names = _names(prefix)
    client = QdrantClient(host=args.host, port=args.port, prefer_grpc=False)

    # Notes sample
    try:
        pts, _ = _scroll(client, names["notes"], limit=args.limit, with_payload=True)
    except Exception as e:
        print(json.dumps({"error": f"notes scroll failed: {type(e).__name__}: {e}", "collection": names["notes"]}))
        return

    sample = [_note_summary(p.id, p.payload) for p in pts]
    print(json.dumps({"notes_sample": sample}, ensure_ascii=False))

    # Pick a note to fetch chunks for: explicit --note-id wins, otherwise the
    # first sampled note that has a truthy note_id.
    target_note_id = args.note_id or None
    if not target_note_id:
        target_note_id = next((s["note_id"] for s in sample if s.get("note_id")), None)

    if not target_note_id:
        print(json.dumps({"warn": "no note_id available to fetch chunks"}))
        return

    # Chunks for the target note. The filter is applied by the server, so
    # matching chunks are found regardless of where they sit in the
    # collection (scanning only the first page would miss most of them).
    try:
        pts, _ = _scroll(
            client,
            names["chunks"],
            limit=_CHUNK_SAMPLE_LIMIT,
            with_payload=True,
            scroll_filter=_note_filter(target_note_id),
        )
    except Exception as e:
        print(json.dumps({"error": f"chunks scroll failed: {type(e).__name__}: {e}", "collection": names["chunks"]}))
        return

    chunks = [_chunk_summary(p.id, p.payload) for p in pts][:_CHUNK_SAMPLE_LIMIT]
    print(json.dumps({"target_note_id": target_note_id, "chunk_samples": chunks}, ensure_ascii=False))


if __name__ == "__main__":
    main()