diff --git a/tests/test_edges_smoke.py b/tests/test_edges_smoke.py index 0301e88..32150fb 100644 --- a/tests/test_edges_smoke.py +++ b/tests/test_edges_smoke.py @@ -1,36 +1,36 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -scripts/test_edges_smoke.py +scripts/test_edges_smoke.py — Progressive Ausgabe -Integritäts-Check für mindnet-Edges in Qdrant. -Prüft pro Note: -- Chunk-Anzahl (mindnet_chunks) = belongs_to-Kanten -- next/prev-Kanten: jeweils (#Chunks - 1) -- Dedupe: kein Duplikat (key=(kind,source_id,target_id,scope)) -- references (chunk-scope): vorhanden, wenn Wikilinks erwartet werden (nur Zählreport) -- optional note-scope references/backlink: vorhanden, wenn --note-scope-refs genutzt wurde +Prüft edge-Integrität je Note mit Live-Ausgabe: +- belongs_to == #Chunks +- next == prev == max(#Chunks-1,0) +- Duplikat-Edges (Key: (kind,source_id,target_id,scope)) == 0 +- Zählt references (chunk/note), backlink -Ausgabe: JSON pro Note + Gesamtsummary. +Optionen: + --max-notes N : prüft nur die ersten N Notizen + --limit L : Scroll-Limit pro Anfrage (Default 256) + --flush : jede Zeile sofort flushen """ from __future__ import annotations -import json, os, sys +import os, sys, json, argparse from typing import Dict, Any, List, Tuple, Set from qdrant_client.http import models as rest - from app.core.qdrant import QdrantConfig, get_client def collections(prefix: str) -> Tuple[str, str, str]: return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" -def scroll_ids(client, collection: str, filt: rest.Filter | None = None, payload=False, limit=256): +def scroll_iter(client, collection: str, filt: rest.Filter | None, with_payload: bool, limit: int): next_page = None while True: pts, next_page = client.scroll( collection_name=collection, scroll_filter=filt, - with_payload=payload, + with_payload=with_payload, with_vectors=False, limit=limit, offset=next_page, @@ -39,48 +39,58 @@ def scroll_ids(client, collection: str, filt: rest.Filter | None = None, payload break for p in pts: yield p + if next_page is None: + break -def list_notes(client, prefix: str) -> List[Dict[str, Any]]: +def list_notes(client, prefix: str, limit: int, max_notes: int | None) -> List[Dict[str, Any]]: notes_col, _, _ = collections(prefix) - out = [] - for p in scroll_ids(client, notes_col, None, payload=True): + out: List[Dict[str, Any]] = [] + for p in scroll_iter(client, notes_col, None, True, limit): pl = p.payload or {} nid = pl.get("note_id") or pl.get("id") if nid: - out.append({ - "note_id": nid, - "title": pl.get("title"), - "type": pl.get("type"), - }) + out.append({"note_id": nid, "title": pl.get("title"), "type": pl.get("type")}) + if max_notes is not None and len(out) >= max_notes: + break return out -def count_chunks_for_note(client, prefix: str, note_id: str) -> int: +def count_chunks_for_note(client, prefix: str, note_id: str, limit: int) -> int: _, chunks_col, _ = collections(prefix) - filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - return sum(1 for _ in scroll_ids(client, chunks_col, filt, payload=False)) + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + return sum(1 for _ in scroll_iter(client, chunks_col, f, False, limit)) -def fetch_edges_for_note(client, prefix: str, note_id: str) -> List[Dict[str, Any]]: +def fetch_edges_for_note(client, prefix: str, note_id: str, limit: int) -> List[Dict[str, Any]]: _, _, edges_col = collections(prefix) - filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - return [p.payload or {} for p in scroll_ids(client, edges_col, filt, payload=True)] + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + return [p.payload or {} for p in scroll_iter(client, edges_col, f, True, limit)] def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--max-notes", type=int) + ap.add_argument("--limit", type=int, default=256) + ap.add_argument("--flush", action="store_true") + args = ap.parse_args() + cfg = QdrantConfig.from_env() client = get_client(cfg) - notes = list_notes(client, cfg.prefix) - report = [] + notes = list_notes(client, cfg.prefix, args.limit, args.max_notes) total = {"notes": 0, "chunks": 0, "belongs_to": 0, "next": 0, "prev": 0, "refs_chunk": 0, "refs_note": 0, "backlink": 0, "dup_edges": 0} + for n in notes: nid = n["note_id"] total["notes"] += 1 - chunk_count = count_chunks_for_note(client, cfg.prefix, nid) + chunk_count = count_chunks_for_note(client, cfg.prefix, nid, args.limit) total["chunks"] += chunk_count - edges = fetch_edges_for_note(client, cfg.prefix, nid) + edges = fetch_edges_for_note(client, cfg.prefix, nid, args.limit) by_kind = {} keys: Set[tuple] = set() dup_count = 0 + refs_chunk = 0 + refs_note = 0 + backlink = 0 + for e in edges: k = e.get("kind") by_kind[k] = by_kind.get(k, 0) + 1 @@ -89,27 +99,26 @@ def main(): dup_count += 1 else: keys.add(t) + if k == "references" and e.get("scope") == "chunk": + refs_chunk += 1 + if k == "references" and e.get("scope") == "note": + refs_note += 1 + if k == "backlink": + backlink += 1 - bt = by_kind.get("belongs_to", 0) - nx = by_kind.get("next", 0) - pv = by_kind.get("prev", 0) - rc = by_kind.get("references", 0) if any(e.get("scope") == "chunk" and e.get("kind") == "references" for e in edges) else 0 - rn = sum(1 for e in edges if e.get("scope") == "note" and e.get("kind") == "references") - bl = by_kind.get("backlink", 0) - - total["belongs_to"] += bt - total["next"] += nx - total["prev"] += pv - total["refs_chunk"] += rc - total["refs_note"] += rn - total["backlink"] += bl + total["belongs_to"] += by_kind.get("belongs_to", 0) + total["next"] += by_kind.get("next", 0) + total["prev"] += by_kind.get("prev", 0) + total["refs_chunk"] += refs_chunk + total["refs_note"] += refs_note + total["backlink"] += backlink total["dup_edges"] += dup_count - ok_bt = (bt == chunk_count) - ok_seq = (nx == max(chunk_count - 1, 0) and pv == max(chunk_count - 1, 0)) + ok_bt = (by_kind.get("belongs_to", 0) == chunk_count) + ok_seq = (by_kind.get("next", 0) == max(chunk_count - 1, 0) and by_kind.get("prev", 0) == max(chunk_count - 1, 0)) ok_dup = (dup_count == 0) - report.append({ + line = { "note_id": nid, "title": n.get("title"), "type": n.get("type"), @@ -120,10 +129,12 @@ def main(): "next_prev_match": ok_seq, "no_duplicate_edges": ok_dup, } - }) + } + print(json.dumps(line, ensure_ascii=False)) + if args.flush: + sys.stdout.flush() - out = {"prefix": cfg.prefix, "summary": total, "notes": report} - print(json.dumps(out, ensure_ascii=False, indent=2)) + print(json.dumps({"prefix": cfg.prefix, "summary": total}, ensure_ascii=False)) if __name__ == "__main__": main() diff --git a/tests/test_edges_smoke_fast.py b/tests/test_edges_smoke_fast.py new file mode 100644 index 0000000..3e3a3e9 --- /dev/null +++ b/tests/test_edges_smoke_fast.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +scripts/test_edges_smoke_fast.py — Zählung über /count (sehr schnell) + +Verwendet Qdrant 'count' API je Note/Kind (anstatt scroll), dadurch sehr schnelle Ausführung. +Optionen: + --max-notes N : prüft nur die ersten N Notizen +""" + +from __future__ import annotations +import argparse, json +from typing import Dict, Any, List, Tuple +from qdrant_client.http import models as rest + +from app.core.qdrant import QdrantConfig, get_client + +KINDS = ["belongs_to", "next", "prev", "references", "backlink"] + +def collections(prefix: str) -> Tuple[str, str, str]: + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + +def list_note_ids(client, notes_col: str, max_notes: int | None) -> List[Dict[str, Any]]: + pts, _ = client.scroll(collection_name=notes_col, with_payload=True, with_vectors=False, limit=max_notes or 1024) + out = [] + for p in pts or []: + pl = p.payload or {} + nid = pl.get("note_id") or pl.get("id") + if nid: + out.append({"note_id": nid, "title": pl.get("title"), "type": pl.get("type")}) + return out + +def count_points(client, col: str, filt: rest.Filter) -> int: + res = client.count(collection_name=col, count_filter=filt, exact=True) + return int(getattr(res, "count", 0)) + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--max-notes", type=int) + args = ap.parse_args() + + cfg = QdrantConfig.from_env() + client = get_client(cfg) + notes_col, chunks_col, edges_col = collections(cfg.prefix) + notes = list_note_ids(client, notes_col, args.max_notes) + + summary = {"notes": 0, "chunks": 0, "edges": 0} + for n in notes: + nid = n["note_id"] + summary["notes"] += 1 + + filt_note = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))]) + chunk_cnt = count_points(client, chunks_col, filt_note) + summary["chunks"] += chunk_cnt + + # counts per kind (edges) + by_kind: Dict[str, int] = {} + for k in KINDS: + f = rest.Filter(must=[ + rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid)), + rest.FieldCondition(key="kind", match=rest.MatchValue(value=k)), + ]) + c = count_points(client, edges_col, f) + if c: + by_kind[k] = c + + summary["edges"] += sum(by_kind.values()) + line = {"note_id": nid, "title": n.get("title"), "type": n.get("type"), "chunks": chunk_cnt, "edges_by_kind": by_kind} + print(json.dumps(line, ensure_ascii=False)) + + print(json.dumps({"prefix": cfg.prefix, "summary": summary}, ensure_ascii=False)) + +if __name__ == "__main__": + main()