#!/usr/bin/env python3 # -*- coding: utf-8 -*- from __future__ import annotations import json from collections import Counter from app.core.qdrant import QdrantConfig, get_client def fetch_all(client, col): points = [] next_offset = None while True: res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset) batch = res[0] next_offset = res[1] points.extend(batch) if not next_offset: break return points def main(): cfg = QdrantConfig.from_env() cl = get_client(cfg) cn = f"{cfg.prefix}_notes" cc = f"{cfg.prefix}_chunks" ce = f"{cfg.prefix}_edges" notes_cnt = cl.count(collection_name=cn, exact=True).count chunks_cnt = cl.count(collection_name=cc, exact=True).count edges_cnt = cl.count(collection_name=ce, exact=True).count chunks = fetch_all(cl, cc) edges = fetch_all(cl, ce) chunks_by_note = Counter([c.payload.get("note_id") for c in chunks]) e_by_kind = Counter([e.payload.get("kind") or e.payload.get("relation") for e in edges]) # pro Note: belongs_to == #chunks; next == prev == max(chunks-1, 0) belongs_by_note = Counter() next_by_note = Counter() prev_by_note = Counter() for e in edges: pl = e.payload nid = pl.get("note_id") k = pl.get("kind") or pl.get("relation") if k == "belongs_to": belongs_by_note[nid] += 1 elif k == "next": next_by_note[nid] += 1 elif k == "prev": prev_by_note[nid] += 1 per_note = {} ok_belongs = True ok_nextprev = True for nid, ccount in chunks_by_note.items(): b = belongs_by_note[nid] n = next_by_note[nid] p = prev_by_note[nid] per_note[nid] = {"chunks": ccount, "belongs_to": b, "next": n, "prev": p, "checks": { "belongs_to_equals_chunks": (b == ccount), "next_prev_match": (n == p == max(ccount-1, 0)), }} ok_belongs &= (b == ccount) ok_nextprev &= (n == p == max(ccount-1, 0)) # Rule-Statistiken & Dubletten-Prüfung explicit = defaults = callout = inline = 0 multi_callout_detected = False callout_key_counts = Counter() dup_keys = set() seen = set() for e in edges: pl = e.payload rule = (pl.get("rule_id") or "") kind = pl.get("kind") or pl.get("relation") cid = pl.get("chunk_id") sid = pl.get("source_id"); tid = pl.get("target_id"); rel = kind key = (sid, tid, rel, rule) if key in seen: dup_keys.add(key) else: seen.add(key) if rule.startswith("callout:edge:v1"): callout += 1 callout_key_counts[(cid, kind, rule)] += 1 if rule.startswith("inline:rel:v1"): inline += 1 if rule.startswith("edge_defaults:"): defaults += 1 if rule.startswith("explicit:"): explicit += 1 for _, cnt in callout_key_counts.items(): if cnt >= 2: multi_callout_detected = True break report = { "prefix": cfg.prefix, "counts": { "notes": notes_cnt, "chunks": chunks_cnt, "edges": edges_cnt, "edges_by_kind": dict(e_by_kind), "explicit_total": explicit, "defaults_total": defaults, "callout_total": callout, "inline_total": inline, }, "per_note_checks": per_note, "multi_callout_detected": multi_callout_detected, "has_duplicates": (len(dup_keys) > 0), } print(json.dumps(report, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()