diff --git a/scripts/edges_full_check.py b/scripts/edges_full_check.py new file mode 100644 index 0000000..7cb72a1 --- /dev/null +++ b/scripts/edges_full_check.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +from __future__ import annotations +import json +from collections import Counter +from app.core.qdrant import QdrantConfig, get_client + +def fetch_all(client, col): + points = [] + next_offset = None + while True: + res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset) + batch = res[0] + next_offset = res[1] + points.extend(batch) + if not next_offset: + break + return points + +def main(): + cfg = QdrantConfig.from_env() + cl = get_client(cfg) + + cn = f"{cfg.prefix}_notes" + cc = f"{cfg.prefix}_chunks" + ce = f"{cfg.prefix}_edges" + + notes_cnt = cl.count(collection_name=cn, exact=True).count + chunks_cnt = cl.count(collection_name=cc, exact=True).count + edges_cnt = cl.count(collection_name=ce, exact=True).count + + chunks = fetch_all(cl, cc) + edges = fetch_all(cl, ce) + + chunks_by_note = Counter([c.payload.get("note_id") for c in chunks]) + e_by_kind = Counter([e.payload.get("kind") or e.payload.get("relation") for e in edges]) + + # pro Note: belongs_to == #chunks; next == prev == max(chunks-1, 0) + belongs_by_note = Counter() + next_by_note = Counter() + prev_by_note = Counter() + for e in edges: + pl = e.payload + nid = pl.get("note_id") + k = pl.get("kind") or pl.get("relation") + if k == "belongs_to": + belongs_by_note[nid] += 1 + elif k == "next": + next_by_note[nid] += 1 + elif k == "prev": + prev_by_note[nid] += 1 + + per_note = {} + ok_belongs = True + ok_nextprev = True + for nid, ccount in chunks_by_note.items(): + b = belongs_by_note[nid] + n = next_by_note[nid] + p = prev_by_note[nid] + per_note[nid] = {"chunks": ccount, "belongs_to": b, "next": n, "prev": p, "checks": { + "belongs_to_equals_chunks": (b == ccount), + "next_prev_match": (n == p == max(ccount-1, 0)), + }} + ok_belongs &= (b == ccount) + ok_nextprev &= (n == p == max(ccount-1, 0)) + + # Rule-Statistiken & Dubletten-Prüfung + explicit = defaults = callout = inline = 0 + multi_callout_detected = False + callout_key_counts = Counter() + dup_keys = set() + seen = set() + + for e in edges: + pl = e.payload + rule = (pl.get("rule_id") or "") + kind = pl.get("kind") or pl.get("relation") + cid = pl.get("chunk_id") + sid = pl.get("source_id"); tid = pl.get("target_id"); rel = kind + key = (sid, tid, rel, rule) + if key in seen: + dup_keys.add(key) + else: + seen.add(key) + + if rule.startswith("callout:edge:v1"): + callout += 1 + callout_key_counts[(cid, kind, rule)] += 1 + if rule.startswith("inline:rel:v1"): + inline += 1 + if rule.startswith("edge_defaults:"): + defaults += 1 + if rule.startswith("explicit:"): + explicit += 1 + + for _, cnt in callout_key_counts.items(): + if cnt >= 2: + multi_callout_detected = True + break + + report = { + "prefix": cfg.prefix, + "counts": { + "notes": notes_cnt, + "chunks": chunks_cnt, + "edges": edges_cnt, + "edges_by_kind": dict(e_by_kind), + "explicit_total": explicit, + "defaults_total": defaults, + "callout_total": callout, + "inline_total": inline, + }, + "per_note_checks": per_note, + "multi_callout_detected": multi_callout_detected, + "has_duplicates": (len(dup_keys) > 0), + } + print(json.dumps(report, ensure_ascii=False, indent=2)) + +if __name__ == "__main__": + main()