scripts/edges_full_check.py hinzugefügt

2025-11-17 15:17:02 +01:00 · 2025-11-17 15:17:02 +01:00 · cea6d35729
commit cea6d35729
parent 4c56918d8a
1 changed files with 120 additions and 0 deletions
--- a/scripts/edges_full_check.py
+++ b/scripts/edges_full_check.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+import json
+from collections import Counter
+from app.core.qdrant import QdrantConfig, get_client
+
+def fetch_all(client, col, batch_size=2048):
+    """Return every point of a collection by paging through ``client.scroll``.
+
+    Args:
+        client: Object whose ``scroll(collection_name=..., with_payload=...,
+            with_vectors=..., limit=..., offset=...)`` returns a
+            ``(points, next_offset)`` pair.
+        col: Name of the collection to read.
+        batch_size: Number of points requested per scroll call.
+
+    Returns:
+        A list with the points of all pages, in the order the pages were
+        returned. Payloads are requested, vectors are not.
+    """
+    points = []
+    next_offset = None
+    while True:
+        batch, next_offset = client.scroll(
+            collection_name=col,
+            with_payload=True,
+            with_vectors=False,
+            limit=batch_size,
+            offset=next_offset,
+        )
+        points.extend(batch)
+        # Only ``None`` marks the end of the scroll; any other offset value
+        # (even a falsy one) is a position to continue from.
+        if next_offset is None:
+            break
+    return points
+
+# Rule-id prefixes that are tallied, mapped to the name used for their total.
+_RULE_PREFIXES = (
+    ("explicit:", "explicit"),
+    ("edge_defaults:", "defaults"),
+    ("callout:edge:v1", "callout"),
+    ("inline:rel:v1", "inline"),
+)
+
+
+def _payload(point):
+    """Return the point's payload, or an empty dict when it has none."""
+    return point.payload or {}
+
+
+def _edge_kind(payload):
+    """Return the edge type: ``kind`` if truthy, otherwise ``relation``."""
+    return payload.get("kind") or payload.get("relation")
+
+
+def _per_note_checks(chunks, edges):
+    """Compare structural edge counts with chunk counts for each note.
+
+    Only notes that own at least one chunk are checked. Expected per note:
+    belongs_to == #chunks and next == prev == max(#chunks - 1, 0).
+
+    Returns:
+        ``(per_note, ok_belongs, ok_nextprev)`` where ``per_note`` maps a
+        note id to its counts and check results, and the two booleans are
+        true only if the respective check holds for every note.
+    """
+    chunks_by_note = Counter(_payload(c).get("note_id") for c in chunks)
+
+    by_kind = {"belongs_to": Counter(), "next": Counter(), "prev": Counter()}
+    for edge in edges:
+        pl = _payload(edge)
+        per_note_counter = by_kind.get(_edge_kind(pl))
+        if per_note_counter is not None:
+            per_note_counter[pl.get("note_id")] += 1
+
+    per_note = {}
+    ok_belongs = True
+    ok_nextprev = True
+    for nid, ccount in chunks_by_note.items():
+        b = by_kind["belongs_to"][nid]
+        n = by_kind["next"][nid]
+        p = by_kind["prev"][nid]
+        belongs_ok = b == ccount
+        nextprev_ok = n == p == max(ccount - 1, 0)
+        per_note[nid] = {
+            "chunks": ccount,
+            "belongs_to": b,
+            "next": n,
+            "prev": p,
+            "checks": {
+                "belongs_to_equals_chunks": belongs_ok,
+                "next_prev_match": nextprev_ok,
+            },
+        }
+        ok_belongs = ok_belongs and belongs_ok
+        ok_nextprev = ok_nextprev and nextprev_ok
+    return per_note, ok_belongs, ok_nextprev
+
+
+def _rule_stats(edges):
+    """Tally edges per rule prefix and detect repeated edges.
+
+    Returns:
+        ``(totals, multi_callout_detected, has_duplicates)``. ``totals`` is a
+        Counter keyed by the names in ``_RULE_PREFIXES``.
+        ``multi_callout_detected`` is true when at least two callout edges
+        share the same (chunk_id, kind, rule_id). ``has_duplicates`` is true
+        when any (source_id, target_id, kind, rule_id) occurs more than once.
+    """
+    totals = Counter()
+    callout_key_counts = Counter()
+    seen = set()
+    has_duplicates = False
+
+    for edge in edges:
+        pl = _payload(edge)
+        rule = pl.get("rule_id") or ""
+        kind = _edge_kind(pl)
+
+        key = (pl.get("source_id"), pl.get("target_id"), kind, rule)
+        if key in seen:
+            has_duplicates = True
+        else:
+            seen.add(key)
+
+        for prefix, name in _RULE_PREFIXES:
+            if rule.startswith(prefix):
+                totals[name] += 1
+        if rule.startswith("callout:edge:v1"):
+            callout_key_counts[(pl.get("chunk_id"), kind, rule)] += 1
+
+    multi_callout_detected = any(cnt >= 2 for cnt in callout_key_counts.values())
+    return totals, multi_callout_detected, has_duplicates
+
+
+def main():
+    """Print a JSON consistency report for the notes/chunks/edges collections.
+
+    Reads the Qdrant configuration from the environment, counts the points in
+    the ``<prefix>_notes``, ``<prefix>_chunks`` and ``<prefix>_edges``
+    collections, loads all chunks and edges, and writes the report to stdout.
+    """
+    cfg = QdrantConfig.from_env()
+    cl = get_client(cfg)
+
+    cn = f"{cfg.prefix}_notes"
+    cc = f"{cfg.prefix}_chunks"
+    ce = f"{cfg.prefix}_edges"
+
+    notes_cnt = cl.count(collection_name=cn, exact=True).count
+    chunks_cnt = cl.count(collection_name=cc, exact=True).count
+    edges_cnt = cl.count(collection_name=ce, exact=True).count
+
+    chunks = fetch_all(cl, cc)
+    edges = fetch_all(cl, ce)
+
+    e_by_kind = Counter(_edge_kind(_payload(e)) for e in edges)
+
+    # Per note: belongs_to == #chunks; next == prev == max(chunks - 1, 0)
+    per_note, ok_belongs, ok_nextprev = _per_note_checks(chunks, edges)
+
+    # Rule statistics & duplicate check
+    totals, multi_callout_detected, has_duplicates = _rule_stats(edges)
+
+    report = {
+        "prefix": cfg.prefix,
+        "counts": {
+            "notes": notes_cnt,
+            "chunks": chunks_cnt,
+            "edges": edges_cnt,
+            "edges_by_kind": dict(e_by_kind),
+            "explicit_total": totals["explicit"],
+            "defaults_total": totals["defaults"],
+            "callout_total": totals["callout"],
+            "inline_total": totals["inline"],
+        },
+        "per_note_checks": per_note,
+        # Overall verdict across all notes (same keys as the per-note checks).
+        "checks": {
+            "belongs_to_equals_chunks": ok_belongs,
+            "next_prev_match": ok_nextprev,
+        },
+        "multi_callout_detected": multi_callout_detected,
+        "has_duplicates": has_duplicates,
+    }
+    print(json.dumps(report, ensure_ascii=False, indent=2))
+
+# Run the check only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()