scripts/edges_full_check.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s

This commit is contained in:
Lars 2025-11-17 16:04:21 +01:00
parent c691123d2d
commit 3e08c8347e

View File

@ -3,123 +3,160 @@
from __future__ import annotations

import json
import os
from collections import Counter, defaultdict
from typing import Dict, Tuple

from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client
def fetch_all(client, col):
def _rel(payload: dict) -> str:
return payload.get("relation") or payload.get("kind") or "edge"
def _count_by_kind(edges_payloads):
    """Map each relation label to the number of edge payloads carrying it."""
    return dict(Counter(_rel(pl) for pl in edges_payloads))
def _is_explicit(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:")
def _is_default(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("edge_defaults:")
def _is_callout(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("callout:")
def _is_inline(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("inline:")
def _scroll_all(client, col_name: str):
points = []
next_offset = None
next_page = None
while True:
res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset)
batch = res[0]
next_offset = res[1]
points.extend(batch)
if not next_offset:
res, next_page = client.scroll(
collection_name=col_name,
with_payload=True,
with_vectors=False,
limit=2048,
offset=next_page,
)
points.extend(res)
if next_page is None:
break
return points
def is_callout_rule(rule_id: str) -> bool:
    """Return True when *rule_id* identifies a callout-generated edge.

    Simplified: the former ``startswith("callout:edge:v1")`` clause was
    subsumed by the ``"callout" in r`` substring test, so only the
    substring test remains. Empty/None rule ids are never callout rules.
    """
    return bool(rule_id) and "callout" in rule_id.lower()
def main():
    """Consistency check over the Qdrant note/chunk/edge collections.

    Reads every point from ``{prefix}_chunks`` and ``{prefix}_edges``,
    aggregates edge counts per relation kind and per note, verifies the
    structural invariants (one ``belongs_to`` edge per chunk, matching
    ``next``/``prev`` chains of length ``chunks - 1``), flags duplicate
    edges and multi-target callouts, and prints the report as JSON.

    Reconstructed from the merged diff: the interleaved old body (second
    client variable, ``fetch_all``-based counters, duplicate report dict,
    premature ``break`` that skipped remaining per-note checks) is removed.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    # COLLECTION_PREFIX env var overrides the prefix from the Qdrant config.
    prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix)
    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }

    # 1) Read all edges.
    edge_pts = _scroll_all(client, cols["edges"])
    edges_payloads = [p.payload or {} for p in edge_pts]

    # 2) Totals & classifications.
    edges_by_kind = _count_by_kind(edges_payloads)
    explicit_total = sum(1 for pl in edges_payloads if _is_explicit(pl))
    defaults_total = sum(1 for pl in edges_payloads if _is_default(pl))
    callout_total = sum(1 for pl in edges_payloads if _is_callout(pl))
    inline_total = sum(1 for pl in edges_payloads if _is_inline(pl))

    # Chunks per note.
    chunk_counts: Dict[str, int] = defaultdict(int)
    for ch in _scroll_all(client, cols["chunks"]):
        nid = (ch.payload or {}).get("note_id")
        if nid:
            chunk_counts[nid] += 1

    # Edges per note.
    edges_by_note: Dict[str, list] = defaultdict(list)
    for pl in edges_payloads:
        nid = pl.get("note_id")
        if nid:
            edges_by_note[nid].append(pl)

    # 3) Per-note checks.
    per_note = {}
    multi_callout_detected = False
    dup_seen = set()
    has_duplicates = False
    for nid, pls in edges_by_note.items():
        by_kind = Counter(_rel(pl) for pl in pls)
        belongs_to = by_kind.get("belongs_to", 0)
        next_cnt = by_kind.get("next", 0)
        prev_cnt = by_kind.get("prev", 0)
        chunks = chunk_counts.get(nid, 0)

        # Duplicates: identical (source, target, relation, rule) tuples.
        for pl in pls:
            key = (
                str(pl.get("source_id") or ""),
                str(pl.get("target_id") or ""),
                str(_rel(pl)),
                str(pl.get("rule_id") or ""),
            )
            if key in dup_seen:
                has_duplicates = True
            dup_seen.add(key)

        # Multi-callouts: same chunk_id + relation + rule_id, several targets.
        call_key_counter = Counter(
            (pl.get("chunk_id"), _rel(pl), pl.get("rule_id"))
            for pl in pls
            if _is_callout(pl)
        )
        if any(v >= 2 for v in call_key_counter.values()):
            multi_callout_detected = True

        per_note[nid] = {
            "chunks": chunks,
            "belongs_to": belongs_to,
            "next": next_cnt,
            "prev": prev_cnt,
            "checks": {
                "belongs_to_equals_chunks": (belongs_to == chunks),
                # A chain of c chunks needs exactly c-1 next and c-1 prev edges.
                "next_prev_match": (next_cnt == prev_cnt == max(0, chunks - 1)),
            },
        }

    out = {
        "prefix": prefix,
        "counts": {
            "notes": client.count(collection_name=cols["notes"], exact=True).count,
            "chunks": client.count(collection_name=cols["chunks"], exact=True).count,
            "edges": client.count(collection_name=cols["edges"], exact=True).count,
            "edges_by_kind": edges_by_kind,
            "explicit_total": explicit_total,
            "defaults_total": defaults_total,
            "callout_total": callout_total,
            "inline_total": inline_total,
        },
        "per_note_checks": per_note,
        "multi_callout_detected": multi_callout_detected,
        "has_duplicates": has_duplicates,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
if __name__ == "__main__":