From 3e08c8347eaaefbbc71e758327b1d19e82fed929 Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 17 Nov 2025 16:04:21 +0100 Subject: [PATCH] scripts/edges_full_check.py aktualisiert --- scripts/edges_full_check.py | 215 +++++++++++++++++++++--------------- 1 file changed, 126 insertions(+), 89 deletions(-) diff --git a/scripts/edges_full_check.py b/scripts/edges_full_check.py index 1baccba..90b4ae1 100644 --- a/scripts/edges_full_check.py +++ b/scripts/edges_full_check.py @@ -3,123 +3,160 @@ from __future__ import annotations import json -from collections import Counter +import os +from collections import Counter, defaultdict +from typing import Dict, Tuple + +from qdrant_client.http import models as rest from app.core.qdrant import QdrantConfig, get_client -def fetch_all(client, col): +def _rel(payload: dict) -> str: + return payload.get("relation") or payload.get("kind") or "edge" + + +def _count_by_kind(edges_payloads): + c = Counter() + for pl in edges_payloads: + c[_rel(pl)] += 1 + return dict(c) + + +def _is_explicit(pl: dict) -> bool: + rid = (pl.get("rule_id") or "").lower() + return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:") + + +def _is_default(pl: dict) -> bool: + rid = (pl.get("rule_id") or "").lower() + return rid.startswith("edge_defaults:") + + +def _is_callout(pl: dict) -> bool: + rid = (pl.get("rule_id") or "").lower() + return rid.startswith("callout:") + + +def _is_inline(pl: dict) -> bool: + rid = (pl.get("rule_id") or "").lower() + return rid.startswith("inline:") + + +def _scroll_all(client, col_name: str): points = [] - next_offset = None + next_page = None while True: - res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset) - batch = res[0] - next_offset = res[1] - points.extend(batch) - if not next_offset: + res, next_page = client.scroll( + collection_name=col_name, + with_payload=True, + with_vectors=False, + limit=2048, + offset=next_page, + ) + points.extend(res) + if next_page is None: break return points -def is_callout_rule(rule_id: str) -> bool: - if not rule_id: - return False - r = rule_id.lower() - return r.startswith("callout:edge:v1") or ("callout" in r) - - def main(): cfg = QdrantConfig.from_env() - cl = get_client(cfg) + client = get_client(cfg) + prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix) - cn = f"{cfg.prefix}_notes" - cc = f"{cfg.prefix}_chunks" - ce = f"{cfg.prefix}_edges" + cols = { + "notes": f"{prefix}_notes", + "chunks": f"{prefix}_chunks", + "edges": f"{prefix}_edges", + } - notes_cnt = cl.count(collection_name=cn, exact=True).count - chunks_cnt = cl.count(collection_name=cc, exact=True).count - edges_cnt = cl.count(collection_name=ce, exact=True).count + # 1) Alle Edges lesen + edge_pts = _scroll_all(client, cols["edges"]) + edges_payloads = [p.payload or {} for p in edge_pts] - chunks = fetch_all(cl, cc) - edges = fetch_all(cl, ce) - - chunks_by_note = Counter([c.payload.get("note_id") for c in chunks]) - e_by_kind = Counter([e.payload.get("kind") or e.payload.get("relation") for e in edges]) - - belongs_by_note = Counter() - next_by_note = Counter() - prev_by_note = Counter() - for e in edges: - pl = e.payload - nid = pl.get("note_id") - k = pl.get("kind") or pl.get("relation") - if k == "belongs_to": - belongs_by_note[nid] += 1 - elif k == "next": - next_by_note[nid] += 1 - elif k == "prev": - prev_by_note[nid] += 1 + # 2) Summen & Klassifizierungen + edges_by_kind = _count_by_kind(edges_payloads) + explicit_total = sum(1 for pl in edges_payloads if _is_explicit(pl)) + defaults_total = sum(1 for pl in edges_payloads if _is_default(pl)) + callout_total = sum(1 for pl in edges_payloads if _is_callout(pl)) + inline_total = sum(1 for pl in edges_payloads if _is_inline(pl)) + # 3) Per-Note-Checks per_note = {} - ok_belongs = True - ok_nextprev = True - for nid, ccount in chunks_by_note.items(): - b = belongs_by_note[nid] - n = next_by_note[nid] - p = prev_by_note[nid] - per_note[nid] = {"chunks": ccount, "belongs_to": b, "next": n, "prev": p, "checks": {"belongs_to_equals_chunks": (b == ccount), "next_prev_match": (n == p == max(ccount-1, 0))}} - ok_belongs &= (b == ccount) - ok_nextprev &= (n == p == max(ccount-1, 0)) + # chunks je Note + chunk_counts: Dict[str, int] = defaultdict(int) + for ch in _scroll_all(client, cols["chunks"]): + nid = (ch.payload or {}).get("note_id") + if nid: + chunk_counts[nid] += 1 + + # edges je Note + edges_by_note: Dict[str, list] = defaultdict(list) + for pl in edges_payloads: + nid = pl.get("note_id") + if nid: + edges_by_note[nid].append(pl) - explicit = defaults = callout = inline = 0 multi_callout_detected = False - callout_key_counts = Counter() - dup_keys = set() - seen = set() + dup_seen = set() + has_duplicates = False - for e in edges: - pl = e.payload - rule = (pl.get("rule_id") or "") - kind = pl.get("kind") or pl.get("relation") - cid = pl.get("chunk_id") - sid = pl.get("source_id"); tid = pl.get("target_id"); rel = kind - key = (sid, tid, rel, rule) - if key in seen: - dup_keys.add(key) - else: - seen.add(key) + for nid, pls in edges_by_note.items(): + by_kind = Counter(_rel(pl) for pl in pls) + belongs_to = by_kind.get("belongs_to", 0) + next_cnt = by_kind.get("next", 0) + prev_cnt = by_kind.get("prev", 0) + chunks = chunk_counts.get(nid, 0) - if is_callout_rule(rule): - callout += 1 - callout_key_counts[(cid, kind, rule)] += 1 - if rule.startswith("inline:rel:v1"): - inline += 1 - if rule.startswith("edge_defaults:"): - defaults += 1 - if rule.startswith("explicit:"): - explicit += 1 + # Duplikate + for pl in pls: + key = ( + str(pl.get("source_id") or ""), + str(pl.get("target_id") or ""), + str(_rel(pl)), + str(pl.get("rule_id") or ""), + ) + if key in dup_seen: + has_duplicates = True + dup_seen.add(key) - for _, cnt in callout_key_counts.items(): - if cnt >= 2: + # Mehrfach-Callouts: gleicher chunk_id + relation + rule_id, mehrere Targets + call_key_counter = Counter( + (pl.get("chunk_id"), _rel(pl), pl.get("rule_id")) + for pl in pls + if _is_callout(pl) + ) + if any(v >= 2 for v in call_key_counter.values()): multi_callout_detected = True - break - report = { - "prefix": cfg.prefix, + per_note[nid] = { + "chunks": chunks, + "belongs_to": belongs_to, + "next": next_cnt, + "prev": prev_cnt, + "checks": { + "belongs_to_equals_chunks": (belongs_to == chunks), + "next_prev_match": (next_cnt == prev_cnt == max(0, chunks - 1)), + }, + } + + out = { + "prefix": prefix, "counts": { - "notes": notes_cnt, - "chunks": chunks_cnt, - "edges": edges_cnt, - "edges_by_kind": dict(e_by_kind), - "explicit_total": explicit, - "defaults_total": defaults, - "callout_total": callout, - "inline_total": inline, + "notes": client.count(collection_name=cols["notes"], exact=True).count, + "chunks": client.count(collection_name=cols["chunks"], exact=True).count, + "edges": client.count(collection_name=cols["edges"], exact=True).count, + "edges_by_kind": edges_by_kind, + "explicit_total": explicit_total, + "defaults_total": defaults_total, + "callout_total": callout_total, + "inline_total": inline_total, }, "per_note_checks": per_note, "multi_callout_detected": multi_callout_detected, - "has_duplicates": (len(dup_keys) > 0), + "has_duplicates": has_duplicates, } - print(json.dumps(report, ensure_ascii=False, indent=2)) + print(json.dumps(out, ensure_ascii=False, indent=2)) if __name__ == "__main__":