#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Print a JSON consistency report for the notes/chunks/edges collections.

The script scrolls the ``<prefix>_edges`` and ``<prefix>_chunks`` collections,
classifies edges by their ``rule_id`` prefix, and checks per note that the
structural edge counts (``belongs_to``, ``next``, ``prev``) agree with the
number of chunks recorded for that note.
"""
from __future__ import annotations

import json
import os
from collections import Counter, defaultdict
from typing import Any, Dict, Iterable, List, Tuple

from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client

# rule_id prefixes that mark an edge as explicitly authored.
_EXPLICIT_PREFIXES = ("explicit:", "inline:", "callout:")


def _rel(payload: dict) -> str:
    """Return the edge's relation name.

    Falls back to the ``kind`` field and finally to the literal ``"edge"``
    when ``relation`` is missing or falsy.
    """
    return payload.get("relation") or payload.get("kind") or "edge"


def _rule_id(payload: dict) -> str:
    """Return the payload's ``rule_id`` as a lower-cased string.

    A missing or falsy value yields ``""``. The value is passed through
    ``str()`` so a non-string ``rule_id`` cannot raise in the classifiers;
    the duplicate key built in ``_edge_key`` coerces it the same way.
    """
    return str(payload.get("rule_id") or "").lower()


def _count_by_kind(edges_payloads):
    """Return a plain dict mapping relation name -> number of edges."""
    return dict(Counter(_rel(pl) for pl in edges_payloads))


def _is_explicit(pl: dict) -> bool:
    """True if ``rule_id`` starts with ``explicit:``, ``inline:`` or ``callout:``."""
    return _rule_id(pl).startswith(_EXPLICIT_PREFIXES)


def _is_default(pl: dict) -> bool:
    """True if ``rule_id`` starts with ``edge_defaults:``."""
    return _rule_id(pl).startswith("edge_defaults:")


def _is_callout(pl: dict) -> bool:
    """True if ``rule_id`` starts with ``callout:``."""
    return _rule_id(pl).startswith("callout:")


def _is_inline(pl: dict) -> bool:
    """True if ``rule_id`` starts with ``inline:``."""
    return _rule_id(pl).startswith("inline:")


def _scroll_all(client, col_name: str):
    """Return every point of ``col_name`` as one list.

    Pages through ``client.scroll`` (payloads included, vectors excluded,
    2048 points per page) and stops once the returned next-page offset is
    ``None``.
    """
    points = []
    next_page = None
    while True:
        res, next_page = client.scroll(
            collection_name=col_name,
            with_payload=True,
            with_vectors=False,
            limit=2048,
            offset=next_page,
        )
        points.extend(res)
        if next_page is None:
            break
    return points


def _edge_key(pl: dict) -> Tuple[str, str, str, str]:
    """Return the identity tuple used to recognise duplicate edges."""
    return (
        str(pl.get("source_id") or ""),
        str(pl.get("target_id") or ""),
        str(_rel(pl)),
        str(pl.get("rule_id") or ""),
    )


def _has_multi_callout(pls: Iterable[dict]) -> bool:
    """True if two or more callout edges share chunk_id, relation and rule_id."""
    call_key_counter = Counter(
        (pl.get("chunk_id"), _rel(pl), pl.get("rule_id"))
        for pl in pls
        if _is_callout(pl)
    )
    return any(v >= 2 for v in call_key_counter.values())


def _note_report(chunks: int, pls: Iterable[dict]) -> Dict[str, Any]:
    """Build the per-note entry: structural edge counts plus their checks."""
    by_kind = Counter(_rel(pl) for pl in pls)
    belongs_to = by_kind.get("belongs_to", 0)
    next_cnt = by_kind.get("next", 0)
    prev_cnt = by_kind.get("prev", 0)
    return {
        "chunks": chunks,
        "belongs_to": belongs_to,
        "next": next_cnt,
        "prev": prev_cnt,
        "checks": {
            "belongs_to_equals_chunks": (belongs_to == chunks),
            # A chain of n chunks is expected to have n - 1 links each way.
            "next_prev_match": (next_cnt == prev_cnt == max(0, chunks - 1)),
        },
    }


def main():
    """Collect edge and chunk statistics and print them as indented JSON.

    The collection prefix comes from the ``COLLECTION_PREFIX`` environment
    variable, falling back to ``cfg.prefix``.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)

    prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix)
    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }

    # 1) Read all edges
    edge_pts = _scroll_all(client, cols["edges"])
    edges_payloads = [p.payload or {} for p in edge_pts]

    # 2) Totals & classifications
    edges_by_kind = _count_by_kind(edges_payloads)
    explicit_total = sum(1 for pl in edges_payloads if _is_explicit(pl))
    defaults_total = sum(1 for pl in edges_payloads if _is_default(pl))
    callout_total = sum(1 for pl in edges_payloads if _is_callout(pl))
    inline_total = sum(1 for pl in edges_payloads if _is_inline(pl))

    # 3) Per-note checks
    per_note = {}

    # chunks per note
    chunk_counts: Dict[str, int] = defaultdict(int)
    for ch in _scroll_all(client, cols["chunks"]):
        nid = (ch.payload or {}).get("note_id")
        if nid:
            chunk_counts[nid] += 1

    # edges per note
    edges_by_note: Dict[str, List[dict]] = defaultdict(list)
    for pl in edges_payloads:
        nid = pl.get("note_id")
        if nid:
            edges_by_note[nid].append(pl)

    multi_callout_detected = False
    dup_seen = set()
    has_duplicates = False

    # Notes that own edges come first (original order); notes that have chunks
    # but no edges at all follow, so that a completely unlinked note is still
    # reported instead of being silently left out of the checks.
    note_ids = list(edges_by_note)
    note_ids.extend(nid for nid in chunk_counts if nid not in edges_by_note)

    for nid in note_ids:
        # .get() keeps the defaultdicts from growing entries as a side effect.
        pls = edges_by_note.get(nid, [])

        # Duplicates
        # NOTE(review): only edges carrying a note_id are examined here;
        # confirm whether edges without one can occur and should count too.
        for pl in pls:
            key = _edge_key(pl)
            if key in dup_seen:
                has_duplicates = True
            dup_seen.add(key)

        # Multi-callouts: same chunk_id + relation + rule_id, several targets
        if _has_multi_callout(pls):
            multi_callout_detected = True

        per_note[nid] = _note_report(chunk_counts.get(nid, 0), pls)

    out = {
        "prefix": prefix,
        "counts": {
            "notes": client.count(collection_name=cols["notes"], exact=True).count,
            "chunks": client.count(collection_name=cols["chunks"], exact=True).count,
            "edges": client.count(collection_name=cols["edges"], exact=True).count,
            "edges_by_kind": edges_by_kind,
            "explicit_total": explicit_total,
            "defaults_total": defaults_total,
            "callout_total": callout_total,
            "inline_total": inline_total,
        },
        "per_note_checks": per_note,
        "multi_callout_detected": multi_callout_detected,
        "has_duplicates": has_duplicates,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()