#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
scripts/edges_full_check.py

Count and validate the edges stored in Qdrant.

Recognised rule groups:
- explicit_total: rule_id starts with "explicit:" (e.g. explicit:wikilink, explicit:note_scope)
- callout_total:  rule_id == "callout:edge"
- inline_total:   rule_id starts with "inline:" (e.g. inline:rel)
- defaults_total: rule_id starts with "edge_defaults:"
- structure:      rule_id in {"structure:belongs_to", "structure:order"}

Additionally reports:
- edges_by_kind (aggregated)
- notes/chunks/edges counts
- multi_callout_detected: True if one chunk carries several callout edges of the same relation
- per_note_checks: belongs_to == chunks, next == prev == (chunks - 1)

The report is printed as JSON on stdout; warnings go to stderr so stdout stays parseable.
"""
from __future__ import annotations

import json
import sys
from collections import Counter, defaultdict
from typing import Dict, Any, List, Tuple

from app.core.qdrant import QdrantConfig, get_client
from qdrant_client.http import models as rest

# Edge kinds that take part in the per-note structure checks.
_STRUCTURE_KINDS = ("belongs_to", "next", "prev")


def _count_collection_points(client, name: str) -> int:
    """Return the exact number of points in collection *name*.

    Best effort: if the count request fails for any reason, a warning is
    written to stderr and 0 is returned so the rest of the report can still be
    produced.
    """
    try:
        res = client.count(collection_name=name, exact=True)
        return res.count or 0
    except Exception as exc:  # deliberate best effort, but no longer silent
        print(f"warning: could not count collection {name!r}: {exc}", file=sys.stderr)
        return 0


def _scroll_all(client, collection: str) -> List[Any]:
    """Return every point of *collection* (payload only, no vectors).

    Pages through the collection with ``client.scroll`` until the returned
    offset is ``None``. All points are held in memory at once.
    """
    pts_all: List[Any] = []
    offset = None
    while True:
        pts, offset = client.scroll(
            collection_name=collection,
            with_payload=True,
            with_vectors=False,
            limit=2048,
            offset=offset,
        )
        pts_all.extend(pts or [])
        if offset is None:
            break
    return pts_all


def _rule_group(rule_id: str) -> str:
    """Map a ``rule_id`` to its reporting group.

    Returns one of "unknown" (empty id), "callout", "inline", "defaults",
    "explicit", "structure" or "other" (anything unrecognised).
    """
    if not rule_id:
        return "unknown"
    if rule_id == "callout:edge":
        return "callout"
    if rule_id.startswith("inline:"):  # important for "inline:rel"
        return "inline"
    if rule_id.startswith("edge_defaults:"):
        return "defaults"
    if rule_id.startswith("explicit:"):
        return "explicit"
    if rule_id in ("structure:belongs_to", "structure:order"):
        return "structure"
    return "other"


def _payload_note_id(payload: Dict[str, Any]) -> str:
    """Return the payload's ``note_id`` as a string ("" when missing or falsy)."""
    return str(payload.get("note_id") or "")


def _chunks_per_note(chunk_points: List[Any]) -> Tuple[Counter, int]:
    """Count chunks per note.

    Returns ``(counts, orphans)``: *counts* maps the normalised note id to its
    number of chunks, *orphans* is the number of chunk points without a usable
    ``note_id``. Note ids are normalised exactly like the edge note ids so both
    sides end up under the same key.
    """
    counts: Counter = Counter()
    orphans = 0
    for point in chunk_points:
        note_id = _payload_note_id(point.payload or {})
        if note_id:
            counts[note_id] += 1
        else:
            orphans += 1
    return counts, orphans


def _aggregate_edges(edge_points: List[Any]):
    """Aggregate edge points into the counters used by the report.

    Returns ``(by_kind, group_counts, callout_buckets, per_note)`` where
    *callout_buckets* maps ``(chunk_id, kind)`` to the number of callout edges
    and *per_note* maps a note id to its belongs_to/next/prev edge counts.
    """
    by_kind: Counter = Counter()
    group_counts: Counter = Counter()
    callout_buckets: Dict[Tuple[str, str], int] = defaultdict(int)
    per_note: Dict[str, Dict[str, int]] = defaultdict(
        lambda: {"chunks": 0, "belongs_to": 0, "next": 0, "prev": 0}
    )

    for point in edge_points:
        payload = point.payload or {}
        # `kind` falls back to `relation`, then to the literal "edge".
        kind = str(payload.get("kind") or payload.get("relation") or "edge")
        rule_id = str(payload.get("rule_id") or "")
        note_id = _payload_note_id(payload)
        chunk_id = str(payload.get("chunk_id") or "")

        by_kind[kind] += 1
        group = _rule_group(rule_id)
        group_counts[group] += 1

        # Multi-callout detection: several callout edges of the same relation
        # originating from the same chunk.
        if group == "callout" and chunk_id:
            callout_buckets[(chunk_id, kind)] += 1

        # Per-note structure checks only look at belongs_to / next / prev.
        if note_id and kind in _STRUCTURE_KINDS:
            per_note[note_id][kind] += 1

    return by_kind, group_counts, callout_buckets, per_note


def _build_per_note_checks(per_note: Dict[str, Dict[str, int]]) -> Dict[str, Any]:
    """Turn the per-note counters into the ``per_note_checks`` report section."""
    checks: Dict[str, Any] = {}
    for note_id, stats in per_note.items():
        chunks = stats.get("chunks", 0)
        belongs_to = stats.get("belongs_to", 0)
        nxt = stats.get("next", 0)
        prev = stats.get("prev", 0)
        checks[note_id] = {
            "chunks": chunks,
            "belongs_to": belongs_to,
            "next": nxt,
            "prev": prev,
            "checks": {
                "belongs_to_equals_chunks": (belongs_to == chunks),
                # n chunks in a chain need n - 1 links (never negative).
                "next_prev_match": (nxt == prev == max(chunks - 1, 0)),
            },
        }
    return checks


def main() -> None:
    """Collect the statistics from Qdrant and print the JSON report to stdout."""
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)

    col_notes = f"{cfg.prefix}_notes"
    col_chunks = f"{cfg.prefix}_chunks"
    col_edges = f"{cfg.prefix}_edges"

    # High-level counts
    notes_n = _count_collection_points(client, col_notes)
    chunks_n = _count_collection_points(client, col_chunks)
    edges_pts = _scroll_all(client, col_edges)
    edges_n = len(edges_pts)

    # Chunks per note, needed for the per-note checks.
    chunks_by_note, orphan_chunks = _chunks_per_note(_scroll_all(client, col_chunks))
    if orphan_chunks:
        print(
            f"warning: {orphan_chunks} chunk(s) without note_id are not part of per_note_checks",
            file=sys.stderr,
        )

    by_kind, group_counts, callout_buckets, per_note = _aggregate_edges(edges_pts)

    # Attach the chunk count to every note (creates entries for notes without
    # structural edges as well).
    for n_id, chunk_count in chunks_by_note.items():
        per_note[n_id]["chunks"] = chunk_count

    per_note_checks = _build_per_note_checks(per_note)
    multi_callout_detected = any(v > 1 for v in callout_buckets.values())

    out = {
        "prefix": cfg.prefix,
        "counts": {
            "notes": notes_n,
            "chunks": chunks_n,
            "edges": edges_n,
            "edges_by_kind": dict(by_kind),
            "explicit_total": group_counts.get("explicit", 0),
            "defaults_total": group_counts.get("defaults", 0),
            "callout_total": group_counts.get("callout", 0),
            "inline_total": group_counts.get("inline", 0),
            "structure_total": group_counts.get("structure", 0),
        },
        "per_note_checks": per_note_checks,
        "multi_callout_detected": bool(multi_callout_detected),
        # Not verified by this script: deduplication happens at upsert time.
        "has_duplicates": False,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()