#!/usr/bin/env python3 # -*- coding: utf-8 -*- from __future__ import annotations import json import os from collections import Counter, defaultdict from qdrant_client.http import models as rest from app.core.database.qdrant import QdrantConfig, get_client def _rel(pl: dict) -> str: return pl.get("relation") or pl.get("kind") or "edge" def _scroll(client, col): pts = [] next_page = None while True: res, next_page = client.scroll( collection_name=col, with_payload=True, with_vectors=False, limit=1024, offset=next_page, ) pts.extend(res) if next_page is None: break return pts def main(): cfg = QdrantConfig.from_env() client = get_client(cfg) prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix) cols = { "notes": f"{prefix}_notes", "chunks": f"{prefix}_chunks", "edges": f"{prefix}_edges", } # Index: notes -> title/type notes_meta = {} for p in _scroll(client, cols["notes"]): pl = p.payload or {} nid = pl.get("note_id") if nid: notes_meta[nid] = { "title": pl.get("title", ""), "type": pl.get("type", ""), } # chunks je note chunks_by_note = defaultdict(int) for p in _scroll(client, cols["chunks"]): pl = p.payload or {} nid = pl.get("note_id") if nid: chunks_by_note[nid] += 1 # edges je note edges_by_note = defaultdict(list) edges_all = _scroll(client, cols["edges"]) for p in edges_all: pl = p.payload or {} nid = pl.get("note_id") if nid: edges_by_note[nid].append(pl) # pro note ausgeben summary_edges = Counter() total_chunks = 0 for nid in sorted(notes_meta.keys()): meta = notes_meta[nid] chunks = chunks_by_note.get(nid, 0) total_chunks += chunks kinds = Counter(_rel(pl) for pl in edges_by_note[nid]) summary_edges.update(kinds) row = { "note_id": nid, "title": meta["title"], "type": meta["type"], "chunks": chunks, "edges_by_kind": dict(kinds), "checks": { "belongs_to_equals_chunks": (kinds.get("belongs_to", 0) == chunks), "next_prev_match": (kinds.get("next", 0) == kinds.get("prev", 0) == max(0, chunks - 1)), "no_duplicate_edges": _no_dupes(edges_by_note[nid]), }, } print(json.dumps(row, ensure_ascii=False)) # Gesamtsummary total_notes = len(notes_meta) out = { "prefix": prefix, "summary": { "notes": total_notes, "chunks": total_chunks, "belongs_to": summary_edges.get("belongs_to", 0), "next": summary_edges.get("next", 0), "prev": summary_edges.get("prev", 0), "refs_chunk": summary_edges.get("references", 0), "refs_note": summary_edges.get("references_note", 0), "backlink": summary_edges.get("backlink", 0), "dup_edges": 0, # per-Note geprüft }, } print(json.dumps(out, ensure_ascii=False)) def _no_dupes(pls): seen = set() for pl in pls: key = ( str(pl.get("source_id") or ""), str(pl.get("target_id") or ""), str(pl.get("relation") or pl.get("kind") or ""), str(pl.get("rule_id") or ""), ) if key in seen: return False seen.add(key) return True if __name__ == "__main__": main()