mindnet/tests/test_edges_smoke.py
Lars b4287cbfda
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
Dateien nach "tests" hochladen
2025-11-11 17:25:54 +01:00

130 lines
4.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
scripts/test_edges_smoke.py
Integritäts-Check für mindnet-Edges in Qdrant.
Prüft pro Note:
- Chunk-Anzahl (mindnet_chunks) = belongs_to-Kanten
- next/prev-Kanten: jeweils (#Chunks - 1)
- Dedupe: kein Duplikat (key=(kind,source_id,target_id,scope))
- references (chunk-scope): vorhanden, wenn Wikilinks erwartet werden (nur Zählreport)
- optional note-scope references/backlink: vorhanden, wenn --note-scope-refs genutzt wurde
Ausgabe: JSON pro Note + Gesamtsummary.
"""
from __future__ import annotations
import json, os, sys
from typing import Dict, Any, List, Tuple, Set
from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client
def collections(prefix: str) -> Tuple[str, str, str]:
    """Map a collection *prefix* to its (notes, chunks, edges) collection names."""
    notes_col = f"{prefix}_notes"
    chunks_col = f"{prefix}_chunks"
    edges_col = f"{prefix}_edges"
    return notes_col, chunks_col, edges_col
def scroll_ids(client, collection: str, filt: rest.Filter | None = None, payload=False, limit=256):
    """Yield every point of *collection* matching *filt*, paging via scroll.

    Args:
        client: Qdrant client (anything exposing ``scroll``).
        collection: collection name to scan.
        filt: optional payload filter; ``None`` scans everything.
        payload: whether to fetch point payloads (``with_payload``).
        limit: page size per scroll request.

    Yields:
        The raw scroll points, one at a time.

    FIX: Qdrant signals exhaustion by returning ``next_page_offset=None``
    together with the FINAL (possibly non-empty) batch. The old loop only
    stopped on an empty batch, so the next iteration would scroll with
    ``offset=None`` again and restart from the beginning — an infinite loop
    that re-yielded the whole collection. We now also stop once the server
    reports no further page.
    """
    next_page = None
    while True:
        pts, next_page = client.scroll(
            collection_name=collection,
            scroll_filter=filt,
            with_payload=payload,
            with_vectors=False,
            limit=limit,
            offset=next_page,
        )
        for p in pts:
            yield p
        # No more pages: either the batch was empty or the server returned
        # no continuation offset.
        if not pts or next_page is None:
            break
def list_notes(client, prefix: str) -> List[Dict[str, Any]]:
    """Collect id/title/type for every note stored in ``<prefix>_notes``.

    Points without a usable id (neither ``note_id`` nor ``id`` in the
    payload) are skipped.
    """
    notes_col = collections(prefix)[0]
    results: List[Dict[str, Any]] = []
    for point in scroll_ids(client, notes_col, None, payload=True):
        data = point.payload or {}
        note_id = data.get("note_id") or data.get("id")
        if not note_id:
            continue
        results.append({
            "note_id": note_id,
            "title": data.get("title"),
            "type": data.get("type"),
        })
    return results
def count_chunks_for_note(client, prefix: str, note_id: str) -> int:
    """Count the chunk points in ``<prefix>_chunks`` whose payload ``note_id`` matches."""
    chunks_col = collections(prefix)[1]
    match_note = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    total = 0
    for _ in scroll_ids(client, chunks_col, match_note, payload=False):
        total += 1
    return total
def fetch_edges_for_note(client, prefix: str, note_id: str) -> List[Dict[str, Any]]:
    """Return the payloads of all edge points in ``<prefix>_edges`` for *note_id*."""
    edges_col = collections(prefix)[2]
    note_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    payloads: List[Dict[str, Any]] = []
    for point in scroll_ids(client, edges_col, note_filter, payload=True):
        payloads.append(point.payload or {})
    return payloads
def main():
    """Run the edge-integrity smoke check and print a JSON report to stdout.

    For every note in ``<prefix>_notes`` it verifies:
    - ``belongs_to`` edge count equals the note's chunk count,
    - ``next`` and ``prev`` edges each equal ``max(#chunks - 1, 0)``,
    - no duplicate edges (key = (kind, source_id, target_id, scope)).
    Reference/backlink counts are tallied per scope for the summary only.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    notes = list_notes(client, cfg.prefix)
    report = []
    total = {"notes": 0, "chunks": 0, "belongs_to": 0, "next": 0, "prev": 0,
             "refs_chunk": 0, "refs_note": 0, "backlink": 0, "dup_edges": 0}
    for n in notes:
        nid = n["note_id"]
        total["notes"] += 1
        chunk_count = count_chunks_for_note(client, cfg.prefix, nid)
        total["chunks"] += chunk_count
        edges = fetch_edges_for_note(client, cfg.prefix, nid)
        by_kind = {}
        keys: Set[tuple] = set()
        dup_count = 0
        for e in edges:
            k = e.get("kind")
            by_kind[k] = by_kind.get(k, 0) + 1
            # An edge is a duplicate when its full identity tuple repeats.
            t = (e.get("kind"), e.get("source_id"), e.get("target_id"), e.get("scope"))
            if t in keys:
                dup_count += 1
            else:
                keys.add(t)
        bt = by_kind.get("belongs_to", 0)
        nx = by_kind.get("next", 0)
        pv = by_kind.get("prev", 0)
        # FIX: count "references" edges strictly per scope. The previous code
        # reported the TOTAL "references" count (both scopes) as refs_chunk
        # whenever at least one chunk-scope reference existed, which
        # double-counted note-scope references already tallied in refs_note.
        rc = sum(1 for e in edges if e.get("kind") == "references" and e.get("scope") == "chunk")
        rn = sum(1 for e in edges if e.get("kind") == "references" and e.get("scope") == "note")
        bl = by_kind.get("backlink", 0)
        total["belongs_to"] += bt
        total["next"] += nx
        total["prev"] += pv
        total["refs_chunk"] += rc
        total["refs_note"] += rn
        total["backlink"] += bl
        total["dup_edges"] += dup_count
        ok_bt = (bt == chunk_count)
        # A linear chain of k chunks has exactly k-1 next and k-1 prev edges.
        ok_seq = (nx == max(chunk_count - 1, 0) and pv == max(chunk_count - 1, 0))
        ok_dup = (dup_count == 0)
        report.append({
            "note_id": nid,
            "title": n.get("title"),
            "type": n.get("type"),
            "chunks": chunk_count,
            "edges_by_kind": by_kind,
            "checks": {
                "belongs_to_equals_chunks": ok_bt,
                "next_prev_match": ok_seq,
                "no_duplicate_edges": ok_dup,
            },
        })
    out = {"prefix": cfg.prefix, "summary": total, "notes": report}
    print(json.dumps(out, ensure_ascii=False, indent=2))
if __name__ == "__main__":
main()