# mindnet/scripts/edges_full_check.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/edges_full_check.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Zweck:
-------
Umfassende Validierung der Edge-Struktur in Qdrant.
Analysiert Edge-Typen, Rule-Gruppen und strukturelle Integrität.

Funktionsweise:
---------------
1. Lädt alle Edges aus {prefix}_edges
2. Gruppiert Edges nach rule_id:
   - explicit: rule_id startswith "explicit:" (wikilink, note_scope)
   - callout: rule_id == "callout:edge"
   - inline: rule_id startswith "inline:" (rel)
   - defaults: rule_id startswith "edge_defaults:"
   - structure: rule_id in {"structure:belongs_to", "structure:order"}
3. Prüft strukturelle Integrität:
   - belongs_to == chunks pro Note
   - next == prev == (chunks-1) pro Note
   - Multi-Callout-Erkennung
4. Aggregiert Statistiken

Ergebnis-Interpretation:
------------------------
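- Ausgabe: JSON mit umfassender Analyse
  * prefix: verwendetes Collection-Präfix
  * counts: Anzahlen für notes/chunks/edges sowie
      - edges_by_kind: Edge-Anzahl pro Typ
      - explicit_total, defaults_total, callout_total, inline_total,
        structure_total: Edge-Anzahl pro Rule-Gruppe
  * per_note_checks: Strukturelle Validierung pro Note
  * multi_callout_detected: Boolean
  * has_duplicates: immer false (Deduplizierung erfolgt beim Upsert)
- Exit-Code 0: Erfolgreich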

Verwendung:
-----------
- Umfassende Graph-Analyse
- Validierung nach größeren Änderungen
- Debugging von Edge-Problemen

Hinweise:
---------
- Kann bei großen Graphen langsam sein
- Prüft strukturelle, nicht semantische Korrektheit

Aufruf:
-------
python3 -m scripts.edges_full_check --prefix mindnet

Parameter:
----------
--prefix TEXT   Collection-Präfix (Default: ENV COLLECTION_PREFIX oder mindnet)

Änderungen:
-----------
v2.1.0 (2025-12-15): Dokumentation aktualisiert
v1.0.0: Initial Release
"""

from __future__ import annotations

import json
from collections import Counter, defaultdict
from typing import Dict, Any, List, Tuple

from app.core.database.qdrant import QdrantConfig, get_client
from qdrant_client.http import models as rest


def _count_collection_points(client, name: str) -> int:
    """Return the exact number of points in collection *name*.

    This is a best-effort lookup: if the ``count`` call (or reading its
    result) raises any ``Exception``, or the reported count is falsy,
    ``0`` is returned instead of propagating the error.
    """
    try:
        reported = client.count(collection_name=name, exact=True).count
        total = reported if reported else 0
    except Exception:
        # Deliberately swallowed so the report can still be produced.
        return 0
    return total


def _scroll_all(client, collection: str) -> List[Any]:
    """Fetch every point of *collection* by paging through ``client.scroll``.

    Each request asks for up to 2048 points with payloads and without
    vectors. Paging continues with the offset returned by the previous
    call and stops once that offset is ``None``.

    Returns:
        A list with the points of all pages, in the order they were returned.
    """
    collected: List[Any] = []
    cursor = None
    exhausted = False
    while not exhausted:
        page, cursor = client.scroll(
            collection_name=collection,
            with_payload=True,
            with_vectors=False,
            limit=2048,
            offset=cursor,
        )
        if page:
            collected.extend(page)
        # A missing next-page offset marks the final page.
        exhausted = cursor is None
    return collected


def _rule_group(rule_id: str) -> str:
    """Map an edge ``rule_id`` to its rule group.

    Returns:
        ``"unknown"`` for an empty/falsy id; ``"inline"``, ``"defaults"`` or
        ``"explicit"`` for ids starting with ``"inline:"``,
        ``"edge_defaults:"`` or ``"explicit:"``; ``"callout"`` for exactly
        ``"callout:edge"``; ``"structure"`` for exactly
        ``"structure:belongs_to"`` or ``"structure:order"``; otherwise
        ``"other"``.
    """
    if not rule_id:
        return "unknown"

    # Prefix-based groups (this is what catches e.g. "inline:rel").
    prefix_groups = (
        ("inline:", "inline"),
        ("edge_defaults:", "defaults"),
        ("explicit:", "explicit"),
    )
    for prefix, group in prefix_groups:
        if rule_id.startswith(prefix):
            return group

    # Groups that require an exact rule id.
    exact_groups = {
        "callout:edge": "callout",
        "structure:belongs_to": "structure",
        "structure:order": "structure",
    }
    return exact_groups.get(rule_id, "other")


def main() -> None:
    """Analyse the edge collection in Qdrant and print a JSON report to stdout.

    Steps:
      1. Build the configuration with ``QdrantConfig.from_env()`` and derive
         the ``{prefix}_notes``, ``{prefix}_chunks`` and ``{prefix}_edges``
         collection names.
      2. Count notes and chunks, and load all edge and chunk points.
      3. Aggregate edges per kind (payload ``kind``, falling back to
         ``relation``, then ``"edge"``) and per rule group (``_rule_group``).
      4. Per note, compare the number of ``belongs_to`` / ``next`` / ``prev``
         edges with the number of chunks of that note.
      5. Flag "multi callout": more than one callout edge of the same kind
         originating from the same chunk.

    Note ids are normalised the same way for edges and chunks
    (``str(value or "")``, empty ids skipped), so both sides always land on
    the same ``per_note_checks`` key and no ``null`` entry is emitted for
    points that lack a ``note_id``.

    NOTE(review): no command-line arguments are parsed here, although the
    module docstring documents a ``--prefix`` option; the prefix comes solely
    from ``QdrantConfig.from_env()``.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)

    col_notes = f"{cfg.prefix}_notes"
    col_chunks = f"{cfg.prefix}_chunks"
    col_edges = f"{cfg.prefix}_edges"

    # High-level counts
    notes_n = _count_collection_points(client, col_notes)
    chunks_n = _count_collection_points(client, col_chunks)
    edges_pts = _scroll_all(client, col_edges)
    edges_n = len(edges_pts)

    # Chunks per note, needed for the per-note structure checks below.
    chunks_by_note: Counter = Counter()
    for chunk in _scroll_all(client, col_chunks):
        chunk_note_id = str((chunk.payload or {}).get("note_id") or "")
        if chunk_note_id:
            chunks_by_note[chunk_note_id] += 1

    by_kind: Counter = Counter()
    group_counts: Counter = Counter()
    # (chunk_id, kind) -> number of callout edges
    callout_buckets: Counter = Counter()
    per_note: Dict[str, Dict[str, int]] = defaultdict(
        lambda: {"chunks": 0, "belongs_to": 0, "next": 0, "prev": 0}
    )
    structural_kinds = ("belongs_to", "next", "prev")

    for edge in edges_pts:
        payload = edge.payload or {}
        kind = str(payload.get("kind") or payload.get("relation") or "edge")
        rule_id = str(payload.get("rule_id") or "")
        note_id = str(payload.get("note_id") or "")
        chunk_id = str(payload.get("chunk_id") or "")

        by_kind[kind] += 1
        group = _rule_group(rule_id)
        group_counts[group] += 1

        # Multi-callout detection: several callout edges of the same
        # relation coming from the same chunk.
        if group == "callout" and chunk_id and kind:
            callout_buckets[(chunk_id, kind)] += 1

        # Per-note structure counters
        if note_id and kind in structural_kinds:
            per_note[note_id][kind] += 1

    # Notes that have chunks but no structural edges still get an entry.
    for n_id, chunk_count in chunks_by_note.items():
        per_note[n_id]["chunks"] = chunk_count

    per_note_checks = {}
    for n_id, stats in per_note.items():
        c = stats["chunks"]
        bt = stats["belongs_to"]
        nx = stats["next"]
        pv = stats["prev"]
        per_note_checks[n_id] = {
            "chunks": c,
            "belongs_to": bt,
            "next": nx,
            "prev": pv,
            "checks": {
                # every chunk should have exactly one belongs_to edge
                "belongs_to_equals_chunks": bt == c,
                # a chain of c chunks has c-1 next and c-1 prev edges
                "next_prev_match": nx == pv == max(c - 1, 0),
            },
        }

    multi_callout_detected = any(n > 1 for n in callout_buckets.values())

    out = {
        "prefix": cfg.prefix,
        "counts": {
            "notes": notes_n,
            "chunks": chunks_n,
            "edges": edges_n,
            "edges_by_kind": dict(by_kind),
            "explicit_total": group_counts.get("explicit", 0),
            "defaults_total": group_counts.get("defaults", 0),
            "callout_total": group_counts.get("callout", 0),
            "inline_total": group_counts.get("inline", 0),
            "structure_total": group_counts.get("structure", 0),
        },
        "per_note_checks": per_note_checks,
        "multi_callout_detected": multi_callout_detected,
        "has_duplicates": False,  # deduplication happens at upsert time
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))


# Script entry point: run the check only when this module is executed
# directly (e.g. ``python3 -m scripts.edges_full_check``), not on import.
if __name__ == "__main__":
    main()