scripts/edges_full_check.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-17 16:37:00 +01:00
parent fd215c18e4
commit 46b26c9624

View File

@ -1,160 +1,160 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""
scripts/edges_full_check.py
Zählt und validiert Kanten in Qdrant. Erkennt folgende Rule-Gruppen:
- explicit_total: rule_id startswith "explicit:" (z.B. explicit:wikilink, explicit:note_scope)
- callout_total: rule_id == "callout:edge"
- inline_total: rule_id startswith "inline:" (z.B. inline:rel)
- defaults_total: rule_id startswith "edge_defaults:"
- structure: rule_id in {"structure:belongs_to","structure:order"}
Gibt zusätzlich:
- edges_by_kind (aggregiert)
- notes/chunks/edges Anzahlen
- multi_callout_detected: True, falls ein Chunk mehrere Callout-Ziele der gleichen Relation enthält
- per_note_checks: belongs_to == chunks, next == prev == (chunks-1)
"""
from __future__ import annotations from __future__ import annotations
import json import json
import os
from collections import Counter, defaultdict from collections import Counter, defaultdict
from typing import Dict, Tuple from typing import Dict, Any, List, Tuple
from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client from app.core.qdrant import QdrantConfig, get_client
from qdrant_client.http import models as rest
def _rel(payload: dict) -> str: def _count_collection_points(client, name: str) -> int:
return payload.get("relation") or payload.get("kind") or "edge" try:
res = client.count(collection_name=name, exact=True)
return res.count or 0
except Exception:
return 0
def _count_by_kind(edges_payloads): def _scroll_all(client, collection: str) -> List[Any]:
c = Counter() pts_all = []
for pl in edges_payloads: offset = None
c[_rel(pl)] += 1
return dict(c)
def _is_explicit(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:")
def _is_default(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("edge_defaults:")
def _is_callout(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("callout:")
def _is_inline(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("inline:")
def _scroll_all(client, col_name: str):
points = []
next_page = None
while True: while True:
res, next_page = client.scroll( pts, offset = client.scroll(
collection_name=col_name, collection_name=collection,
with_payload=True, with_payload=True,
with_vectors=False, with_vectors=False,
limit=2048, limit=2048,
offset=next_page, offset=offset,
) )
points.extend(res) pts_all.extend(pts or [])
if next_page is None: if offset is None:
break break
return points return pts_all
def main(): def _rule_group(rule_id: str) -> str:
if not rule_id:
return "unknown"
if rule_id == "callout:edge":
return "callout"
if rule_id.startswith("inline:"): # <—— wichtig für "inline:rel"
return "inline"
if rule_id.startswith("edge_defaults:"):
return "defaults"
if rule_id.startswith("explicit:"):
return "explicit"
if rule_id in ("structure:belongs_to", "structure:order"):
return "structure"
return "other"
def main() -> None:
    """Collect edge statistics from Qdrant and print them as a JSON report.

    Reads the ``{prefix}_notes`` / ``{prefix}_chunks`` / ``{prefix}_edges``
    collections, aggregates edges by kind and by rule group, detects
    multi-target callouts (several callout edges of the same relation from
    one chunk), and validates the per-note structural invariants:
    belongs_to == chunks and next == prev == max(chunks - 1, 0).
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)

    col_notes = f"{cfg.prefix}_notes"
    col_chunks = f"{cfg.prefix}_chunks"
    col_edges = f"{cfg.prefix}_edges"

    # High-level counts. Edges are scrolled in full anyway, so the length of
    # the scrolled list doubles as the edge count.
    notes_n = _count_collection_points(client, col_notes)
    chunks_n = _count_collection_points(client, col_chunks)
    edges_pts = _scroll_all(client, col_edges)
    edges_n = len(edges_pts)

    by_kind: Counter = Counter()
    group_counts: Counter = Counter()
    # (chunk_id, kind) -> number of callout targets; any value > 1 means a
    # multi-callout was emitted for that chunk/relation pair.
    callout_buckets: Dict[Tuple[str, str], int] = defaultdict(int)
    per_note: Dict[str, Dict[str, int]] = defaultdict(
        lambda: {"chunks": 0, "belongs_to": 0, "next": 0, "prev": 0}
    )

    # Chunk counts per note, needed for the per-note structure checks.
    # BUGFIX: filter out falsy note ids so a chunk whose payload lacks
    # "note_id" does not inject a spurious None/"" key into per_note_checks
    # (the edge loop below already guards with `if note_id:`).
    chunks_pts = _scroll_all(client, col_chunks)
    chunks_by_note = Counter(
        nid
        for nid in (p.payload.get("note_id") for p in chunks_pts if p.payload)
        if nid
    )

    for p in edges_pts:
        pl = p.payload or {}
        kind = str(pl.get("kind") or pl.get("relation") or "edge")
        rule_id = str(pl.get("rule_id") or "")
        note_id = str(pl.get("note_id") or "")
        chunk_id = str(pl.get("chunk_id") or "")

        by_kind[kind] += 1
        group = _rule_group(rule_id)
        group_counts[group] += 1

        # Multi-callout detection: several callout edges of the same
        # relation originating from the same chunk.
        if group == "callout" and chunk_id and kind:
            callout_buckets[(chunk_id, kind)] += 1

        # Per-note structure bookkeeping (only edges that carry a note_id).
        if note_id:
            if kind == "belongs_to":
                per_note[note_id]["belongs_to"] += 1
            elif kind == "next":
                per_note[note_id]["next"] += 1
            elif kind == "prev":
                per_note[note_id]["prev"] += 1

    # Attach the chunk counts collected above.
    for n_id, c in chunks_by_note.items():
        per_note[n_id]["chunks"] = c

    # Evaluate the structural invariants per note.
    per_note_checks: Dict[str, Dict[str, Any]] = {}
    for n_id, stats in per_note.items():
        c = stats.get("chunks", 0)
        bt = stats.get("belongs_to", 0)
        nx = stats.get("next", 0)
        pv = stats.get("prev", 0)
        per_note_checks[n_id] = {
            "chunks": c,
            "belongs_to": bt,
            "next": nx,
            "prev": pv,
            "checks": {
                "belongs_to_equals_chunks": (bt == c),
                "next_prev_match": (nx == pv == max(c - 1, 0)),
            },
        }

    multi_callout_detected = any(v > 1 for v in callout_buckets.values())

    out = {
        "prefix": cfg.prefix,
        "counts": {
            "notes": notes_n,
            "chunks": chunks_n,
            "edges": edges_n,
            "edges_by_kind": dict(by_kind),
            "explicit_total": group_counts.get("explicit", 0),
            "defaults_total": group_counts.get("defaults", 0),
            "callout_total": group_counts.get("callout", 0),
            "inline_total": group_counts.get("inline", 0),
            "structure_total": group_counts.get("structure", 0),
        },
        "per_note_checks": per_note_checks,
        "multi_callout_detected": bool(multi_callout_detected),
        "has_duplicates": False,  # dedupe happens at upsert time
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))