scripts/edges_full_check.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s

This commit is contained in:
Lars 2025-11-17 16:04:21 +01:00
parent c691123d2d
commit 3e08c8347e

View File

@ -3,123 +3,160 @@
from __future__ import annotations from __future__ import annotations
import json import json
from collections import Counter import os
from collections import Counter, defaultdict
from typing import Dict, Tuple
from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client from app.core.qdrant import QdrantConfig, get_client
def fetch_all(client, col): def _rel(payload: dict) -> str:
return payload.get("relation") or payload.get("kind") or "edge"
def _count_by_kind(edges_payloads):
c = Counter()
for pl in edges_payloads:
c[_rel(pl)] += 1
return dict(c)
def _is_explicit(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:")
def _is_default(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("edge_defaults:")
def _is_callout(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("callout:")
def _is_inline(pl: dict) -> bool:
rid = (pl.get("rule_id") or "").lower()
return rid.startswith("inline:")
def _scroll_all(client, col_name: str):
points = [] points = []
next_offset = None next_page = None
while True: while True:
res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset) res, next_page = client.scroll(
batch = res[0] collection_name=col_name,
next_offset = res[1] with_payload=True,
points.extend(batch) with_vectors=False,
if not next_offset: limit=2048,
offset=next_page,
)
points.extend(res)
if next_page is None:
break break
return points return points
def is_callout_rule(rule_id: str) -> bool:
    """True when a rule_id belongs to the callout edge family.

    Any rule_id containing "callout" (case-insensitive) qualifies; empty or
    None rule_ids never do.  (The substring test subsumes the
    "callout:edge:v1" prefix form.)
    """
    if not rule_id:
        return False
    return "callout" in rule_id.lower()
def main():
    """Run consistency checks over the notes/chunks/edges Qdrant collections.

    Prints a JSON report containing global counts, per-note structural checks
    (belongs_to count == chunk count, next/prev pairing), duplicate-edge
    detection, and multi-callout detection.

    Environment: COLLECTION_PREFIX overrides the config's collection prefix.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix)
    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }

    # 1) Read all edges.
    edge_pts = _scroll_all(client, cols["edges"])
    edges_payloads = [p.payload or {} for p in edge_pts]

    # 2) Global totals & rule-family classification.
    edges_by_kind = _count_by_kind(edges_payloads)
    explicit_total = sum(1 for pl in edges_payloads if _is_explicit(pl))
    defaults_total = sum(1 for pl in edges_payloads if _is_default(pl))
    callout_total = sum(1 for pl in edges_payloads if _is_callout(pl))
    inline_total = sum(1 for pl in edges_payloads if _is_inline(pl))

    # 3) Per-note checks.
    per_note = {}

    # Chunks per note.
    chunk_counts: Dict[str, int] = defaultdict(int)
    for ch in _scroll_all(client, cols["chunks"]):
        nid = (ch.payload or {}).get("note_id")
        if nid:
            chunk_counts[nid] += 1

    # Edges per note.
    edges_by_note: Dict[str, list] = defaultdict(list)
    for pl in edges_payloads:
        nid = pl.get("note_id")
        if nid:
            edges_by_note[nid].append(pl)

    multi_callout_detected = False
    dup_seen = set()
    has_duplicates = False

    # BUGFIX: iterate the union of note ids instead of only edges_by_note, so
    # a note that has chunks but no edges at all still appears in the report
    # with a failing belongs_to_equals_chunks check rather than being
    # silently omitted.
    for nid in set(edges_by_note) | set(chunk_counts):
        pls = edges_by_note.get(nid, [])
        by_kind = Counter(_rel(pl) for pl in pls)
        belongs_to = by_kind.get("belongs_to", 0)
        next_cnt = by_kind.get("next", 0)
        prev_cnt = by_kind.get("prev", 0)
        chunks = chunk_counts.get(nid, 0)

        # Duplicate edges: identical (source, target, relation, rule_id).
        for pl in pls:
            key = (
                str(pl.get("source_id") or ""),
                str(pl.get("target_id") or ""),
                str(_rel(pl)),
                str(pl.get("rule_id") or ""),
            )
            if key in dup_seen:
                has_duplicates = True
            dup_seen.add(key)

        # Multi-callouts: same chunk_id + relation + rule_id with >= 2 edges.
        call_key_counter = Counter(
            (pl.get("chunk_id"), _rel(pl), pl.get("rule_id"))
            for pl in pls
            if _is_callout(pl)
        )
        if any(v >= 2 for v in call_key_counter.values()):
            multi_callout_detected = True

        per_note[nid] = {
            "chunks": chunks,
            "belongs_to": belongs_to,
            "next": next_cnt,
            "prev": prev_cnt,
            "checks": {
                "belongs_to_equals_chunks": (belongs_to == chunks),
                # A note with N chunks needs N-1 next edges and N-1 prev edges.
                "next_prev_match": (next_cnt == prev_cnt == max(0, chunks - 1)),
            },
        }

    out = {
        "prefix": prefix,
        "counts": {
            "notes": client.count(collection_name=cols["notes"], exact=True).count,
            "chunks": client.count(collection_name=cols["chunks"], exact=True).count,
            "edges": client.count(collection_name=cols["edges"], exact=True).count,
            "edges_by_kind": edges_by_kind,
            "explicit_total": explicit_total,
            "defaults_total": defaults_total,
            "callout_total": callout_total,
            "inline_total": inline_total,
        },
        "per_note_checks": per_note,
        "multi_callout_detected": multi_callout_detected,
        "has_duplicates": has_duplicates,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
if __name__ == "__main__": if __name__ == "__main__":