scripts/edges_full_check.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
This commit is contained in:
parent
c691123d2d
commit
3e08c8347e
|
|
@ -3,123 +3,160 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from collections import Counter
|
import os
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
from app.core.qdrant import QdrantConfig, get_client
|
from app.core.qdrant import QdrantConfig, get_client
|
||||||
|
|
||||||
|
|
||||||
def fetch_all(client, col):
|
def _rel(payload: dict) -> str:
|
||||||
|
return payload.get("relation") or payload.get("kind") or "edge"
|
||||||
|
|
||||||
|
|
||||||
|
def _count_by_kind(edges_payloads):
    """Count edge payloads per relation label.

    Args:
        edges_payloads: iterable of edge payload dicts.

    Returns:
        A plain ``dict`` mapping each label produced by ``_rel`` to the number
        of payloads carrying it.
    """
    return dict(Counter(_rel(pl) for pl in edges_payloads))
|
||||||
|
|
||||||
|
|
||||||
|
def _is_explicit(pl: dict) -> bool:
|
||||||
|
rid = (pl.get("rule_id") or "").lower()
|
||||||
|
return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_default(pl: dict) -> bool:
|
||||||
|
rid = (pl.get("rule_id") or "").lower()
|
||||||
|
return rid.startswith("edge_defaults:")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_callout(pl: dict) -> bool:
|
||||||
|
rid = (pl.get("rule_id") or "").lower()
|
||||||
|
return rid.startswith("callout:")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_inline(pl: dict) -> bool:
|
||||||
|
rid = (pl.get("rule_id") or "").lower()
|
||||||
|
return rid.startswith("inline:")
|
||||||
|
|
||||||
|
|
||||||
|
def _scroll_all(client, col_name: str):
|
||||||
points = []
|
points = []
|
||||||
next_offset = None
|
next_page = None
|
||||||
while True:
|
while True:
|
||||||
res = client.scroll(collection_name=col, with_payload=True, with_vectors=False, limit=2048, offset=next_offset)
|
res, next_page = client.scroll(
|
||||||
batch = res[0]
|
collection_name=col_name,
|
||||||
next_offset = res[1]
|
with_payload=True,
|
||||||
points.extend(batch)
|
with_vectors=False,
|
||||||
if not next_offset:
|
limit=2048,
|
||||||
|
offset=next_page,
|
||||||
|
)
|
||||||
|
points.extend(res)
|
||||||
|
if next_page is None:
|
||||||
break
|
break
|
||||||
return points
|
return points
|
||||||
|
|
||||||
|
|
||||||
def is_callout_rule(rule_id: str) -> bool:
    """Tell whether a rule id refers to a callout rule.

    Returns ``False`` for an empty or ``None`` rule id; otherwise ``True`` when
    the lower-cased id contains the substring ``callout``.
    """
    if not rule_id:
        return False
    # A "callout:edge:v1" prefix always contains "callout", so the substring
    # test alone covers both conditions of the original expression.
    return "callout" in rule_id.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Print a JSON consistency report for the notes, chunks and edges collections.

    The Qdrant connection comes from ``QdrantConfig.from_env()``. Collection
    names are ``<prefix>_notes``, ``<prefix>_chunks`` and ``<prefix>_edges``,
    where the prefix is the ``COLLECTION_PREFIX`` environment variable if set
    and ``cfg.prefix`` otherwise.

    The report written to stdout contains:
      * exact point counts per collection, edge counts per relation, and
        totals of explicit / default / callout / inline edges;
      * per-note checks: ``belongs_to`` edges equal the chunk count, and
        ``next`` and ``prev`` edges both equal ``max(0, chunks - 1)``;
      * ``multi_callout_detected``: some note has at least two callout edges
        sharing ``(chunk_id, relation, rule_id)``;
      * ``has_duplicates``: some ``(source_id, target_id, relation, rule_id)``
        combination occurs more than once.
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    prefix = os.environ.get("COLLECTION_PREFIX", cfg.prefix)

    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }

    # 1) Read all edges
    edge_pts = _scroll_all(client, cols["edges"])
    edges_payloads = [p.payload or {} for p in edge_pts]

    # 2) Totals and classifications
    edges_by_kind = _count_by_kind(edges_payloads)
    explicit_total = sum(1 for pl in edges_payloads if _is_explicit(pl))
    defaults_total = sum(1 for pl in edges_payloads if _is_default(pl))
    callout_total = sum(1 for pl in edges_payloads if _is_callout(pl))
    inline_total = sum(1 for pl in edges_payloads if _is_inline(pl))

    # 3) Per-note checks
    per_note = {}

    # Chunks per note
    chunk_counts: Dict[str, int] = defaultdict(int)
    for ch in _scroll_all(client, cols["chunks"]):
        nid = (ch.payload or {}).get("note_id")
        if nid:
            chunk_counts[nid] += 1

    # Edges per note
    edges_by_note: Dict[str, list] = defaultdict(list)
    for pl in edges_payloads:
        nid = pl.get("note_id")
        if nid:
            edges_by_note[nid].append(pl)

    multi_callout_detected = False
    dup_seen = set()
    has_duplicates = False

    # Notes that own chunks but have no edges are checked as well; iterating
    # only over edges_by_note would silently drop exactly the notes whose
    # structural edges are missing. Notes with edges keep their original order.
    note_ids = dict.fromkeys([*edges_by_note, *chunk_counts])

    for nid in note_ids:
        pls = edges_by_note.get(nid, [])
        by_kind = Counter(_rel(pl) for pl in pls)
        belongs_to = by_kind.get("belongs_to", 0)
        next_cnt = by_kind.get("next", 0)
        prev_cnt = by_kind.get("prev", 0)
        chunks = chunk_counts.get(nid, 0)

        # Duplicates (dup_seen is shared across notes)
        for pl in pls:
            key = (
                str(pl.get("source_id") or ""),
                str(pl.get("target_id") or ""),
                str(_rel(pl)),
                str(pl.get("rule_id") or ""),
            )
            if key in dup_seen:
                has_duplicates = True
            dup_seen.add(key)

        # Multiple callouts: several callout edges of this note share the same
        # chunk_id + relation + rule_id
        call_key_counter = Counter(
            (pl.get("chunk_id"), _rel(pl), pl.get("rule_id"))
            for pl in pls
            if _is_callout(pl)
        )
        if any(v >= 2 for v in call_key_counter.values()):
            multi_callout_detected = True

        per_note[nid] = {
            "chunks": chunks,
            "belongs_to": belongs_to,
            "next": next_cnt,
            "prev": prev_cnt,
            "checks": {
                "belongs_to_equals_chunks": (belongs_to == chunks),
                "next_prev_match": (next_cnt == prev_cnt == max(0, chunks - 1)),
            },
        }

    out = {
        "prefix": prefix,
        "counts": {
            "notes": client.count(collection_name=cols["notes"], exact=True).count,
            "chunks": client.count(collection_name=cols["chunks"], exact=True).count,
            "edges": client.count(collection_name=cols["edges"], exact=True).count,
            "edges_by_kind": edges_by_kind,
            "explicit_total": explicit_total,
            "defaults_total": defaults_total,
            "callout_total": callout_total,
            "inline_total": inline_total,
        },
        "per_note_checks": per_note,
        "multi_callout_detected": multi_callout_detected,
        "has_duplicates": has_duplicates,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user