scripts/edges_full_check.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
fd215c18e4
commit
46b26c9624
|
|
@ -1,160 +1,160 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
scripts/edges_full_check.py
|
||||||
|
Zählt und validiert Kanten in Qdrant. Erkennt folgende Rule-Gruppen:
|
||||||
|
- explicit_total: rule_id startswith "explicit:" (z.B. explicit:wikilink, explicit:note_scope)
|
||||||
|
- callout_total: rule_id == "callout:edge"
|
||||||
|
- inline_total: rule_id startswith "inline:" (z.B. inline:rel)
|
||||||
|
- defaults_total: rule_id startswith "edge_defaults:"
|
||||||
|
- structure: rule_id in {"structure:belongs_to","structure:order"}
|
||||||
|
|
||||||
|
Gibt zusätzlich:
|
||||||
|
- edges_by_kind (aggregiert)
|
||||||
|
- notes/chunks/edges Anzahlen
|
||||||
|
- multi_callout_detected: True, falls ein Chunk mehrere Callout-Ziele der gleichen Relation enthält
|
||||||
|
- per_note_checks: belongs_to == chunks, next == prev == (chunks-1)
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
from typing import Dict, Tuple
|
from typing import Dict, Any, List, Tuple
|
||||||
|
|
||||||
from qdrant_client.http import models as rest
|
|
||||||
from app.core.qdrant import QdrantConfig, get_client
|
from app.core.qdrant import QdrantConfig, get_client
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
|
|
||||||
def _rel(payload: dict) -> str:
|
def _count_collection_points(client, name: str) -> int:
|
||||||
return payload.get("relation") or payload.get("kind") or "edge"
|
try:
|
||||||
|
res = client.count(collection_name=name, exact=True)
|
||||||
|
return res.count or 0
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def _count_by_kind(edges_payloads):
|
def _scroll_all(client, collection: str) -> List[Any]:
|
||||||
c = Counter()
|
pts_all = []
|
||||||
for pl in edges_payloads:
|
offset = None
|
||||||
c[_rel(pl)] += 1
|
|
||||||
return dict(c)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_explicit(pl: dict) -> bool:
|
|
||||||
rid = (pl.get("rule_id") or "").lower()
|
|
||||||
return rid.startswith("explicit:") or rid.startswith("inline:") or rid.startswith("callout:")
|
|
||||||
|
|
||||||
|
|
||||||
def _is_default(pl: dict) -> bool:
|
|
||||||
rid = (pl.get("rule_id") or "").lower()
|
|
||||||
return rid.startswith("edge_defaults:")
|
|
||||||
|
|
||||||
|
|
||||||
def _is_callout(pl: dict) -> bool:
|
|
||||||
rid = (pl.get("rule_id") or "").lower()
|
|
||||||
return rid.startswith("callout:")
|
|
||||||
|
|
||||||
|
|
||||||
def _is_inline(pl: dict) -> bool:
|
|
||||||
rid = (pl.get("rule_id") or "").lower()
|
|
||||||
return rid.startswith("inline:")
|
|
||||||
|
|
||||||
|
|
||||||
def _scroll_all(client, col_name: str):
|
|
||||||
points = []
|
|
||||||
next_page = None
|
|
||||||
while True:
|
while True:
|
||||||
res, next_page = client.scroll(
|
pts, offset = client.scroll(
|
||||||
collection_name=col_name,
|
collection_name=collection,
|
||||||
with_payload=True,
|
with_payload=True,
|
||||||
with_vectors=False,
|
with_vectors=False,
|
||||||
limit=2048,
|
limit=2048,
|
||||||
offset=next_page,
|
offset=offset,
|
||||||
)
|
)
|
||||||
points.extend(res)
|
pts_all.extend(pts or [])
|
||||||
if next_page is None:
|
if offset is None:
|
||||||
break
|
break
|
||||||
return points
|
return pts_all
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def _rule_group(rule_id: str) -> str:
|
||||||
|
if not rule_id:
|
||||||
|
return "unknown"
|
||||||
|
if rule_id == "callout:edge":
|
||||||
|
return "callout"
|
||||||
|
if rule_id.startswith("inline:"): # <—— wichtig für "inline:rel"
|
||||||
|
return "inline"
|
||||||
|
if rule_id.startswith("edge_defaults:"):
|
||||||
|
return "defaults"
|
||||||
|
if rule_id.startswith("explicit:"):
|
||||||
|
return "explicit"
|
||||||
|
if rule_id in ("structure:belongs_to", "structure:order"):
|
||||||
|
return "structure"
|
||||||
|
return "other"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Collect edge statistics from Qdrant and print them as JSON.

    Reads the ``<prefix>_notes`` / ``<prefix>_chunks`` / ``<prefix>_edges``
    collections, aggregates edge counts by kind and by rule group, runs
    per-note structure checks (belongs_to == chunks,
    next == prev == chunks - 1) and flags chunks that emit several callout
    edges of the same relation (multi_callout_detected).
    """
    cfg = QdrantConfig.from_env()
    client = get_client(cfg)

    col_notes = f"{cfg.prefix}_notes"
    col_chunks = f"{cfg.prefix}_chunks"
    col_edges = f"{cfg.prefix}_edges"

    # High-level counts
    notes_n = _count_collection_points(client, col_notes)
    chunks_n = _count_collection_points(client, col_chunks)
    edges_pts = _scroll_all(client, col_edges)
    edges_n = len(edges_pts)

    # Aggregations: by kind, by rule group, callout fan-out buckets,
    # and per-note structure counters.
    by_kind: Counter = Counter()
    group_counts: Counter = Counter()
    callout_buckets: Dict[Tuple[str, str], int] = defaultdict(int)  # (chunk_id, kind) -> n targets
    per_note = defaultdict(lambda: {"chunks": 0, "belongs_to": 0, "next": 0, "prev": 0})

    # For the per-note checks: load chunk counts per note_id from the
    # chunks collection. Skip chunks without a payload or note_id so a
    # None key never leaks into per_note / the JSON output.
    chunks_pts = _scroll_all(client, col_chunks)
    chunks_by_note = Counter(
        p.payload.get("note_id")
        for p in chunks_pts
        if p.payload and p.payload.get("note_id")
    )

    for p in edges_pts:
        pl = p.payload or {}
        kind = str(pl.get("kind") or pl.get("relation") or "edge")
        rule_id = str(pl.get("rule_id") or "")
        note_id = str(pl.get("note_id") or "")
        chunk_id = str(pl.get("chunk_id") or "")

        by_kind[kind] += 1
        group = _rule_group(rule_id)
        group_counts[group] += 1

        # Multi-callout detection: several callout edges of the same
        # relation originating from the same chunk.
        if group == "callout" and chunk_id and kind:
            callout_buckets[(chunk_id, kind)] += 1

        # Per-note structure counters
        if note_id:
            if kind == "belongs_to":
                per_note[note_id]["belongs_to"] += 1
            elif kind == "next":
                per_note[note_id]["next"] += 1
            elif kind == "prev":
                per_note[note_id]["prev"] += 1

    # Attach the chunk counts to the per-note stats.
    for n_id, c in chunks_by_note.items():
        per_note[n_id]["chunks"] = c

    # Final checks per note
    per_note_checks = {}
    for n_id, stats in per_note.items():
        c = stats.get("chunks", 0)
        bt = stats.get("belongs_to", 0)
        nx = stats.get("next", 0)
        pv = stats.get("prev", 0)
        per_note_checks[n_id] = {
            "chunks": c,
            "belongs_to": bt,
            "next": nx,
            "prev": pv,
            "checks": {
                "belongs_to_equals_chunks": (bt == c),
                # A note with c chunks needs exactly c-1 next/prev links.
                "next_prev_match": (nx == pv == max(c - 1, 0)),
            },
        }

    multi_callout_detected = any(v > 1 for v in callout_buckets.values())

    out = {
        "prefix": cfg.prefix,
        "counts": {
            "notes": notes_n,
            "chunks": chunks_n,
            "edges": edges_n,
            "edges_by_kind": dict(by_kind),
            "explicit_total": group_counts.get("explicit", 0),
            "defaults_total": group_counts.get("defaults", 0),
            "callout_total": group_counts.get("callout", 0),
            "inline_total": group_counts.get("inline", 0),
            "structure_total": group_counts.get("structure", 0),
        },
        "per_note_checks": per_note_checks,
        "multi_callout_detected": bool(multi_callout_detected),
        "has_duplicates": False,  # dedupe happens at upsert time
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user