Dateien nach "tests" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-11-08 20:52:06 +01:00
parent eb6e4028ff
commit e451ea64ae

View File

@ -5,49 +5,22 @@ check_types_registry_edges.py
--------------------------------------------------
Prüft, ob die in config/types.yaml hinterlegten Default-Kanten
pro Note-Typ tatsächlich in der Qdrant *edges*-Collection auftauchen.
Benötigte ENV (wie im Projekt üblich):
- QDRANT_URL (optional; default http://127.0.0.1:6333)
- QDRANT_API_KEY (optional)
- COLLECTION_PREFIX oder MINDNET_PREFIX (bestimmt Collection-Namen)
- TYPES_FILE (optional; default ./config/types.yaml)
Collections (Standard):
- {prefix}_notes
- {prefix}_edges
Ausgabe: JSON-Zeilen mit Countern und ggf. Missing-Hinweisen.
Nutzung:
python3 tests/check_types_registry_edges.py --prefix mindnet
# oder Prefix aus ENV (COLLECTION_PREFIX/MINDNET_PREFIX)
Hinweis:
- Scrollt alle Notes (id type) und alle Edges (edge_type, src_* Felder)
- Mappt Edges zurück auf Note-Typ (über src_note_id oder src_id, heuristisch)
- Vergleicht beobachtete edge_types je Typ mit den in types.yaml geforderten
""" """
import os, sys, json
import os
import sys
import json
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple, Set from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass from dataclasses import dataclass, asdict
try: try:
import yaml # PyYAML import yaml # PyYAML
except Exception as e: except Exception as e:
print(json.dumps({"error": f"PyYAML not installed: {e}"})) print(json.dumps({"error": f"PyYAML not installed: {e}"})); sys.exit(2)
sys.exit(2)
try: try:
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
from qdrant_client.http import models as rest from qdrant_client.http import models as rest # noqa: F401
except Exception as e: except Exception as e:
print(json.dumps({"error": f"qdrant-client not installed: {e}"})) print(json.dumps({"error": f"qdrant-client not installed: {e}"})); sys.exit(2)
sys.exit(2)
@dataclass @dataclass
class Cfg: class Cfg:
@ -58,7 +31,6 @@ class Cfg:
edges: str edges: str
types_file: Path types_file: Path
def _env_prefix() -> Optional[str]: def _env_prefix() -> Optional[str]:
for k in ("COLLECTION_PREFIX", "MINDNET_PREFIX"): for k in ("COLLECTION_PREFIX", "MINDNET_PREFIX"):
v = os.environ.get(k, "").strip() v = os.environ.get(k, "").strip()
@ -66,7 +38,6 @@ def _env_prefix() -> Optional[str]:
return v return v
return None return None
def _load_types_yaml(path: Path) -> Dict: def _load_types_yaml(path: Path) -> Dict:
if not path.exists(): if not path.exists():
print(json.dumps({"warn": f"types.yaml fehlt: {path}. Fallback: keine Vorgaben."})) print(json.dumps({"warn": f"types.yaml fehlt: {path}. Fallback: keine Vorgaben."}))
@ -74,92 +45,65 @@ def _load_types_yaml(path: Path) -> Dict:
try: try:
with path.open("r", encoding="utf-8") as f: with path.open("r", encoding="utf-8") as f:
data = yaml.safe_load(f) or {} data = yaml.safe_load(f) or {}
# erwartet: {"version": "1.0", "types": {type_name: {"edge_defaults":[...]}}}
return data if isinstance(data, dict) else {} return data if isinstance(data, dict) else {}
except Exception as e: except Exception as e:
print(json.dumps({"warn": f"types.yaml defekt ({path}): {e}. Fallback: keine Vorgaben."})) print(json.dumps({"warn": f"types.yaml defekt ({path}): {e}. Fallback: keine Vorgaben."}))
return {} return {}
def _cfg_from_env(argv_prefix: Optional[str]) -> Cfg:
    """Build the run configuration from the CLI prefix and the environment.

    Reads QDRANT_URL, QDRANT_API_KEY and TYPES_FILE from the environment.
    The collection prefix is taken from ``argv_prefix`` first, then from
    ``_env_prefix()``, and finally defaults to ``"mindnet"``.

    Args:
        argv_prefix: Prefix passed on the command line, or None.

    Returns:
        A Cfg holding the URL, the optional API key, the prefix, the derived
        ``{prefix}_notes`` / ``{prefix}_edges`` collection names and the
        resolved path of the types file.
    """
    # Empty or whitespace-only values are treated as "not set" so the
    # documented defaults apply instead of an unusable empty string.
    url = os.environ.get("QDRANT_URL", "").strip() or "http://127.0.0.1:6333"
    api_key = os.environ.get("QDRANT_API_KEY", "").strip() or None

    # Strip the CLI value before the fallback chain: a blank prefix must fall
    # through to the environment/default rather than produce "_notes".
    prefix = (argv_prefix or "").strip() or _env_prefix() or "mindnet"

    types_file = os.environ.get("TYPES_FILE", "").strip() or "config/types.yaml"
    return Cfg(
        url=url,
        api_key=api_key,
        prefix=prefix,
        notes=f"{prefix}_notes",
        edges=f"{prefix}_edges",
        types_file=Path(types_file).resolve(),
    )
def _mk_client(cfg: Cfg) -> QdrantClient:
    """Create a Qdrant client for the URL and API key in ``cfg`` (30 s timeout)."""
    return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=30.0)
def _scroll_all_notes(client: "QdrantClient", notes_col: str) -> Dict[str, Dict]:
    """Scroll through the whole notes collection and index payloads by note id.

    Args:
        client: Qdrant client used for the scroll requests.
        notes_col: Name of the notes collection.

    Returns:
        Mapping ``note_id -> payload``. The id is taken from the first
        non-empty payload field among ``note_id``, ``id``, ``uid`` and
        ``slug``; if none is set, the point id is used instead.
    """
    out: Dict[str, Dict] = {}
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=notes_col,
            scroll_filter=None,
            limit=256,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for p in points:
            payload = p.payload or {}
            nid = (
                payload.get("note_id")
                or payload.get("id")
                or payload.get("uid")
                or payload.get("slug")
                or str(p.id)  # fall back to the point id when the payload has none
            )
            out[str(nid)] = payload
        # A None next-page offset marks the end of the collection. Feeding it
        # back into scroll() would start again at the first page, so stop here.
        if offset is None or not points:
            break
    return out
def _scroll_all_edges(client: "QdrantClient", edges_col: str) -> List[Tuple[str, dict]]:
    """Scroll through the whole edges collection.

    Args:
        client: Qdrant client used for the scroll requests.
        edges_col: Name of the edges collection.

    Returns:
        One ``(point_id_as_str, payload)`` tuple per edge point, in scroll
        order. A missing payload is returned as an empty dict.
    """
    out: List[Tuple[str, dict]] = []
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=edges_col,
            scroll_filter=None,
            limit=256,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        out.extend((str(p.id), p.payload or {}) for p in points)
        # A None next-page offset marks the end of the collection. Feeding it
        # back into scroll() would start again at the first page and append
        # the same edges forever, so stop here.
        if offset is None or not points:
            break
    return out
def _guess_src_note_id(ed_pl: dict) -> Optional[str]: def _guess_src_note_id(ed_pl: dict) -> Optional[str]:
"""
Versucht, die Quell-Note-ID aus der Edge-Payload zu lesen.
Unterstützt mehrere mögliche Feldnamen/Schemata.
"""
# gängigste Varianten
for k in ("src_note_id", "note_id", "src_id", "src"): for k in ("src_note_id", "note_id", "src_id", "src"):
nid = ed_pl.get(k) nid = ed_pl.get(k)
if nid: if nid:
return str(nid) return str(nid)
# manchmal liegt sie in 'src_ref' oder 'from'
for k in ("src_ref", "from"): for k in ("src_ref", "from"):
nid = ed_pl.get(k) nid = ed_pl.get(k)
if isinstance(nid, dict): if isinstance(nid, dict):
# z.B. {"kind":"note","id":"..."} oder {"note_id":"..."}
for kk in ("note_id", "id"): for kk in ("note_id", "id"):
if nid.get(kk): if nid.get(kk):
return str(nid[kk]) return str(nid[kk])
@ -167,13 +111,16 @@ def _guess_src_note_id(ed_pl: dict) -> Optional[str]:
return nid return nid
return None return None
def _edge_type(ed_pl: dict) -> Optional[str]:
    """Return the edge type stored in an edge payload.

    The keys ``edge_type``, ``type``, ``rel`` and ``relation`` are checked in
    that order; the first truthy value wins and is returned as a string.

    Args:
        ed_pl: Payload dict of an edge point.

    Returns:
        The edge type as ``str``, or None if none of the keys holds a value.
    """
    for key in ("edge_type", "type", "rel", "relation"):
        value = ed_pl.get(key)
        if value:
            return str(value)
    return None
def _cfg_to_jsonable(cfg: "Cfg") -> dict:
    """Convert a config dataclass into a dict that ``json.dumps`` accepts.

    ``asdict`` keeps ``types_file`` as a ``Path``, which the json module
    cannot serialize, so that entry is replaced by its string form.

    Args:
        cfg: The configuration dataclass instance.

    Returns:
        A new dict with the dataclass fields; ``types_file`` is a ``str``.
    """
    d = asdict(cfg)
    d["types_file"] = str(d.get("types_file"))
    return d
def main(argv=None): def main(argv=None):
import argparse import argparse
@ -182,59 +129,47 @@ def main(argv=None):
args = ap.parse_args(argv) args = ap.parse_args(argv)
cfg = _cfg_from_env(args.prefix) cfg = _cfg_from_env(args.prefix)
print(json.dumps({"cfg": cfg.__dict__}, ensure_ascii=False)) print(json.dumps({"cfg": _cfg_to_jsonable(cfg)}, ensure_ascii=False))
# Type-Registry laden
tr = _load_types_yaml(cfg.types_file) tr = _load_types_yaml(cfg.types_file)
types_def = (tr.get("types") if isinstance(tr, dict) else {}) or {} types_def = (tr.get("types") if isinstance(tr, dict) else {}) or {}
print(json.dumps({"types_defined": list(types_def.keys())}, ensure_ascii=False)) print(json.dumps({"types_defined": list(types_def.keys())}, ensure_ascii=False))
client = _mk_client(cfg) client = _mk_client(cfg)
# alle Notes (id -> type) aufbauen
notes = _scroll_all_notes(client, cfg.notes) notes = _scroll_all_notes(client, cfg.notes)
print(json.dumps({"notes_count": len(notes)}, ensure_ascii=False)) print(json.dumps({"notes_count": len(notes)}, ensure_ascii=False))
# alle Edges lesen
edges = _scroll_all_edges(client, cfg.edges) edges = _scroll_all_edges(client, cfg.edges)
print(json.dumps({"edges_count": len(edges)}, ensure_ascii=False)) print(json.dumps({"edges_count": len(edges)}, ensure_ascii=False))
# Map: note_id -> type note_type = {}
note_type: Dict[str, str] = {}
for nid, pl in notes.items(): for nid, pl in notes.items():
t = pl.get("type") or "concept" t = pl.get("type") or "concept"
note_type[str(nid)] = str(t) note_type[str(nid)] = str(t)
# Beobachtete Kanten je Note-Typ sammeln seen = {}
seen: Dict[str, Set[str]] = {} counts = {}
# auch Zähler
counts: Dict[str, Dict[str, int]] = {}
for edge_pid, ed_pl in edges: for edge_pid, ed_pl in edges:
et = _edge_type(ed_pl) et = _edge_type(ed_pl)
if not et: if not et:
# nicht auswertbar
continue continue
src_nid = _guess_src_note_id(ed_pl) src_nid = _guess_src_note_id(ed_pl)
if not src_nid: if not src_nid:
# evtl. chunk->note edges, die nicht auf Note verweisen. überspringen
continue continue
t = note_type.get(src_nid) t = note_type.get(src_nid)
if not t: if not t:
# Quelle unbekannt (z.B. Note nicht (mehr) vorhanden)
continue continue
seen.setdefault(t, set()).add(et) seen.setdefault(t, set()).add(et)
counts.setdefault(t, {}).setdefault(et, 0) counts.setdefault(t, {}).setdefault(et, 0)
counts[t][et] += 1 counts[t][et] += 1
# Erwartete Defaults je Typ aus Registry expected = {}
expected: Dict[str, Set[str]] = {}
for tname, tdef in types_def.items(): for tname, tdef in types_def.items():
eddefs = (tdef or {}).get("edge_defaults") or [] eddefs = (tdef or {}).get("edge_defaults") or []
expected[tname] = set([str(x) for x in eddefs if x]) expected[tname] = set([str(x) for x in eddefs if x])
# Report
for tname, exp in expected.items(): for tname, exp in expected.items():
obs = seen.get(tname, set()) obs = seen.get(tname, set())
missing = sorted(list(exp - obs)) missing = sorted(list(exp - obs))
@ -248,7 +183,6 @@ def main(argv=None):
"counts": counts.get(tname, {}), "counts": counts.get(tname, {}),
}, ensure_ascii=False)) }, ensure_ascii=False))
# Hinweis, wenn keine Typen konfiguriert
if not expected: if not expected:
print(json.dumps({"warn": "Keine Typ-Defaults in types.yaml gefunden (edge_defaults leer?)."})) print(json.dumps({"warn": "Keine Typ-Defaults in types.yaml gefunden (edge_defaults leer?)."}))