scripts/audit_edges_vs_expectations.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s

This commit is contained in:
Lars 2025-09-09 11:34:08 +02:00
parent 305089fcf6
commit e040fcc0dd

View File

@ -0,0 +1,208 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: audit_edges_vs_expectations.py — audit the edges in Qdrant against the vault expectations
Version: 1.0.0
Date: 2025-09-09

Purpose
-------
- Reads edges/chunks/notes from Qdrant.
- Derives the expected edge counts from the vault:
    * belongs_to  : should be == #chunks
    * next / prev : per note (#chunks_in_note - 1)
    * references  : sum of all chunk wikilinks
    * backlink    : sum of unique wikilinks per note (note level)
- Compares ACTUAL vs. EXPECTED and reports the deviations.

ENV/Qdrant
----------
QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX (default: mindnet)

Usage
-----
    # Full audit
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault
    # With a different prefix
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --prefix mindnet_dev
    # Show details
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --details
"""
from __future__ import annotations
import argparse
import json
import os
import re
from collections import defaultdict, Counter
from typing import Dict, List, Tuple
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from dotenv import load_dotenv
# Project modules: only the lightweight Markdown reader is pulled in, and it
# is optional so the audit can run without the application package.
try:
    from app.core.parser import read_markdown
except Exception:
    # Reader unavailable: callers check for None and fall back to a very
    # simple frontmatter split of their own.
    read_markdown = None

# Captures the raw inner text of a [[...]] wikilink, i.e. everything up to
# the first closing bracket (alias or heading suffixes are not split off).
WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# ------------------------------
# Qdrant Helpers
# ------------------------------
def _names(prefix: str) -> Tuple[str, str, str]:
    """Build the collection names that share ``prefix``.

    Args:
        prefix: Common collection prefix.

    Returns:
        The ``(notes, chunks, edges)`` collection names, in that order.
    """
    notes_col = f"{prefix}_notes"
    chunks_col = f"{prefix}_chunks"
    edges_col = f"{prefix}_edges"
    return notes_col, chunks_col, edges_col
def _scroll_all(client: "QdrantClient", col: str, flt=None, with_payload=True, with_vectors=False, limit=256):
    """Fetch every point of a collection by paging through ``client.scroll``.

    Args:
        client: Object whose ``scroll`` method returns a
            ``(points, next_page_offset)`` pair.
        col: Name of the collection to read.
        flt: Optional filter, passed through as ``scroll_filter``.
        with_payload: Forwarded unchanged to ``scroll``.
        with_vectors: Forwarded unchanged to ``scroll``.
        limit: Page size requested per ``scroll`` call.

    Returns:
        A list holding the points of all pages in the order they arrived.
    """
    out = []
    next_page = None
    while True:
        pts, next_page = client.scroll(
            collection_name=col,
            scroll_filter=flt,
            with_payload=with_payload,
            with_vectors=with_vectors,
            limit=limit,
            offset=next_page,
        )
        if not pts:
            break
        out.extend(pts)
        # Only ``None`` signals the last page. Testing truthiness instead
        # would also stop on a falsy but real offset such as integer 0 and
        # silently truncate the result.
        if next_page is None:
            break
    return out
# ------------------------------
# Vault scan
# ------------------------------
def _iter_md(root: str) -> List[str]:
    """List all Markdown files below ``root`` in sorted order.

    A file counts as Markdown when its name ends in ``.md`` (compared
    case-insensitively). Paths that contain a ``/.obsidian/`` segment are
    left out; backslashes are treated as separators for that check.

    Args:
        root: Directory to walk recursively.

    Returns:
        The sorted list of matching file paths.
    """
    candidates = (
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(root)
        for name in names
        if name.lower().endswith(".md")
    )
    return sorted(
        path for path in candidates
        if "/.obsidian/" not in path.replace("\\", "/")
    )
def _wikilinks_in_text(text: str) -> List[str]:
    """Return what ``WIKILINK_RE`` captures for each wikilink in ``text``.

    Args:
        text: Text to scan; a falsy value is treated as the empty string.

    Returns:
        The captured link texts in order of appearance, duplicates included.
    """
    haystack = text if text else ""
    return WIKILINK_RE.findall(haystack)
def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]:
    """Collect the wikilinks of every Markdown note below ``vault_root``.

    When the project parser ``read_markdown`` is available, the note id is
    taken from the frontmatter keys ``id`` or ``note_id`` and the links are
    read from the parsed body. Otherwise the file is read directly, a
    leading ``---`` frontmatter section is cut off with a very simple split,
    and the file name without extension serves as the note id.

    Notes that cannot be read or parsed are skipped (best effort), but a
    warning is written to stderr because every skipped note lowers the
    expected counts derived from this mapping.

    Args:
        vault_root: Root directory of the vault.

    Returns:
        Mapping of note id to the list of wikilink texts found in the note
        body. If two files resolve to the same id, the later one wins.
    """
    import sys  # local import: only needed for the skip warning

    res: Dict[str, List[str]] = {}
    for p in _iter_md(vault_root):
        stem = os.path.splitext(os.path.basename(p))[0]
        try:
            if read_markdown:
                parsed = read_markdown(p)
                body = parsed.body or ""
                fm = parsed.frontmatter or {}
                nid = fm.get("id") or fm.get("note_id") or stem
            else:
                with open(p, "r", encoding="utf-8") as f:
                    txt = f.read()
                # Detect AND split on the left-stripped text. Splitting the
                # raw text would, for files with leading blank lines, cut at
                # the opening fence and keep the frontmatter in the body.
                stripped = txt.lstrip()
                if stripped.startswith("---"):
                    parts = stripped.split("\n---", 1)
                    # No closing fence found: treat the whole file as body.
                    body = parts[1] if len(parts) > 1 else txt
                else:
                    body = txt
                nid = stem
            res[nid] = _wikilinks_in_text(body)
        except Exception as exc:
            # Stay best-effort, but do not hide that a note was dropped.
            print(f"WARN: skipping note {p}: {exc!r}", file=sys.stderr)
            continue
    return res
# ------------------------------
# Main Audit
# ------------------------------
def main():
    """Audit the edge counts stored in Qdrant and print the result as JSON.

    Parses ``--vault``, ``--prefix`` and ``--details``, loads all notes,
    chunks and edges of the prefixed collections, counts the edges per kind
    and per ``kind:scope``, derives the expected counts and prints actual
    counts, expected counts and their deltas (actual minus expected) to
    stdout.

    Exits through ``argparse`` (status 2) when ``--vault`` is not an
    existing directory.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)")
    ap.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix")
    ap.add_argument("--details", action="store_true", help="Detail-Listen ausgeben")
    args = ap.parse_args()

    # A wrong path would make the vault scan find no files at all and report
    # a backlink expectation of 0 as if it were a real value, so fail early.
    if not os.path.isdir(args.vault):
        ap.error(f"Vault-Verzeichnis nicht gefunden: {args.vault}")

    client = QdrantClient(
        url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.environ.get("QDRANT_API_KEY") or None,
    )
    notes_col, chunks_col, edges_col = _names(args.prefix)

    # Load everything from Qdrant.
    # NOTE(review): `notes` is fetched but not used below; presumably kept
    # for a future --details output. Confirm or drop the extra scroll.
    notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False)
    chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False)
    edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False)

    # --- Actual counters
    cnt_kind = Counter()
    cnt_scope = Counter()
    by_note_chunks: Dict[str, int] = defaultdict(int)
    chunk_wikilinks_total = 0
    for p in chunks:
        pl = p.payload or {}
        # Chunks without a note_id are grouped under the key None.
        by_note_chunks[pl.get("note_id")] += 1
        wl = pl.get("wikilinks") or []
        if isinstance(wl, list):
            chunk_wikilinks_total += len(wl)
    for p in edges:
        pl = p.payload or {}
        kind = pl.get("kind") or pl.get("edge_type") or "?"
        scope = pl.get("scope") or "?"
        cnt_kind[kind] += 1
        cnt_scope[f"{kind}:{scope}"] += 1
    total_chunks = sum(by_note_chunks.values())

    # --- Expected counters
    wl_per_note = _wikilinks_per_note(args.vault)
    backlink_expected = sum(len(set(v)) for v in wl_per_note.values())
    next_expected = sum(max(c - 1, 0) for c in by_note_chunks.values())
    prev_expected = next_expected  # symmetric edges
    belongs_to_expected = total_chunks
    # Taken from the chunk payloads in Qdrant, not from the vault files.
    references_expected = chunk_wikilinks_total

    # --- Result
    result = {
        "qdrant_counts": dict(cnt_kind),
        "qdrant_counts_by_scope": dict(cnt_scope),
        "chunks_total": total_chunks,
        "by_note_chunks": dict(by_note_chunks),
        "vault_expected": {
            "belongs_to": belongs_to_expected,
            "next": next_expected,
            "prev": prev_expected,
            "references": references_expected,
            "backlink": backlink_expected,
        },
        "deltas": {
            "belongs_to": cnt_kind.get("belongs_to", 0) - belongs_to_expected,
            "next": cnt_kind.get("next", 0) - next_expected,
            "prev": cnt_kind.get("prev", 0) - prev_expected,
            "references": cnt_kind.get("references", 0) - references_expected,
            "backlink": cnt_kind.get("backlink", 0) - backlink_expected,
        },
        "collections": {
            "notes": notes_col, "chunks": chunks_col, "edges": edges_col
        },
    }
    print(json.dumps(result, ensure_ascii=False, indent=2))

    if args.details:
        # TODO: optional samples (e.g. faulty edge kinds); the flag is
        # accepted but currently produces no additional output.
        pass
if __name__ == "__main__":
main()