mindnet/scripts/audit_edges_vs_expectations.py
Lars e040fcc0dd
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
scripts/audit_edges_vs_expectations.py hinzugefügt
2025-09-09 11:34:08 +02:00

209 lines
6.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: audit_edges_vs_expectations.py — check edges in Qdrant against vault expectations
Version: 1.0.0
Date: 2025-09-09

Purpose
-------
- Reads edges/chunks/notes from Qdrant.
- Derives the expected edge counts from the vault:
    * belongs_to  : should be == #chunks
    * next / prev : per note (#chunks_in_note - 1)
    * references  : sum of all chunk wikilinks
    * backlink    : sum of unique wikilinks per note (note level)
- Compares ACTUAL vs. EXPECTED and reports deviations.

ENV/Qdrant
----------
QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX (default: mindnet)

Usage
-----
    # Full audit
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault
    # With a different prefix
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --prefix mindnet_dev
    # Show details
    python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --details
"""
from __future__ import annotations
import argparse
import json
import os
import re
from collections import defaultdict, Counter
from typing import Dict, List, Tuple
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from dotenv import load_dotenv
# Pattern for Obsidian-style wikilinks; the single capture group is everything
# between "[[" and "]]".
WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")

# Project modules: only the lightweight parser function is pulled in.
# If the import fails for any reason, ``read_markdown`` is set to None and the
# script falls back to a very simple file-based wikilink scan.
try:
    from app.core.parser import read_markdown
except Exception:
    read_markdown = None
# ------------------------------
# Qdrant Helpers
# ------------------------------
def _names(prefix: str) -> Tuple[str, str, str]:
    """Return the collection names derived from *prefix*.

    The result is the tuple ``(notes, chunks, edges)``, each name being the
    prefix followed by an underscore and the collection kind.
    """
    notes_name = f"{prefix}_notes"
    chunks_name = f"{prefix}_chunks"
    edges_name = f"{prefix}_edges"
    return notes_name, chunks_name, edges_name
def _scroll_all(client: QdrantClient, col: str, flt=None, with_payload=True, with_vectors=False, limit=256):
    """Fetch every point of a collection by paging through ``client.scroll``.

    Args:
        client: Object exposing ``scroll(...)`` that returns a
            ``(points, next_offset)`` pair.
        col: Name of the collection to read.
        flt: Optional filter forwarded as ``scroll_filter``.
        with_payload: Forwarded to ``scroll``.
        with_vectors: Forwarded to ``scroll``.
        limit: Page size forwarded to ``scroll``.

    Returns:
        A list with the points of all pages, in the order they were returned.
    """
    collected = []
    cursor = None
    has_more = True
    while has_more:
        batch, cursor = client.scroll(
            collection_name=col,
            scroll_filter=flt,
            with_payload=with_payload,
            with_vectors=with_vectors,
            limit=limit,
            offset=cursor,
        )
        if batch:
            collected.extend(batch)
        # Stop on an empty page or when no follow-up offset was handed back.
        has_more = bool(batch) and bool(cursor)
    return collected
# ------------------------------
# Vault scan
# ------------------------------
def _iter_md(root: str) -> List[str]:
    """Return the sorted paths of all Markdown files below *root*.

    A file counts as Markdown when its name ends with ``.md`` (case
    insensitive). Paths that contain a ``/.obsidian/`` segment (after
    normalising backslashes to forward slashes) are left out.
    """
    markdown_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.lower().endswith(".md")
    )
    kept = [
        path
        for path in markdown_paths
        if "/.obsidian/" not in path.replace("\\", "/")
    ]
    kept.sort()
    return kept
def _wikilinks_in_text(text: str) -> List[str]:
    """Return every ``WIKILINK_RE`` match found in *text*.

    A falsy *text* (for example ``None`` or an empty string) is treated as an
    empty string, so the result is an empty list in that case. Matches are
    returned in order of appearance and duplicates are kept.
    """
    haystack = text if text else ""
    return WIKILINK_RE.findall(haystack)
def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]:
    """Collect the wikilinks found in the body of every Markdown note.

    When the project parser ``read_markdown`` is available it supplies the
    body and frontmatter of each note. Otherwise the file is read directly
    and a leading YAML frontmatter block is removed with a simple split.

    Args:
        vault_root: Directory that is scanned with ``_iter_md``.

    Returns:
        Mapping of note id to the list of wikilinks in that note's body
        (duplicates kept). With the parser, the note id is the frontmatter
        ``id`` or ``note_id``, falling back to the file name without its
        extension; without the parser it is always the file name without its
        extension. If two files yield the same id, the later one overwrites
        the earlier entry.

    Notes that cannot be read or parsed are still skipped (best effort), but
    a warning is written to stderr so a skewed expectation can be traced.
    """
    import sys  # local import: only needed for the warning channel

    res: Dict[str, List[str]] = {}
    for p in _iter_md(vault_root):
        stem = os.path.splitext(os.path.basename(p))[0]
        try:
            if read_markdown:
                parsed = read_markdown(p)
                body = parsed.body or ""
                fm = parsed.frontmatter or {}
                nid = fm.get("id") or fm.get("note_id") or stem
            else:
                with open(p, "r", encoding="utf-8") as f:
                    txt = f.read()
                nid = stem
                body = txt
                # Very simple split: drop the YAML frontmatter. The search for
                # the closing delimiter starts after the opening "---" so that
                # leading whitespace before the frontmatter cannot make the
                # opening delimiter be mistaken for the closing one.
                stripped = txt.lstrip()
                if stripped.startswith("---"):
                    _, closing, rest = stripped[3:].partition("\n---")
                    if closing:
                        body = rest
            res[nid] = _wikilinks_in_text(body)
        except Exception as exc:
            # Best effort: keep auditing, but do not hide the skipped note.
            print(f"WARN: skipping note {p!r}: {exc}", file=sys.stderr)
            continue
    return res
# ------------------------------
# Main Audit
# ------------------------------
def main():
    """Run the audit and print the comparison as JSON on stdout.

    Steps:
      1. Load the environment (``load_dotenv``) and parse the CLI arguments.
      2. Read all notes, chunks and edges from the Qdrant collections that
         belong to the chosen prefix.
      3. Count the stored edges per kind and per ``kind:scope``.
      4. Derive the expected counts: ``belongs_to``, ``next``/``prev`` and
         ``references`` from the chunk payloads, ``backlink`` from the
         wikilinks found in the vault.
      5. Print actual counts, expected counts and their differences.

    The ``--details`` flag is accepted but currently produces no extra output.
    """
    load_dotenv()

    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)")
    parser.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix")
    parser.add_argument("--details", action="store_true", help="Detail-Listen ausgeben")
    args = parser.parse_args()

    client = QdrantClient(
        url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.environ.get("QDRANT_API_KEY") or None,
    )
    notes_col, chunks_col, edges_col = _names(args.prefix)

    # Load everything from Qdrant. The notes are fetched as well, although
    # their content is not evaluated further in this function.
    notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False)
    chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False)
    edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False)

    # --- Actual counters derived from the chunk payloads
    chunk_payloads = [point.payload or {} for point in chunks]
    by_note_chunks = Counter(payload.get("note_id") for payload in chunk_payloads)
    # Only list-typed "wikilinks" payload entries contribute to the total.
    chunk_wikilinks_total = sum(
        len(links)
        for links in (payload.get("wikilinks") for payload in chunk_payloads)
        if isinstance(links, list)
    )

    # --- Actual counters derived from the edge payloads
    cnt_kind = Counter()
    cnt_scope = Counter()
    for point in edges:
        payload = point.payload or {}
        # "kind" wins over "edge_type"; "?" marks a missing value.
        kind = payload.get("kind") or payload.get("edge_type") or "?"
        scope = payload.get("scope") or "?"
        cnt_kind[kind] += 1
        cnt_scope[f"{kind}:{scope}"] += 1

    total_chunks = sum(by_note_chunks.values())

    # --- Expected counters
    wl_per_note = _wikilinks_per_note(args.vault)
    # next and prev are symmetric: one edge per adjacent chunk pair of a note.
    next_expected = sum(max(count - 1, 0) for count in by_note_chunks.values())
    expected = {
        "belongs_to": total_chunks,
        "next": next_expected,
        "prev": next_expected,
        "references": chunk_wikilinks_total,  # taken from the chunk payloads
        "backlink": sum(len(set(links)) for links in wl_per_note.values()),
    }
    deltas = {
        kind: cnt_kind.get(kind, 0) - wanted
        for kind, wanted in expected.items()
    }

    # --- Result
    result = {
        "qdrant_counts": dict(cnt_kind),
        "qdrant_counts_by_scope": dict(cnt_scope),
        "chunks_total": total_chunks,
        "by_note_chunks": dict(by_note_chunks),
        "vault_expected": expected,
        "deltas": deltas,
        "collections": {
            "notes": notes_col,
            "chunks": chunks_col,
            "edges": edges_col,
        },
    }
    print(json.dumps(result, ensure_ascii=False, indent=2))

    if args.details:
        # Placeholder for optional samples (e.g. faulty edge kinds); no
        # detail output is implemented yet.
        pass


if __name__ == "__main__":
    main()