mindnet/scripts/audit_edges_vs_expectations.py
Lars e9532e8878
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
script_Überprüfung und Kommentarheader
2025-12-28 10:40:28 +01:00

241 lines
7.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/audit_edges_vs_expectations.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
Zweck:
-------
Prüft Edge-Anzahlen in Qdrant gegen erwartete Werte aus dem Vault.
Validiert strukturelle Integrität des Graphen.
Funktionsweise:
---------------
1. Liest Edges/Chunks/Notes aus Qdrant
2. Ermittelt erwartete Edge-Anzahlen aus Vault:
- belongs_to: sollte == #Chunks pro Note
- next/prev: je Note (#Chunks - 1)
- references: Summe aller Chunk-Wikilinks
- backlink: Summe einzigartiger Wikilinks (Note-Level)
3. Vergleicht IST vs. SOLL
4. Meldet Abweichungen
Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON mit Vergleichs-Ergebnissen
* expected: Erwartete Edge-Anzahlen
* actual: Tatsächliche Edge-Anzahlen
* discrepancies: Abweichungen
* per_note: Details pro Note (mit --details)
- Exit-Code 0: Erfolgreich
Verwendung:
-----------
- Validierung nach Importen
- Debugging von Edge-Problemen
- Qualitätskontrolle
Hinweise:
---------
- Prüft strukturelle, nicht semantische Korrektheit
- Kann bei großen Vaults langsam sein
Aufruf:
-------
python3 -m scripts.audit_edges_vs_expectations --vault ./vault
python3 -m scripts.audit_edges_vs_expectations --vault ./vault --prefix mindnet_dev --details
Parameter:
----------
--vault PATH Pfad zum Vault-Verzeichnis (erforderlich)
--prefix TEXT Collection-Präfix (Default: mindnet)
--details Zeigt Details pro Note
Umgebungsvariablen:
-------------------
QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX
Änderungen:
-----------
v2.1.0 (2025-12-15): Dokumentation aktualisiert
v1.0.0 (2025-09-09): Initial Release
"""
from __future__ import annotations
import argparse
import json
import os
import re
from collections import defaultdict, Counter
from typing import Dict, List, Tuple
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from dotenv import load_dotenv

# Project modules: import only lightweight functions here.
try:
    from app.core.parser import read_markdown
except Exception:
    # Very simple fallback for wikilinks: if the project parser is not
    # importable, _wikilinks_per_note() scans raw file text instead.
    read_markdown = None

# Matches Obsidian-style wikilinks, e.g. [[Note Name]] or [[target|alias]]
# (captures everything between the double brackets, alias included).
WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# ------------------------------
# Qdrant Helpers
# ------------------------------
def _names(prefix: str) -> Tuple[str, str, str]:
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
def _scroll_all(client: QdrantClient, col: str, flt=None, with_payload=True, with_vectors=False, limit=256):
out, next_page = [], None
while True:
pts, next_page = client.scroll(
collection_name=col,
scroll_filter=flt,
with_payload=with_payload,
with_vectors=with_vectors,
limit=limit,
offset=next_page,
)
if not pts:
break
out.extend(pts)
if not next_page:
break
return out
# ------------------------------
# Vault scan
# ------------------------------
def _iter_md(root: str) -> List[str]:
out: List[str] = []
for dp, _, fns in os.walk(root):
for fn in fns:
if fn.lower().endswith(".md"):
p = os.path.join(dp, fn)
if "/.obsidian/" in p.replace("\\", "/"):
continue
out.append(p)
return sorted(out)
def _wikilinks_in_text(text: str) -> List[str]:
    """Extract every wikilink target from *text* (None/empty input yields [])."""
    source = text if text else ""
    return WIKILINK_RE.findall(source)
def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]:
    """Map note id -> list of wikilink targets found in that note's body.

    Prefers the project parser (``read_markdown``) when it was importable;
    otherwise falls back to a crude YAML-frontmatter strip plus regex scan.
    Notes that fail to read/parse are silently skipped — this is a
    best-effort input for the audit, not a hard validation.
    """
    res: Dict[str, List[str]] = {}
    for p in _iter_md(vault_root):
        body = ""
        try:
            if read_markdown:
                parsed = read_markdown(p)
                body = parsed.body or ""
                fm = parsed.frontmatter or {}
                # Note id: explicit frontmatter id/note_id wins, else filename stem.
                nid = fm.get("id") or fm.get("note_id") or os.path.splitext(os.path.basename(p))[0]
            else:
                with open(p, "r", encoding="utf-8") as f:
                    txt = f.read()
                # Very simple split: drop the YAML frontmatter block.
                if txt.lstrip().startswith("---"):
                    parts = txt.split("\n---", 1)
                    # Keep everything after the closing '---'; if no closing
                    # delimiter exists, keep the whole file (frontmatter included).
                    body = parts[1] if len(parts) > 1 else txt
                else:
                    body = txt
                nid = os.path.splitext(os.path.basename(p))[0]
            res[nid] = _wikilinks_in_text(body)
        except Exception:
            # Best-effort: unreadable or unparsable notes are skipped entirely.
            continue
    return res
# ------------------------------
# Main Audit
# ------------------------------
def main() -> None:
    """CLI entry point: compare actual Qdrant edge counts against the
    counts expected from the vault and print the comparison as JSON.

    Exit code is always 0; deviations are reported in the "deltas" section
    of the JSON output (actual minus expected, per edge kind).
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)")
    ap.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix")
    ap.add_argument("--details", action="store_true", help="Detail-Listen ausgeben")
    args = ap.parse_args()
    client = QdrantClient(url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"),
                          api_key=os.environ.get("QDRANT_API_KEY") or None)
    notes_col, chunks_col, edges_col = _names(args.prefix)
    # Load everything from Qdrant (payloads only; vectors are not needed).
    notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False)
    chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False)
    edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False)
    # --- Actual counters (what is currently stored in Qdrant)
    cnt_kind = Counter()
    cnt_scope = Counter()
    by_note_chunks: Dict[str, int] = defaultdict(int)
    chunk_wikilinks_total = 0
    for p in chunks:
        pl = p.payload or {}
        # NOTE(review): chunks whose payload lacks "note_id" all accumulate
        # under the None key — confirm that importers always set note_id.
        by_note_chunks[pl.get("note_id")] += 1
        wl = pl.get("wikilinks") or []
        if isinstance(wl, list):
            chunk_wikilinks_total += len(wl)
    for p in edges:
        pl = p.payload or {}
        # Edge payloads may carry the type under "kind" or legacy "edge_type".
        kind = pl.get("kind") or pl.get("edge_type") or "?"
        scope = pl.get("scope") or "?"
        cnt_kind[kind] += 1
        cnt_scope[f"{kind}:{scope}"] += 1
    total_chunks = sum(by_note_chunks.values())
    # --- Expected counters derived from the vault
    wl_per_note = _wikilinks_per_note(args.vault)
    # Backlinks are note-level: duplicate links within one note count once.
    backlink_expected = sum(len(set(v)) for v in wl_per_note.values())
    # A chain of c chunks has c-1 next edges (and symmetric prev edges).
    next_expected = sum(max(c - 1, 0) for c in by_note_chunks.values())
    prev_expected = next_expected  # symmetric edges
    belongs_to_expected = total_chunks
    references_expected = chunk_wikilinks_total  # from chunk payloads
    # --- Result (deltas = actual - expected; 0 means the counts match)
    result = {
        "qdrant_counts": dict(cnt_kind),
        "qdrant_counts_by_scope": dict(cnt_scope),
        "chunks_total": total_chunks,
        "by_note_chunks": dict(by_note_chunks),
        "vault_expected": {
            "belongs_to": belongs_to_expected,
            "next": next_expected,
            "prev": prev_expected,
            "references": references_expected,
            "backlink": backlink_expected,
        },
        "deltas": {
            "belongs_to": cnt_kind.get("belongs_to", 0) - belongs_to_expected,
            "next": cnt_kind.get("next", 0) - next_expected,
            "prev": cnt_kind.get("prev", 0) - prev_expected,
            "references": cnt_kind.get("references", 0) - references_expected,
            "backlink": cnt_kind.get("backlink", 0) - backlink_expected,
        },
        "collections": {
            "notes": notes_col, "chunks": chunks_col, "edges": edges_col
        }
    }
    print(json.dumps(result, ensure_ascii=False, indent=2))
    if args.details:
        # Optional spot checks (e.g. listing broken edge kinds) — placeholder,
        # not yet implemented.
        pass
if __name__ == "__main__":
    main()