#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/audit_vault_vs_qdrant.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
Zweck:
-------
Prüft Konsistenz zwischen Vault und Qdrant.
Vergleicht erwartete vs. tatsächliche Edge-Anzahlen.
Funktionsweise:
---------------
1. Scannt Vault:
- Zählt Markdown-Dateien mit gültiger Frontmatter
- Zählt Wikilink-Vorkommen (regex wie in derive_edges.py)
2. Liest Qdrant:
- Zählt Notes/Chunks/Edges
- Gruppiert Edges nach kind
3. Vergleicht:
- Erwartete Wikilink-Anzahl (Vault) vs. references (Qdrant)
- Listet Auffälligkeiten pro Note
Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON mit Vergleichs-Ergebnissen
* vault_stats: Zählungen aus Vault
* qdrant_stats: Zählungen aus Qdrant
* discrepancies: Abweichungen und Auffälligkeiten
- Exit-Code 0: Erfolgreich
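
Example output (shape only; values illustrative):
    {
      "vault": {"notes_with_required_frontmatter": 120,
                "wikilink_occurrences_total": 480},
      "qdrant": {"collections": {...}, "counts": {...},
                 "edge_kinds": {...}, "unresolved_by_kind": {...}},
      "mismatch_notes": [{"note_id": "...", "wikilinks_in_vault": 4,
                          "references_in_qdrant": 2, "delta": -2}],
      "mismatch_total": 1
    }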

Usage:
------
- Consistency check after imports
- Validating edge derivation
- Debugging missing links

Notes:
------
- The wikilink regex matches the one in derive_edges.py
- Checks structural, not semantic, correctness
- The Qdrant connection is configured via the QDRANT_URL and
  QDRANT_API_KEY environment variables (see qdrant_client_from_env)

Invocation:
-----------
python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet

Parameters:
-----------
--vault PATH   Path to the vault directory (required)
--prefix TEXT  Collection prefix (default: mindnet)
--limit INT    Max. points per scroll page (default: 1000)

Changelog:
----------
v2.1.0 (2025-12-15): documentation updated
v1.0.0 (2025-09-05): initial release
"""
from __future__ import annotations

import argparse, os, glob, re, json
from collections import Counter, defaultdict
from typing import Dict, List, Tuple

from qdrant_client import QdrantClient

# --- Regex as in derive_edges.py (wikilinks)
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")
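# Illustrative matches (capture groups: target, heading, alias):
#   "[[Target]]"               -> ("Target", None, None)
#   "[[Target#Section|Alias]]" -> ("Target", "Section", "Alias")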

# --- Frontmatter check as in parser.validate_required_frontmatter (simplified replica)
REQUIRED = ("title", "id", "type", "status", "created")


def has_required_frontmatter(front: Dict) -> bool:
    return all(k in front and front[k] not in (None, "") for k in REQUIRED)

def read_front_and_body(path: str) -> Tuple[Dict, str]:
    # Minimal parser like parser.read_markdown (YAML is imported lazily here
    # to avoid a module-level dependency)
    import yaml, unicodedata
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read().lstrip("\ufeff")
    raw = unicodedata.normalize("NFKC", raw).replace("\r\n", "\n").replace("\r", "\n")
    if raw.startswith("---\n"):
        end = raw.find("\n---", 4)
        if end != -1:
            fm_raw = raw[4:end].strip()
            body = raw[end + 4:].lstrip("\n")
            try:
                fm = yaml.safe_load(fm_raw) or {}
                if not isinstance(fm, dict):
                    fm = {}
            except Exception:
                fm = {}
            return fm, body
    return {}, raw

def slug_file(path: str) -> str:
    import unicodedata
    s = os.path.basename(path)
    if s.endswith(".md"):
        s = s[:-3]
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = s.lower().replace(" ", "-")
    s = re.sub(r"[^a-z0-9\-]+", "", s)
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s
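
# Illustrative example (assumed input, not from the codebase): NFKD strips
# diacritics, so slug_file("Über Ästhetik.md") -> "uber-asthetik"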

def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]:
    files = glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)
    notes: List[Dict] = []
    wikilink_total = 0
    for p in files:
        pn = p.replace("\\", "/")
        if any(ex in pn for ex in ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")):
            continue
        fm, body = read_front_and_body(p)
        if not has_required_frontmatter(fm):
            continue
        nid = fm.get("id")
        title = fm.get("title") or os.path.basename(p).rsplit(".", 1)[0]
        relpath = os.path.relpath(p, vault_root).replace("\\", "/")
        links = list(WIKILINK_RE.finditer(body))
        wikilink_total += len(links)
        notes.append({
            "note_id": nid,
            "title": title,
            "path": relpath,
            "wikilink_count": len(links),
            "file_slug": slug_file(p),
        })
    return notes, wikilink_total

def qdrant_client_from_env() -> QdrantClient:
    url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
    api_key = os.getenv("QDRANT_API_KEY") or None
    return QdrantClient(url=url, api_key=api_key)
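
# Example invocation with an explicit endpoint (hostname illustrative):
#   QDRANT_URL=http://qdrant:6333 python3 -m scripts.audit_vault_vs_qdrant --vault ./vault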

def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000):
    next_offset = None
    while True:
        pts, next_offset = client.scroll(
            collection_name=collection,
            with_payload=with_payload,
            with_vectors=False,
            limit=limit,
            offset=next_offset,
        )
        yield from pts
        if next_offset is None:
            break
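
# Usage sketch (assumes a reachable Qdrant instance and an existing collection):
#   client = qdrant_client_from_env()
#   n_edges = sum(1 for _ in scroll_all(client, "mindnet_edges", with_payload=False))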

def collect_qdrant_stats(prefix: str, limit: int = 1000) -> Dict:
    client = qdrant_client_from_env()
    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }
    counts = {}
    for k, c in cols.items():
        n = 0
        for _ in scroll_all(client, c, with_payload=(k != "chunks"), limit=limit):
            n += 1
        counts[k] = n
    # Count edge kinds and unresolved edges
    kinds = Counter()
    unresolved = Counter()
    per_note_refs = defaultdict(int)
    for p in scroll_all(client, cols["edges"], with_payload=True, limit=limit):
        pl = p.payload or {}
        k = pl.get("kind")
        if k:
            kinds[k] += 1
        if pl.get("status") == "unresolved":
            unresolved[k] += 1
        # For the per-note comparison: count "references" edges per source note
        if k == "references":
            src = pl.get("source_id")
            if src:
                per_note_refs[src] += 1
    return {
        "collections": cols,
        "counts": counts,
        "kinds": kinds,
        "unresolved": unresolved,
        "per_note_refs": dict(per_note_refs),
    }

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Path to the vault (e.g. ./vault)")
    ap.add_argument("--prefix", default="mindnet", help="Qdrant collection prefix")
    ap.add_argument("--limit", type=int, default=1000, help="Scroll limit per page")
    args = ap.parse_args()

    notes, wikilink_total = collect_vault_stats(args.vault)
    q = collect_qdrant_stats(args.prefix, args.limit)
    notes_by_id = {n["note_id"]: n for n in notes}

    # Per-note comparison: expected wikilinks (vault) vs. actual "references" edges (Qdrant)
    deltas = []
    for nid, n in notes_by_id.items():
        expected = n["wikilink_count"]
        actual = q["per_note_refs"].get(nid, 0)
        if expected != actual:
            deltas.append({
                "note_id": nid,
                "title": n["title"],
                "path": n["path"],
                "wikilinks_in_vault": expected,
                "references_in_qdrant": actual,
                "delta": actual - expected,
            })

    out = {
        "vault": {
            "notes_with_required_frontmatter": len(notes),
            "wikilink_occurrences_total": wikilink_total,
        },
        "qdrant": {
            "collections": q["collections"],
            "counts": q["counts"],
            "edge_kinds": q["kinds"],
            "unresolved_by_kind": q["unresolved"],
        },
        "mismatch_notes": deltas[:50],  # emit only the first 50
        "mismatch_total": len(deltas),
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()