mindnet/scripts/audit_vault_vs_qdrant.py
Lars 41d43c2bb6
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
scripts/audit_vault_vs_qdrant.py hinzugefügt
2025-09-05 08:52:23 +02:00

202 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Name: scripts/audit_vault_vs_qdrant.py
Version: v1.0.0 (2025-09-05)
Kurzbeschreibung:
Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant:
- Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created).
- Zählt Wikilink-Vorkommen im Vault (regex wie in derive_edges.py).
- Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind).
- Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant).
- Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant).
Aufruf (aus Projekt-Root, im venv):
python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet
Parameter:
--vault Pfad zum Vault (z. B. ./vault)
--prefix Collection-Prefix in Qdrant (Default: mindnet)
--limit Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000)
Voraussetzungen:
- Aktives Python venv mit installiertem qdrant-client.
- Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional).
Hinweise:
- Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py) # :contentReference[oaicite:3]{index=3}
- Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py) # :contentReference[oaicite:4]{index=4}
- Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py) #
Changelog:
v1.0.0: Erste Version.
Autor:
mindnet Datenimporte & Sync
"""
from __future__ import annotations
import argparse, os, glob, re, json
from collections import Counter, defaultdict
from typing import Dict, List, Tuple, Optional
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
# --- Wikilink regex, same pattern as used in app/core/derive_edges.py:
#     group 1 = target note, group 2 = optional #heading, group 3 = optional |alias
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")
# --- Mandatory frontmatter keys, mirroring parser.validate_required_frontmatter (simplified replica)
REQUIRED = ("title","id","type","status","created")
def has_required_frontmatter(front: Dict) -> bool:
    """Return True if every key in REQUIRED is present and neither None nor empty."""
    for key in REQUIRED:
        if key not in front:
            return False
        if front[key] in (None, ""):
            return False
    return True
def read_front_and_body(path: str) -> Tuple[Dict, str]:
    """Read a markdown file and split it into (frontmatter dict, body text).

    Minimal replica of parser.read_markdown; yaml/unicodedata are imported
    locally on purpose to keep this script's module-level dependencies small.
    Returns ({}, full_text) when no valid frontmatter block is found, and
    {} for frontmatter that fails to parse or is not a mapping.
    """
    import yaml, unicodedata
    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read().lstrip("\ufeff")  # drop a UTF-8 BOM if present
    # Normalize Unicode and unify line endings before delimiter detection.
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    if not text.startswith("---\n"):
        return {}, text
    closing = text.find("\n---", 4)
    if closing == -1:
        # Opening delimiter without a closing one: treat everything as body.
        return {}, text
    fm_raw = text[4:closing].strip()
    body = text[closing + 4:].lstrip("\n")
    try:
        front = yaml.safe_load(fm_raw) or {}
    except Exception:
        front = {}
    if not isinstance(front, dict):
        front = {}
    return front, body
def slug_file(path: str) -> str:
    """Derive an ASCII slug from a file's basename (lowercase, hyphen-separated)."""
    import unicodedata
    name = os.path.basename(path)
    if name.endswith(".md"):
        name = name[:-3]
    # Strip diacritics: decompose (NFKD), then drop all combining marks.
    decomposed = unicodedata.normalize("NFKD", name)
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = without_marks.lower().replace(" ", "-")
    slug = re.sub(r"[^a-z0-9\-]+", "", slug)   # keep only [a-z0-9-]
    slug = re.sub(r"-{2,}", "-", slug)         # collapse hyphen runs
    return slug.strip("-")
def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]:
    """Walk the vault and return (per-note stats, total wikilink occurrences).

    Only notes with complete required frontmatter are counted; Obsidian
    config, frontmatter-backup and import folders are skipped entirely.
    """
    skip_markers = ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")
    notes: List[Dict] = []
    total_links = 0
    for md_path in glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True):
        normalized = md_path.replace("\\", "/")
        if any(marker in normalized for marker in skip_markers):
            continue
        front, body = read_front_and_body(md_path)
        if not has_required_frontmatter(front):
            continue
        link_count = len(WIKILINK_RE.findall(body))
        total_links += link_count
        notes.append({
            "note_id": front.get("id"),
            "title": front.get("title") or os.path.basename(md_path).rsplit(".", 1)[0],
            "path": os.path.relpath(md_path, vault_root).replace("\\", "/"),
            "wikilink_count": link_count,
            "file_slug": slug_file(md_path),
        })
    return notes, total_links
def qdrant_client_from_env() -> QdrantClient:
    """Build a QdrantClient from QDRANT_URL / QDRANT_API_KEY env vars.

    Defaults to a local instance on 127.0.0.1:6333; an empty QDRANT_API_KEY
    is coerced to None so no empty auth header is sent.
    """
    # 'os' is already imported at module level; the redundant local import was removed.
    url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
    api_key = os.getenv("QDRANT_API_KEY") or None
    return QdrantClient(url=url, api_key=api_key)
def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000):
    """Yield every point of *collection*, paging through Qdrant's scroll API."""
    cursor = None
    while True:
        page, cursor = client.scroll(
            collection_name=collection,
            with_payload=with_payload,
            with_vectors=False,
            limit=limit,
            offset=cursor,
        )
        yield from page
        if cursor is None:
            return
def collect_qdrant_stats(prefix: str, limit: int=1000) -> Dict:
    """Gather point counts and edge breakdowns from Qdrant.

    Returns a dict with: the resolved collection names, raw point counts per
    collection, edge counts per 'kind', unresolved edge counts per kind, and
    per-note counts of full-text 'references' edges keyed by source_id.
    """
    client = qdrant_client_from_env()
    collections = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }
    # Raw point counts; chunks are scrolled without payload to keep it cheap.
    counts = {
        key: sum(1 for _ in scroll_all(client, coll, with_payload=(key != "chunks"), limit=limit))
        for key, coll in collections.items()
    }
    kinds = Counter()
    unresolved = Counter()
    per_note_refs = defaultdict(int)
    for point in scroll_all(client, collections["edges"], with_payload=True, limit=limit):
        payload = point.payload or {}
        kind = payload.get("kind")
        if kind:
            kinds[kind] += 1
        if payload.get("status") == "unresolved":
            # NOTE(review): kind may be None here (edge without a 'kind' payload);
            # kept identical to the original counting behavior.
            unresolved[kind] += 1
        # Per-note comparison data: count full-text 'references' edges per source note.
        if kind == "references":
            source = payload.get("source_id")
            if source:
                per_note_refs[source] += 1
    return {
        "collections": collections,
        "counts": counts,
        "kinds": kinds,
        "unresolved": unresolved,
        "per_note_refs": dict(per_note_refs),
    }
def main():
    """CLI entry point: compare vault wikilink counts against Qdrant edges and print JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)")
    parser.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix")
    parser.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite")
    args = parser.parse_args()

    notes, wikilink_total = collect_vault_stats(args.vault)
    qstats = collect_qdrant_stats(args.prefix, args.limit)

    # Index by note id (duplicate ids collapse to the last occurrence, as before).
    notes_by_id = {n["note_id"]: n for n in notes}

    # Per-note comparison: expected wikilinks (vault) vs. actual 'references' edges (Qdrant).
    deltas = []
    for note_id, note in notes_by_id.items():
        expected = note["wikilink_count"]
        actual = qstats["per_note_refs"].get(note_id, 0)
        if expected == actual:
            continue
        deltas.append({
            "note_id": note_id,
            "title": note["title"],
            "path": note["path"],
            "wikilinks_in_vault": expected,
            "references_in_qdrant": actual,
            "delta": actual - expected,
        })

    report = {
        "vault": {
            "notes_with_required_frontmatter": len(notes),
            "wikilink_occurrences_total": wikilink_total,
        },
        "qdrant": {
            "collections": qstats["collections"],
            "counts": qstats["counts"],
            "edge_kinds": qstats["kinds"],
            "unresolved_by_kind": qstats["unresolved"],
        },
        "mismatch_notes": deltas[:50],  # cap the report at the first 50 mismatches
        "mismatch_total": len(deltas),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
if __name__ == "__main__":
main()