202 lines
7.6 KiB
Python
202 lines
7.6 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Name: scripts/audit_vault_vs_qdrant.py
Version: v1.0.0 (2025-09-05)

Kurzbeschreibung:
Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant:
- Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created).
- Zählt Wikilink-Vorkommen im Vault (Regex wie in derive_edges.py).
- Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind).
- Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant).
- Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant).

Aufruf (aus Projekt-Root, im venv):
    python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet

Parameter:
    --vault   Pfad zum Vault (z. B. ./vault)
    --prefix  Collection-Prefix in Qdrant (Default: mindnet)
    --limit   Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000)

Voraussetzungen:
- Aktives Python venv mit installiertem qdrant-client.
- Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional).

Hinweise:
- Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py)
- Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py)
- Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py)

Changelog:
    v1.0.0: Erste Version.

Autor:
    mindnet – Datenimporte & Sync
"""
|
||
from __future__ import annotations
|
||
import argparse, os, glob, re, json
|
||
from collections import Counter, defaultdict
|
||
from typing import Dict, List, Tuple, Optional
|
||
|
||
from qdrant_client import QdrantClient
|
||
from qdrant_client.http import models as rest
|
||
|
||
# --- Wikilink regex, same pattern as used in derive_edges.py.
# Groups: (1) target page, (2) optional #heading, (3) optional |alias.
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")

# --- Frontmatter check mirroring parser.validate_required_frontmatter (simplified replica).
# Keys every note's frontmatter must define with a non-empty value.
REQUIRED = ("title","id","type","status","created")
|
||
|
||
def has_required_frontmatter(front: Dict) -> bool:
    """Return True if *front* defines every REQUIRED key with a non-empty value.

    A key counts as missing when it is absent, None, or the empty string.
    """
    for key in REQUIRED:
        if front.get(key, "") in (None, ""):
            return False
    return True
|
||
|
||
def read_front_and_body(path: str) -> Tuple[Dict, str]:
    """Read a Markdown file and split it into (frontmatter dict, body text).

    Minimal re-implementation of parser.read_markdown: strips a UTF-8 BOM,
    normalizes to NFKC with unified "\n" newlines, and YAML-parses a leading
    ``---`` block. Returns ({}, full normalized text) when no frontmatter
    block is found or the YAML is invalid / not a mapping.
    """
    # Local imports keep module-level dependencies minimal (as in the original).
    import yaml, unicodedata

    with open(path, "r", encoding="utf-8") as fh:
        text = fh.read().lstrip("\ufeff")
    text = unicodedata.normalize("NFKC", text).replace("\r\n", "\n").replace("\r", "\n")

    if not text.startswith("---\n"):
        return {}, text
    closing = text.find("\n---", 4)
    if closing == -1:
        # Opening delimiter without a closing one: treat everything as body.
        return {}, text

    yaml_part = text[4:closing].strip()
    body = text[closing + 4:].lstrip("\n")
    try:
        front = yaml.safe_load(yaml_part) or {}
    except Exception:
        front = {}  # invalid YAML degrades to "no frontmatter"
    if not isinstance(front, dict):
        front = {}
    return front, body
|
||
|
||
def slug_file(path: str) -> str:
    """Derive an ASCII slug from a file's basename (a ``.md`` suffix is dropped).

    Diacritics are stripped via NFKD decomposition, spaces become hyphens,
    any remaining non-[a-z0-9-] characters are removed, and hyphen runs are
    collapsed and trimmed.
    """
    import unicodedata
    name = os.path.basename(path)
    if name.endswith(".md"):
        name = name[:-3]
    # Decompose accented characters, then drop the combining marks.
    decomposed = unicodedata.normalize("NFKD", name)
    ascii_ish = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = ascii_ish.lower().replace(" ", "-")
    slug = re.sub(r"[^a-z0-9\-]+", "", slug)
    slug = re.sub(r"-{2,}", "-", slug)
    return slug.strip("-")
|
||
|
||
def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]:
    """Scan the vault for Markdown notes with complete frontmatter.

    Args:
        vault_root: Path to the vault root directory.

    Returns:
        Tuple of (note summaries, total wikilink occurrences across all
        accepted notes). Each summary dict carries: note_id, title, path
        (relative, forward slashes), wikilink_count and file_slug.
    """
    # glob.glob already returns a list; wrapping it in a comprehension was redundant.
    files = glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)
    # Folders that never contain importable notes (Obsidian config, backups, staging).
    excluded = ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")
    notes: List[Dict] = []
    wikilink_total = 0
    for p in files:
        pn = p.replace("\\", "/")
        if any(ex in pn for ex in excluded):
            continue
        fm, body = read_front_and_body(p)
        if not has_required_frontmatter(fm):
            continue  # only notes with complete frontmatter are audited
        nid = fm.get("id")
        title = fm.get("title") or os.path.basename(p).rsplit(".", 1)[0]
        relpath = os.path.relpath(p, vault_root).replace("\\", "/")
        # findall is sufficient: only the number of matches is used.
        links = WIKILINK_RE.findall(body)
        wikilink_total += len(links)
        notes.append({
            "note_id": nid,
            "title": title,
            "path": relpath,
            "wikilink_count": len(links),
            "file_slug": slug_file(p),
        })
    return notes, wikilink_total
|
||
|
||
def qdrant_client_from_env() -> QdrantClient:
    """Build a QdrantClient from environment variables.

    Reads QDRANT_URL (default http://127.0.0.1:6333) and the optional
    QDRANT_API_KEY; an empty API key string is treated as "no key".
    """
    # NOTE: the redundant function-local ``import os`` was removed — ``os``
    # is already imported at module level.
    url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
    api_key = os.getenv("QDRANT_API_KEY") or None
    return QdrantClient(url=url, api_key=api_key)
|
||
|
||
def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000):
    """Yield every point in *collection*, paging through Qdrant's scroll API.

    Vectors are never fetched; *limit* controls the page size and *with_payload*
    whether point payloads are included.
    """
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection,
            with_payload=with_payload,
            with_vectors=False,
            limit=limit,
            offset=offset,
        )
        yield from points
        if offset is None:
            return  # no more pages
|
||
|
||
def collect_qdrant_stats(prefix: str, limit: int = 1000) -> Dict:
    """Gather point counts and edge statistics from Qdrant.

    Args:
        prefix: Collection-name prefix (collections are <prefix>_notes,
            <prefix>_chunks and <prefix>_edges).
        limit: Page size for the scroll API.

    Returns:
        Dict with the collection names, per-collection point counts, edge-kind
        counters, unresolved-edge counters by kind, and per-note counts of
        "references" edges keyed by source_id.
    """
    client = qdrant_client_from_env()
    cols = {name: f"{prefix}_{name}" for name in ("notes", "chunks", "edges")}

    # Count points per collection by exhausting the scroll iterator
    # (payloads are skipped for the chunks collection).
    counts = {
        key: sum(1 for _ in scroll_all(client, col, with_payload=(key != "chunks"), limit=limit))
        for key, col in cols.items()
    }

    kinds = Counter()
    unresolved = Counter()
    per_note_refs = defaultdict(int)
    for point in scroll_all(client, cols["edges"], with_payload=True, limit=limit):
        payload = point.payload or {}
        kind = payload.get("kind")
        if kind:
            kinds[kind] += 1
        if payload.get("status") == "unresolved":
            unresolved[kind] += 1
        # Per-note comparison: count full-text "references" edges by source note.
        if kind == "references":
            source = payload.get("source_id")
            if source:
                per_note_refs[source] += 1

    return {
        "collections": cols,
        "counts": counts,
        "kinds": kinds,
        "unresolved": unresolved,
        "per_note_refs": dict(per_note_refs),
    }
|
||
|
||
def main():
    """CLI entry point: compare vault wikilink counts against Qdrant edges.

    Prints a JSON report with vault stats, Qdrant stats, and up to 50
    per-note mismatches between expected wikilinks and imported references.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)")
    parser.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix")
    parser.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite")
    args = parser.parse_args()

    notes, wikilink_total = collect_vault_stats(args.vault)
    qstats = collect_qdrant_stats(args.prefix, args.limit)

    notes_by_id = {note["note_id"]: note for note in notes}

    # Per-note comparison: expected wikilinks (vault) vs. actual references (Qdrant).
    deltas = []
    for note_id, note in notes_by_id.items():
        expected = note["wikilink_count"]
        actual = qstats["per_note_refs"].get(note_id, 0)
        if expected == actual:
            continue
        deltas.append({
            "note_id": note_id,
            "title": note["title"],
            "path": note["path"],
            "wikilinks_in_vault": expected,
            "references_in_qdrant": actual,
            "delta": actual - expected,
        })

    report = {
        "vault": {
            "notes_with_required_frontmatter": len(notes),
            "wikilink_occurrences_total": wikilink_total,
        },
        "qdrant": {
            "collections": qstats["collections"],
            "counts": qstats["counts"],
            "edge_kinds": qstats["kinds"],
            "unresolved_by_kind": qstats["unresolved"],
        },
        "mismatch_notes": deltas[:50],  # cap output at the first 50 mismatches
        "mismatch_total": len(deltas),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
|
||
|
||
if __name__ == "__main__":
|
||
main()
|