#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Name: scripts/audit_vault_vs_qdrant.py Version: v1.0.0 (2025-09-05) Kurzbeschreibung: Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant: - Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created). - Zählt Wikilink-Vorkommen im Vault (regex wie in derive_edges.py). - Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind). - Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant). - Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant). Aufruf (aus Projekt-Root, im venv): python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet Parameter: --vault Pfad zum Vault (z. B. ./vault) --prefix Collection-Prefix in Qdrant (Default: mindnet) --limit Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000) Voraussetzungen: - Aktives Python venv mit installiertem qdrant-client. - Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional). Hinweise: - Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py) # :contentReference[oaicite:3]{index=3} - Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py) # :contentReference[oaicite:4]{index=4} - Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py) # Changelog: v1.0.0: Erste Version. Autor: mindnet – Datenimporte & Sync """ from __future__ import annotations import argparse, os, glob, re, json from collections import Counter, defaultdict from typing import Dict, List, Tuple, Optional from qdrant_client import QdrantClient from qdrant_client.http import models as rest # --- Regex wie in derive_edges.py (Wikilinks) WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]") # :contentReference[oaicite:6]{index=6} # --- Frontmatter-Prüfung wie in parser.validate_required_frontmatter (vereinfachte Replik) :contentReference[oaicite:7]{index=7} REQUIRED = ("title","id","type","status","created") def has_required_frontmatter(front: Dict) -> bool: return all(k in front and front[k] not in (None, "") for k in REQUIRED) def read_front_and_body(path: str) -> Tuple[Dict, str]: # Minimal-Parser wie parser.read_markdown (kein YAML-Import hier, um Abh. zu vermeiden) import yaml, unicodedata with open(path, "r", encoding="utf-8") as f: raw = f.read().lstrip("\ufeff") raw = unicodedata.normalize("NFKC", raw).replace("\r\n", "\n").replace("\r", "\n") if raw.startswith("---\n"): end = raw.find("\n---", 4) if end != -1: fm_raw = raw[4:end].strip() body = raw[end+4:].lstrip("\n") try: fm = yaml.safe_load(fm_raw) or {} if not isinstance(fm, dict): fm = {} except Exception: fm = {} return fm, body return {}, raw def slug_file(path: str) -> str: import unicodedata s = os.path.basename(path) if s.endswith(".md"): s = s[:-3] s = unicodedata.normalize("NFKD", s) s = "".join(ch for ch in s if not unicodedata.combining(ch)) s = s.lower().replace(" ", "-") s = re.sub(r"[^a-z0-9\-]+", "", s) s = re.sub(r"-{2,}", "-", s).strip("-") return s def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]: files = [p for p in glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)] notes: List[Dict] = [] wikilink_total = 0 for p in files: pn = p.replace("\\","/") if any(ex in pn for ex in ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")): continue fm, body = read_front_and_body(p) if not has_required_frontmatter(fm): continue nid = fm.get("id") title = fm.get("title") or os.path.basename(p).rsplit(".",1)[0] relpath = os.path.relpath(p, vault_root).replace("\\","/") links = list(WIKILINK_RE.finditer(body)) wikilink_total += len(links) notes.append({ "note_id": nid, "title": title, "path": relpath, "wikilink_count": len(links), "file_slug": slug_file(p), }) return notes, wikilink_total def qdrant_client_from_env() -> QdrantClient: import os url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") api_key = os.getenv("QDRANT_API_KEY") or None return QdrantClient(url=url, api_key=api_key) def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000): next_offset = None while True: pts, next_offset = client.scroll(collection_name=collection, with_payload=with_payload, with_vectors=False, limit=limit, offset=next_offset) for p in pts: yield p if next_offset is None: break def collect_qdrant_stats(prefix: str, limit: int=1000) -> Dict: client = qdrant_client_from_env() cols = { "notes": f"{prefix}_notes", "chunks": f"{prefix}_chunks", "edges": f"{prefix}_edges", } counts = {} for k, c in cols.items(): n = 0 for _ in scroll_all(client, c, with_payload=(k!="chunks"), limit=limit): n += 1 counts[k] = n # Edge-Kinds & unresolved zählen kinds = Counter() unresolved = Counter() per_note_refs = defaultdict(int) for p in scroll_all(client, cols["edges"], with_payload=True, limit=limit): pl = p.payload or {} k = pl.get("kind") if k: kinds[k] += 1 if pl.get("status") == "unresolved": unresolved[k] += 1 # für per-Note-Vergleich: references (Volltext) zählen if k == "references": src = pl.get("source_id") if src: per_note_refs[src] += 1 return {"collections": cols, "counts": counts, "kinds": kinds, "unresolved": unresolved, "per_note_refs": dict(per_note_refs)} def main(): ap = argparse.ArgumentParser() ap.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)") ap.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix") ap.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite") args = ap.parse_args() notes, wikilink_total = collect_vault_stats(args.vault) q = collect_qdrant_stats(args.prefix, args.limit) notes_by_id = {n["note_id"]: n for n in notes} # Abgleich pro Note: erwartete Wikilinks (Vault) vs. tatsächliche references (Qdrant) deltas = [] for nid, n in notes_by_id.items(): expected = n["wikilink_count"] actual = q["per_note_refs"].get(nid, 0) if expected != actual: deltas.append({ "note_id": nid, "title": n["title"], "path": n["path"], "wikilinks_in_vault": expected, "references_in_qdrant": actual, "delta": actual - expected }) out = { "vault": { "notes_with_required_frontmatter": len(notes), "wikilink_occurrences_total": wikilink_total }, "qdrant": { "collections": q["collections"], "counts": q["counts"], "edge_kinds": q["kinds"], "unresolved_by_kind": q["unresolved"] }, "mismatch_notes": deltas[:50], # nur erste 50 ausgeben "mismatch_total": len(deltas) } print(json.dumps(out, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()