From 41d43c2bb638a45cfa7eab828ff9d96ab63cc907 Mon Sep 17 00:00:00 2001 From: Lars Date: Fri, 5 Sep 2025 08:52:23 +0200 Subject: [PATCH] =?UTF-8?q?scripts/audit=5Fvault=5Fvs=5Fqdrant.py=20hinzug?= =?UTF-8?q?ef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/audit_vault_vs_qdrant.py | 201 +++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 scripts/audit_vault_vs_qdrant.py diff --git a/scripts/audit_vault_vs_qdrant.py b/scripts/audit_vault_vs_qdrant.py new file mode 100644 index 0000000..b295b3c --- /dev/null +++ b/scripts/audit_vault_vs_qdrant.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Name: scripts/audit_vault_vs_qdrant.py +Version: v1.0.0 (2025-09-05) +Kurzbeschreibung: + Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant: + - Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created). + - Zählt Wikilink-Vorkommen im Vault (regex wie in derive_edges.py). + - Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind). + - Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant). + - Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant). + +Aufruf (aus Projekt-Root, im venv): + python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet + +Parameter: + --vault Pfad zum Vault (z. B. ./vault) + --prefix Collection-Prefix in Qdrant (Default: mindnet) + --limit Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000) + +Voraussetzungen: + - Aktives Python venv mit installiertem qdrant-client. + - Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional). + +Hinweise: + - Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py) # :contentReference[oaicite:3]{index=3} + - Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py) # :contentReference[oaicite:4]{index=4} + - Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py) # + +Changelog: + v1.0.0: Erste Version. + +Autor: + mindnet – Datenimporte & Sync +""" +from __future__ import annotations +import argparse, os, glob, re, json +from collections import Counter, defaultdict +from typing import Dict, List, Tuple, Optional + +from qdrant_client import QdrantClient +from qdrant_client.http import models as rest + +# --- Regex wie in derive_edges.py (Wikilinks) +WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]") # :contentReference[oaicite:6]{index=6} + +# --- Frontmatter-Prüfung wie in parser.validate_required_frontmatter (vereinfachte Replik) :contentReference[oaicite:7]{index=7} +REQUIRED = ("title","id","type","status","created") + +def has_required_frontmatter(front: Dict) -> bool: + return all(k in front and front[k] not in (None, "") for k in REQUIRED) + +def read_front_and_body(path: str) -> Tuple[Dict, str]: + # Minimal-Parser wie parser.read_markdown (kein YAML-Import hier, um Abh. zu vermeiden) + import yaml, unicodedata + with open(path, "r", encoding="utf-8") as f: + raw = f.read().lstrip("\ufeff") + raw = unicodedata.normalize("NFKC", raw).replace("\r\n", "\n").replace("\r", "\n") + if raw.startswith("---\n"): + end = raw.find("\n---", 4) + if end != -1: + fm_raw = raw[4:end].strip() + body = raw[end+4:].lstrip("\n") + try: + fm = yaml.safe_load(fm_raw) or {} + if not isinstance(fm, dict): + fm = {} + except Exception: + fm = {} + return fm, body + return {}, raw + +def slug_file(path: str) -> str: + import unicodedata + s = os.path.basename(path) + if s.endswith(".md"): s = s[:-3] + s = unicodedata.normalize("NFKD", s) + s = "".join(ch for ch in s if not unicodedata.combining(ch)) + s = s.lower().replace(" ", "-") + s = re.sub(r"[^a-z0-9\-]+", "", s) + s = re.sub(r"-{2,}", "-", s).strip("-") + return s + +def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]: + files = [p for p in glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)] + notes: List[Dict] = [] + wikilink_total = 0 + for p in files: + pn = p.replace("\\","/") + if any(ex in pn for ex in ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")): + continue + fm, body = read_front_and_body(p) + if not has_required_frontmatter(fm): + continue + nid = fm.get("id") + title = fm.get("title") or os.path.basename(p).rsplit(".",1)[0] + relpath = os.path.relpath(p, vault_root).replace("\\","/") + links = list(WIKILINK_RE.finditer(body)) + wikilink_total += len(links) + notes.append({ + "note_id": nid, + "title": title, + "path": relpath, + "wikilink_count": len(links), + "file_slug": slug_file(p), + }) + return notes, wikilink_total + +def qdrant_client_from_env() -> QdrantClient: + import os + url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") + api_key = os.getenv("QDRANT_API_KEY") or None + return QdrantClient(url=url, api_key=api_key) + +def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000): + next_offset = None + while True: + pts, next_offset = client.scroll(collection_name=collection, with_payload=with_payload, with_vectors=False, limit=limit, offset=next_offset) + for p in pts: + yield p + if next_offset is None: + break + +def collect_qdrant_stats(prefix: str, limit: int=1000) -> Dict: + client = qdrant_client_from_env() + cols = { + "notes": f"{prefix}_notes", + "chunks": f"{prefix}_chunks", + "edges": f"{prefix}_edges", + } + counts = {} + for k, c in cols.items(): + n = 0 + for _ in scroll_all(client, c, with_payload=(k!="chunks"), limit=limit): + n += 1 + counts[k] = n + + # Edge-Kinds & unresolved zählen + kinds = Counter() + unresolved = Counter() + per_note_refs = defaultdict(int) + for p in scroll_all(client, cols["edges"], with_payload=True, limit=limit): + pl = p.payload or {} + k = pl.get("kind") + if k: kinds[k] += 1 + if pl.get("status") == "unresolved": + unresolved[k] += 1 + # für per-Note-Vergleich: references (Volltext) zählen + if k == "references": + src = pl.get("source_id") + if src: + per_note_refs[src] += 1 + + return {"collections": cols, "counts": counts, "kinds": kinds, "unresolved": unresolved, "per_note_refs": dict(per_note_refs)} + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)") + ap.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix") + ap.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite") + args = ap.parse_args() + + notes, wikilink_total = collect_vault_stats(args.vault) + q = collect_qdrant_stats(args.prefix, args.limit) + + notes_by_id = {n["note_id"]: n for n in notes} + + # Abgleich pro Note: erwartete Wikilinks (Vault) vs. tatsächliche references (Qdrant) + deltas = [] + for nid, n in notes_by_id.items(): + expected = n["wikilink_count"] + actual = q["per_note_refs"].get(nid, 0) + if expected != actual: + deltas.append({ + "note_id": nid, + "title": n["title"], + "path": n["path"], + "wikilinks_in_vault": expected, + "references_in_qdrant": actual, + "delta": actual - expected + }) + + out = { + "vault": { + "notes_with_required_frontmatter": len(notes), + "wikilink_occurrences_total": wikilink_total + }, + "qdrant": { + "collections": q["collections"], + "counts": q["counts"], + "edge_kinds": q["kinds"], + "unresolved_by_kind": q["unresolved"] + }, + "mismatch_notes": deltas[:50], # nur erste 50 ausgeben + "mismatch_total": len(deltas) + } + print(json.dumps(out, ensure_ascii=False, indent=2)) + +if __name__ == "__main__": + main()