From 41d43c2bb638a45cfa7eab828ff9d96ab63cc907 Mon Sep 17 00:00:00 2001
From: Lars <Lars@stommer.de>
Date: Fri, 5 Sep 2025 08:52:23 +0200
Subject: [PATCH] =?UTF-8?q?scripts/audit=5Fvault=5Fvs=5Fqdrant.py=20hinzug?=
 =?UTF-8?q?ef=C3=BCgt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/audit_vault_vs_qdrant.py | 201 +++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 scripts/audit_vault_vs_qdrant.py

diff --git a/scripts/audit_vault_vs_qdrant.py b/scripts/audit_vault_vs_qdrant.py
new file mode 100644
index 0000000..b295b3c
--- /dev/null
+++ b/scripts/audit_vault_vs_qdrant.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Name:        scripts/audit_vault_vs_qdrant.py
+Version:     v1.0.0 (2025-09-05)
+Kurzbeschreibung:
+  Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant:
+  - Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created).
+  - Zählt Wikilink-Vorkommen im Vault (regex wie in derive_edges.py).
+  - Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind).
+  - Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant).
+  - Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant).
+
+Aufruf (aus Projekt-Root, im venv):
+  python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet
+
+Parameter:
+  --vault     Pfad zum Vault (z. B. ./vault)
+  --prefix    Collection-Prefix in Qdrant (Default: mindnet)
+  --limit     Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000)
+
+Voraussetzungen:
+  - Aktives Python venv mit installiertem qdrant-client und PyYAML.
+  - Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional).
+
+Hinweise:
+  - Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py)
+  - Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py)
+  - Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py)
+
+Changelog:
+  v1.0.0: Erste Version.
+
+Autor:
+  mindnet – Datenimporte & Sync
+"""
+from __future__ import annotations
+import argparse, os, glob, re, json
+from collections import Counter, defaultdict
+from typing import Dict, List, Tuple, Optional
+
+from qdrant_client import QdrantClient
+from qdrant_client.http import models as rest
+
+# --- Wikilink pattern; per the original author's note it mirrors the one in
+# derive_edges.py. It matches "[[...]]" and captures up to three parts:
+#   group 1 = text before any "#" or "|" (required),
+#   group 2 = text after "#" (optional),
+#   group 3 = text after "|" (optional).
+WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")
+
+# --- Frontmatter keys that must be present and non-empty; per the original
+# author's note a simplified replica of parser.validate_required_frontmatter.
+REQUIRED = ("title","id","type","status","created")
+
+def has_required_frontmatter(front: Dict) -> bool:
+    return all(k in front and front[k] not in (None, "") for k in REQUIRED)
+
+def read_front_and_body(path: str) -> Tuple[Dict, str]:
+    # Minimal-Parser wie parser.read_markdown (kein YAML-Import hier, um Abh. zu vermeiden)
+    import yaml, unicodedata
+    with open(path, "r", encoding="utf-8") as f:
+        raw = f.read().lstrip("\ufeff")
+    raw = unicodedata.normalize("NFKC", raw).replace("\r\n", "\n").replace("\r", "\n")
+    if raw.startswith("---\n"):
+        end = raw.find("\n---", 4)
+        if end != -1:
+            fm_raw = raw[4:end].strip()
+            body = raw[end+4:].lstrip("\n")
+            try:
+                fm = yaml.safe_load(fm_raw) or {}
+                if not isinstance(fm, dict):
+                    fm = {}
+            except Exception:
+                fm = {}
+            return fm, body
+    return {}, raw
+
+def slug_file(path: str) -> str:
+    import unicodedata
+    s = os.path.basename(path)
+    if s.endswith(".md"): s = s[:-3]
+    s = unicodedata.normalize("NFKD", s)
+    s = "".join(ch for ch in s if not unicodedata.combining(ch))
+    s = s.lower().replace(" ", "-")
+    s = re.sub(r"[^a-z0-9\-]+", "", s)
+    s = re.sub(r"-{2,}", "-", s).strip("-")
+    return s
+
+def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]:
+    files = [p for p in glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)]
+    notes: List[Dict] = []
+    wikilink_total = 0
+    for p in files:
+        pn = p.replace("\\","/")
+        if any(ex in pn for ex in ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")):
+            continue
+        fm, body = read_front_and_body(p)
+        if not has_required_frontmatter(fm):
+            continue
+        nid = fm.get("id")
+        title = fm.get("title") or os.path.basename(p).rsplit(".",1)[0]
+        relpath = os.path.relpath(p, vault_root).replace("\\","/")
+        links = list(WIKILINK_RE.finditer(body))
+        wikilink_total += len(links)
+        notes.append({
+            "note_id": nid,
+            "title": title,
+            "path": relpath,
+            "wikilink_count": len(links),
+            "file_slug": slug_file(p),
+        })
+    return notes, wikilink_total
+
+def qdrant_client_from_env() -> QdrantClient:
+    import os
+    url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
+    api_key = os.getenv("QDRANT_API_KEY") or None
+    return QdrantClient(url=url, api_key=api_key)
+
+def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000):
+    next_offset = None
+    while True:
+        pts, next_offset = client.scroll(collection_name=collection, with_payload=with_payload, with_vectors=False, limit=limit, offset=next_offset)
+        for p in pts:
+            yield p
+        if next_offset is None:
+            break
+
+def collect_qdrant_stats(prefix: str, limit: int=1000) -> Dict:
+    client = qdrant_client_from_env()
+    cols = {
+        "notes": f"{prefix}_notes",
+        "chunks": f"{prefix}_chunks",
+        "edges": f"{prefix}_edges",
+    }
+    counts = {}
+    for k, c in cols.items():
+        n = 0
+        for _ in scroll_all(client, c, with_payload=(k!="chunks"), limit=limit):
+            n += 1
+        counts[k] = n
+
+    # Edge-Kinds & unresolved zählen
+    kinds = Counter()
+    unresolved = Counter()
+    per_note_refs = defaultdict(int)
+    for p in scroll_all(client, cols["edges"], with_payload=True, limit=limit):
+        pl = p.payload or {}
+        k = pl.get("kind")
+        if k: kinds[k] += 1
+        if pl.get("status") == "unresolved":
+            unresolved[k] += 1
+        # für per-Note-Vergleich: references (Volltext) zählen
+        if k == "references":
+            src = pl.get("source_id")
+            if src:
+                per_note_refs[src] += 1
+
+    return {"collections": cols, "counts": counts, "kinds": kinds, "unresolved": unresolved, "per_note_refs": dict(per_note_refs)}
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)")
+    ap.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix")
+    ap.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite")
+    args = ap.parse_args()
+
+    notes, wikilink_total = collect_vault_stats(args.vault)
+    q = collect_qdrant_stats(args.prefix, args.limit)
+
+    notes_by_id = {n["note_id"]: n for n in notes}
+
+    # Abgleich pro Note: erwartete Wikilinks (Vault) vs. tatsächliche references (Qdrant)
+    deltas = []
+    for nid, n in notes_by_id.items():
+        expected = n["wikilink_count"]
+        actual = q["per_note_refs"].get(nid, 0)
+        if expected != actual:
+            deltas.append({
+                "note_id": nid,
+                "title": n["title"],
+                "path": n["path"],
+                "wikilinks_in_vault": expected,
+                "references_in_qdrant": actual,
+                "delta": actual - expected
+            })
+
+    out = {
+        "vault": {
+            "notes_with_required_frontmatter": len(notes),
+            "wikilink_occurrences_total": wikilink_total
+        },
+        "qdrant": {
+            "collections": q["collections"],
+            "counts": q["counts"],
+            "edge_kinds": q["kinds"],
+            "unresolved_by_kind": q["unresolved"]
+        },
+        "mismatch_notes": deltas[:50],  # nur erste 50 ausgeben
+        "mismatch_total": len(deltas)
+    }
+    print(json.dumps(out, ensure_ascii=False, indent=2))
+
+# Script entry point: run the audit only when this module is executed (e.g.
+# via ``python3 -m scripts.audit_vault_vs_qdrant``), not when it is imported.
+if __name__ == "__main__":
+    main()