#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Script: audit_edges_vs_expectations.py — Prüfe Kanten in Qdrant gegen Vault-Erwartungen Version: 1.0.0 Datum: 2025-09-09 Zweck ----- - Liest Edges/Chunks/Notes aus Qdrant. - Ermittelt erwartete Kanten-Anzahlen aus dem Vault: * belongs_to : sollte == #Chunks * next / prev : je Note (#Chunks_in_Note - 1) * references : Summe aller Chunk-Wikilinks * backlink : Summe einzigartiger Wikilinks pro Note (Note-Level) - Vergleicht IST vs. SOLL und meldet Abweichungen. ENV/Qdrant ---------- QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX (Default: mindnet) Aufrufe ------- # Gesamtaudit python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault # Mit anderem Prefix python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --prefix mindnet_dev # Details anzeigen python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --details """ from __future__ import annotations import argparse import json import os import re from collections import defaultdict, Counter from typing import Dict, List, Tuple from qdrant_client import QdrantClient from qdrant_client.http import models as rest from dotenv import load_dotenv # Projektmodule – nur leichtgewichtige Funktionen try: from app.core.parser import read_markdown except Exception: # sehr einfacher Fallback für Wikilinks read_markdown = None WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") # ------------------------------ # Qdrant Helpers # ------------------------------ def _names(prefix: str) -> Tuple[str, str, str]: return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" def _scroll_all(client: QdrantClient, col: str, flt=None, with_payload=True, with_vectors=False, limit=256): out, next_page = [], None while True: pts, next_page = client.scroll( collection_name=col, scroll_filter=flt, with_payload=with_payload, with_vectors=with_vectors, limit=limit, offset=next_page, ) if not pts: break out.extend(pts) if not next_page: break return out # 
# ------------------------------
# Vault scan
# ------------------------------


def _iter_md(root: str) -> List[str]:
    """Return the sorted paths of all ``.md`` files found below *root*.

    The extension check is case-insensitive. Files whose path contains a
    ``.obsidian`` directory component are skipped.
    """
    found: List[str] = []
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if not filename.lower().endswith(".md"):
                continue
            path = os.path.join(dirpath, filename)
            # Normalise separators so the check also matches Windows paths.
            if "/.obsidian/" in path.replace("\\", "/"):
                continue
            found.append(path)
    return sorted(found)


def _wikilinks_in_text(text: str) -> List[str]:
    """Return the inner text of every ``[[...]]`` link in *text*, in order.

    Duplicates are kept; ``None`` or empty input yields an empty list.
    """
    return WIKILINK_RE.findall(text or "")


def _strip_frontmatter(text: str) -> str:
    """Return *text* without a leading ``---`` ... ``---`` front-matter block.

    This is a very simple split, not a YAML parser. Leading whitespace
    before the opening ``---`` is tolerated. If there is no opening marker
    or no closing marker, *text* is returned unchanged.
    """
    stripped = text.lstrip()
    if not stripped.startswith("---"):
        return text
    # Search for the closing marker *after* the opening one; otherwise a file
    # starting with a blank line would split at the opening marker and leak
    # the front matter into the body.
    _, closing, body = stripped[3:].partition("\n---")
    return body if closing else text


def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]:
    """Map each note id in the vault to the wikilinks found in its body.

    When the project parser ``read_markdown`` is available, the note id is
    taken from the front matter (``id`` or ``note_id``) and falls back to the
    file name without extension. Without the parser, the file is read
    directly, the front matter is stripped with a simple split, and the file
    name without extension is used as the id.

    Notes that cannot be read or parsed are skipped with a warning on stderr,
    so one broken file does not abort the audit. If two files resolve to the
    same id, the later one wins.
    """
    import sys  # local import: only needed for the warning below

    res: Dict[str, List[str]] = {}
    for path in _iter_md(vault_root):
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            if read_markdown:
                parsed = read_markdown(path)
                body = parsed.body or ""
                fm = parsed.frontmatter or {}
                nid = fm.get("id") or fm.get("note_id") or stem
            else:
                with open(path, "r", encoding="utf-8") as f:
                    body = _strip_frontmatter(f.read())
                nid = stem
            res[nid] = _wikilinks_in_text(body)
        except Exception as exc:
            # Best effort: report the file and continue with the next one.
            print(f"[audit] skipped {path}: {exc!r}", file=sys.stderr)
            continue
    return res


# ------------------------------
# Main Audit
# ------------------------------


def _tally_chunks(chunks) -> Tuple[Dict[str, int], int]:
    """Count chunks per note and the total number of chunk wikilinks.

    Reads ``note_id`` and ``wikilinks`` from each point's payload. A
    ``wikilinks`` value that is not a list is ignored.

    Returns:
        ``(chunks_per_note, wikilinks_total)``.
    """
    per_note: Dict[str, int] = defaultdict(int)
    wikilinks_total = 0
    for point in chunks:
        payload = point.payload or {}
        per_note[payload.get("note_id")] += 1
        links = payload.get("wikilinks") or []
        if isinstance(links, list):
            wikilinks_total += len(links)
    return dict(per_note), wikilinks_total


def _tally_edges(edges) -> Tuple[Counter, Counter]:
    """Count edges by kind and by ``"kind:scope"``.

    The kind is read from payload key ``kind``, then ``edge_type``; a missing
    kind or scope is counted as ``"?"``.
    """
    by_kind: Counter = Counter()
    by_scope: Counter = Counter()
    for point in edges:
        payload = point.payload or {}
        kind = payload.get("kind") or payload.get("edge_type") or "?"
        scope = payload.get("scope") or "?"
        by_kind[kind] += 1
        by_scope[f"{kind}:{scope}"] += 1
    return by_kind, by_scope


def _expected_counts(by_note_chunks, chunk_wikilinks_total, wl_per_note) -> Dict[str, int]:
    """Derive the expected number of edges per audited kind.

    - belongs_to: one per chunk
    - next / prev: per note, number of chunks minus one (never negative)
    - references: total number of wikilinks stored in the chunk payloads
    - backlink: per vault note, number of distinct wikilink strings
    """
    sequence_edges = sum(max(count - 1, 0) for count in by_note_chunks.values())
    return {
        "belongs_to": sum(by_note_chunks.values()),
        "next": sequence_edges,
        "prev": sequence_edges,  # symmetric to "next"
        "references": chunk_wikilinks_total,
        "backlink": sum(len(set(links)) for links in wl_per_note.values()),
    }


def _build_report(notes, chunks, edges, wl_per_note, collections, details=False) -> dict:
    """Assemble the JSON-serialisable audit report.

    Args:
        notes: Points of the notes collection (only counted, for details).
        chunks: Points of the chunks collection.
        edges: Points of the edges collection.
        wl_per_note: Vault wikilinks per note id.
        collections: Mapping with the collection names, echoed in the report.
        details: When true, a ``"details"`` section is added.

    Returns:
        The report dict. Each delta is the actual minus the expected count.
    """
    cnt_kind, cnt_scope = _tally_edges(edges)
    by_note_chunks, chunk_wikilinks_total = _tally_chunks(chunks)
    expected = _expected_counts(by_note_chunks, chunk_wikilinks_total, wl_per_note)
    deltas = {kind: cnt_kind.get(kind, 0) - want for kind, want in expected.items()}

    report = {
        "qdrant_counts": dict(cnt_kind),
        "qdrant_counts_by_scope": dict(cnt_scope),
        "chunks_total": sum(by_note_chunks.values()),
        "by_note_chunks": by_note_chunks,
        "vault_expected": expected,
        "deltas": deltas,
        "collections": collections,
    }
    if details:
        report["details"] = {
            "notes_total_qdrant": len(notes),
            "notes_total_vault": len(wl_per_note),
            # Audited kinds whose actual count differs from the expectation.
            "mismatched_kinds": [kind for kind, delta in deltas.items() if delta != 0],
            # Edge kinds present in Qdrant that this audit has no expectation for.
            "unexpected_kinds": sorted((k for k in cnt_kind if k not in expected), key=str),
            "vault_unique_wikilinks_by_note": {
                nid: len(set(links)) for nid, links in wl_per_note.items()
            },
        }
    return report


def main():
    """Run the audit: load Qdrant data, compare with the vault, print JSON.

    Connection settings come from the environment (``QDRANT_URL``,
    ``QDRANT_API_KEY``) after loading a ``.env`` file; the collection prefix
    defaults to ``COLLECTION_PREFIX`` or ``mindnet``.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)")
    ap.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix")
    ap.add_argument("--details", action="store_true", help="Detail-Listen ausgeben")
    args = ap.parse_args()

    client = QdrantClient(
        url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.environ.get("QDRANT_API_KEY") or None,
    )
    notes_col, chunks_col, edges_col = _names(args.prefix)

    # Load the actual state from Qdrant.
    notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False)
    chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False)
    edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False)

    # Expected values are derived from the vault on disk.
    wl_per_note = _wikilinks_per_note(args.vault)

    result = _build_report(
        notes,
        chunks,
        edges,
        wl_per_note,
        {"notes": notes_col, "chunks": chunks_col, "edges": edges_col},
        details=args.details,
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()