diff --git a/scripts/audit_edges_vs_expectations.py b/scripts/audit_edges_vs_expectations.py new file mode 100644 index 0000000..389ccea --- /dev/null +++ b/scripts/audit_edges_vs_expectations.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Script: audit_edges_vs_expectations.py — Prüfe Kanten in Qdrant gegen Vault-Erwartungen +Version: 1.0.0 +Datum: 2025-09-09 + +Zweck +----- +- Liest Edges/Chunks/Notes aus Qdrant. +- Ermittelt erwartete Kanten-Anzahlen aus dem Vault: + * belongs_to : sollte == #Chunks + * next / prev : je Note (#Chunks_in_Note - 1) + * references : Summe aller Chunk-Wikilinks + * backlink : Summe einzigartiger Wikilinks pro Note (Note-Level) +- Vergleicht IST vs. SOLL und meldet Abweichungen. + +ENV/Qdrant +---------- +QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX (Default: mindnet) + +Aufrufe +------- + # Gesamtaudit + python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault + + # Mit anderem Prefix + python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --prefix mindnet_dev + + # Details anzeigen + python3 -m scripts.audit_edges_vs_expectations --vault ./test_vault --details +""" + +from __future__ import annotations +import argparse +import json +import os +import re +from collections import defaultdict, Counter +from typing import Dict, List, Tuple + +from qdrant_client import QdrantClient +from qdrant_client.http import models as rest +from dotenv import load_dotenv + +# Projektmodule – nur leichtgewichtige Funktionen +try: + from app.core.parser import read_markdown +except Exception: + # sehr einfacher Fallback für Wikilinks + read_markdown = None + +WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") + +# ------------------------------ +# Qdrant Helpers +# ------------------------------ + +def _names(prefix: str) -> Tuple[str, str, str]: + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + +def _scroll_all(client: QdrantClient, col: str, flt=None, with_payload=True, with_vectors=False, limit=256): + out, next_page = [], None + while True: + pts, next_page = client.scroll( + collection_name=col, + scroll_filter=flt, + with_payload=with_payload, + with_vectors=with_vectors, + limit=limit, + offset=next_page, + ) + if not pts: + break + out.extend(pts) + if not next_page: + break + return out + +# ------------------------------ +# Vault scan +# ------------------------------ + +def _iter_md(root: str) -> List[str]: + out: List[str] = [] + for dp, _, fns in os.walk(root): + for fn in fns: + if fn.lower().endswith(".md"): + p = os.path.join(dp, fn) + if "/.obsidian/" in p.replace("\\", "/"): + continue + out.append(p) + return sorted(out) + +def _wikilinks_in_text(text: str) -> List[str]: + return WIKILINK_RE.findall(text or "") + +def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]: + res: Dict[str, List[str]] = {} + for p in _iter_md(vault_root): + body = "" + try: + if read_markdown: + parsed = read_markdown(p) + body = parsed.body or "" + fm = parsed.frontmatter or {} + nid = fm.get("id") or fm.get("note_id") or os.path.splitext(os.path.basename(p))[0] + else: + with open(p, "r", encoding="utf-8") as f: + txt = f.read() + # sehr einfacher Split: YAML-Frontmatter rausnehmen + if txt.lstrip().startswith("---"): + parts = txt.split("\n---", 1) + body = parts[1] if len(parts) > 1 else txt + else: + body = txt + nid = os.path.splitext(os.path.basename(p))[0] + res[nid] = _wikilinks_in_text(body) + except Exception: + continue + return res + +# ------------------------------ +# Main Audit +# ------------------------------ + +def main(): + load_dotenv() + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)") + ap.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix") + ap.add_argument("--details", action="store_true", help="Detail-Listen ausgeben") + args = ap.parse_args() + + client = QdrantClient(url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"), + api_key=os.environ.get("QDRANT_API_KEY") or None) + notes_col, chunks_col, edges_col = _names(args.prefix) + + # Qdrant laden + notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False) + chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False) + edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False) + + # --- Ist-Zähler + cnt_kind = Counter() + cnt_scope = Counter() + by_note_chunks: Dict[str, int] = defaultdict(int) + chunk_wikilinks_total = 0 + + for p in chunks: + pl = p.payload or {} + by_note_chunks[pl.get("note_id")] += 1 + wl = pl.get("wikilinks") or [] + if isinstance(wl, list): + chunk_wikilinks_total += len(wl) + + for p in edges: + pl = p.payload or {} + kind = pl.get("kind") or pl.get("edge_type") or "?" + scope = pl.get("scope") or "?" + cnt_kind[kind] += 1 + cnt_scope[f"{kind}:{scope}"] += 1 + + total_chunks = sum(by_note_chunks.values()) + + # --- Soll-Zähler aus Vault + wl_per_note = _wikilinks_per_note(args.vault) + backlink_expected = sum(len(set(v)) for v in wl_per_note.values()) + + next_expected = sum(max(c - 1, 0) for c in by_note_chunks.values()) + prev_expected = next_expected # symmetrische Kanten + + belongs_to_expected = total_chunks + references_expected = chunk_wikilinks_total # aus Chunk-Payloads + + # --- Ergebnis + result = { + "qdrant_counts": dict(cnt_kind), + "qdrant_counts_by_scope": dict(cnt_scope), + "chunks_total": total_chunks, + "by_note_chunks": dict(by_note_chunks), + "vault_expected": { + "belongs_to": belongs_to_expected, + "next": next_expected, + "prev": prev_expected, + "references": references_expected, + "backlink": backlink_expected, + }, + "deltas": { + "belongs_to": cnt_kind.get("belongs_to", 0) - belongs_to_expected, + "next": cnt_kind.get("next", 0) - next_expected, + "prev": cnt_kind.get("prev", 0) - prev_expected, + "references": cnt_kind.get("references", 0) - references_expected, + "backlink": cnt_kind.get("backlink", 0) - backlink_expected, + }, + "collections": { + "notes": notes_col, "chunks": chunks_col, "edges": edges_col + } + } + + print(json.dumps(result, ensure_ascii=False, indent=2)) + + if args.details: + # optionale Stichproben (z. B. fehlerhafte Kantenarten) + pass + +if __name__ == "__main__": + main()