All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
241 lines
7.5 KiB
Python
241 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
FILE: scripts/audit_edges_vs_expectations.py
|
||
VERSION: 2.1.0 (2025-12-15)
|
||
STATUS: Active
|
||
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
|
||
|
||
Zweck:
|
||
-------
|
||
Prüft Edge-Anzahlen in Qdrant gegen erwartete Werte aus dem Vault.
|
||
Validiert strukturelle Integrität des Graphen.
|
||
|
||
Funktionsweise:
|
||
---------------
|
||
1. Liest Edges/Chunks/Notes aus Qdrant
|
||
2. Ermittelt erwartete Edge-Anzahlen aus Vault:
|
||
- belongs_to: sollte == #Chunks pro Note
|
||
- next/prev: je Note (#Chunks - 1)
|
||
- references: Summe aller Chunk-Wikilinks
|
||
- backlink: Summe einzigartiger Wikilinks (Note-Level)
|
||
3. Vergleicht IST vs. SOLL
|
||
4. Meldet Abweichungen
|
||
|
||
Ergebnis-Interpretation:
|
||
------------------------
|
||
- Ausgabe: JSON mit Vergleichs-Ergebnissen
|
||
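* vault_expected: Erwartete Edge-Anzahlen (SOLL)
* qdrant_counts / qdrant_counts_by_scope: Tatsächliche Edge-Anzahlen (IST) je Kantenart bzw. je Kantenart:Scope
* deltas: Abweichungen je Kantenart (IST minus SOLL)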
|
||
* per_note: Details pro Note (mit --details)
|
||
- Exit-Code 0: Erfolgreich
|
||
|
||
Verwendung:
|
||
-----------
|
||
- Validierung nach Importen
|
||
- Debugging von Edge-Problemen
|
||
- Qualitätskontrolle
|
||
|
||
Hinweise:
|
||
---------
|
||
- Prüft strukturelle, nicht semantische Korrektheit
|
||
- Kann bei großen Vaults langsam sein
|
||
|
||
Aufruf:
|
||
-------
|
||
python3 -m scripts.audit_edges_vs_expectations --vault ./vault
|
||
python3 -m scripts.audit_edges_vs_expectations --vault ./vault --prefix mindnet_dev --details
|
||
|
||
Parameter:
|
||
----------
|
||
--vault PATH Pfad zum Vault-Verzeichnis (erforderlich)
|
||
--prefix TEXT Collection-Präfix (Default: mindnet)
|
||
--details Zeigt Details pro Note
|
||
|
||
Umgebungsvariablen:
|
||
-------------------
|
||
QDRANT_URL, QDRANT_API_KEY (optional), COLLECTION_PREFIX
|
||
|
||
Änderungen:
|
||
-----------
|
||
v2.1.0 (2025-12-15): Dokumentation aktualisiert
|
||
v1.0.0 (2025-09-09): Initial Release
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
import argparse
|
||
import json
|
||
import os
|
||
import re
|
||
from collections import defaultdict, Counter
|
||
from typing import Dict, List, Tuple
|
||
|
||
from qdrant_client import QdrantClient
|
||
from qdrant_client.http import models as rest
|
||
from dotenv import load_dotenv
|
||
|
||
# Project modules – lightweight functions only.
try:
    from app.core.parser import read_markdown
except Exception:
    # Parser not importable: leave read_markdown as None so that
    # _wikilinks_per_note uses its very simple built-in frontmatter split.
    read_markdown = None

# Captures the text between "[[" and "]]" (one or more characters other than "]").
WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||
|
||
# ------------------------------
|
||
# Qdrant Helpers
|
||
# ------------------------------
|
||
|
||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||
|
||
def _scroll_all(client: QdrantClient, col: str, flt=None, with_payload=True, with_vectors=False, limit=256):
|
||
out, next_page = [], None
|
||
while True:
|
||
pts, next_page = client.scroll(
|
||
collection_name=col,
|
||
scroll_filter=flt,
|
||
with_payload=with_payload,
|
||
with_vectors=with_vectors,
|
||
limit=limit,
|
||
offset=next_page,
|
||
)
|
||
if not pts:
|
||
break
|
||
out.extend(pts)
|
||
if not next_page:
|
||
break
|
||
return out
|
||
|
||
# ------------------------------
|
||
# Vault scan
|
||
# ------------------------------
|
||
|
||
def _iter_md(root: str) -> List[str]:
|
||
out: List[str] = []
|
||
for dp, _, fns in os.walk(root):
|
||
for fn in fns:
|
||
if fn.lower().endswith(".md"):
|
||
p = os.path.join(dp, fn)
|
||
if "/.obsidian/" in p.replace("\\", "/"):
|
||
continue
|
||
out.append(p)
|
||
return sorted(out)
|
||
|
||
def _wikilinks_in_text(text: str) -> List[str]:
    """Return the inner text of every ``[[...]]`` wikilink in ``text``.

    Matches are returned as captured by ``WIKILINK_RE``, in order of
    appearance and including duplicates. Empty or ``None`` input yields an
    empty list.
    """
    if not text:
        return []
    return [match.group(1) for match in WIKILINK_RE.finditer(text)]
|
||
|
||
def _wikilinks_per_note(vault_root: str) -> Dict[str, List[str]]:
    """Map every note in the vault to the wikilinks found in its body.

    When the project parser ``read_markdown`` is available, it supplies the
    body and the frontmatter; the note id is the frontmatter ``id``, then
    ``note_id``, then the file name without extension. Without the parser, a
    very simple frontmatter split is used and the note id is always the file
    name without extension.

    If two files resolve to the same note id, the one visited later (files
    are visited in sorted path order) replaces the earlier entry.

    Args:
        vault_root: Root directory of the vault.

    Returns:
        Dict of note id -> list of wikilink texts (duplicates kept).
        Files that cannot be read or parsed are skipped with a warning on
        stderr.
    """
    import sys  # local import: only needed for the skip warning

    res: Dict[str, List[str]] = {}
    for p in _iter_md(vault_root):
        try:
            if read_markdown:
                parsed = read_markdown(p)
                body = parsed.body or ""
                fm = parsed.frontmatter or {}
                nid = fm.get("id") or fm.get("note_id") or os.path.splitext(os.path.basename(p))[0]
            else:
                with open(p, "r", encoding="utf-8") as f:
                    txt = f.read()
                # Very simple split: drop the YAML frontmatter. The split runs
                # on the stripped text so that leading blank lines cannot make
                # the *opening* marker be mistaken for the closing one.
                stripped = txt.lstrip()
                if stripped.startswith("---"):
                    parts = stripped.split("\n---", 1)
                    # No closing marker found: treat the whole file as body.
                    body = parts[1] if len(parts) > 1 else txt
                else:
                    body = txt
                nid = os.path.splitext(os.path.basename(p))[0]
            res[nid] = _wikilinks_in_text(body)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the audit, but
            # it lowers the expected counts, so report it (stderr keeps the
            # JSON on stdout clean).
            print(f"WARN: skipping {p}: {exc!r}", file=sys.stderr)
            continue
    return res
|
||
|
||
# ------------------------------
|
||
# Main Audit
|
||
# ------------------------------
|
||
|
||
def _per_note_details(by_note_chunks: Dict[str, int], wl_per_note: Dict[str, List[str]]) -> Dict[str, dict]:
    """Build the ``per_note`` section: expected counts broken down by note id.

    Covers every note id seen in the chunk payloads plus every note id found
    in the vault scan.
    """
    note_ids = list(by_note_chunks)
    note_ids.extend(nid for nid in wl_per_note if nid not in by_note_chunks)

    details: Dict[str, dict] = {}
    for nid in note_ids:
        n_chunks = by_note_chunks.get(nid, 0)
        links = wl_per_note.get(nid, [])
        details[nid] = {
            "chunks": n_chunks,
            "belongs_to_expected": n_chunks,
            "next_expected": max(n_chunks - 1, 0),
            "prev_expected": max(n_chunks - 1, 0),
            "backlink_expected": len(set(links)),
            "in_vault": nid in wl_per_note,
        }
    return details


def main():
    """Compare the edge counts stored in Qdrant with the expected counts.

    Command line: ``--vault`` (required), ``--prefix`` (default: environment
    variable ``COLLECTION_PREFIX`` or ``mindnet``) and ``--details``.
    The Qdrant connection is taken from ``QDRANT_URL`` (default
    ``http://127.0.0.1:6333``) and the optional ``QDRANT_API_KEY``.

    Prints one JSON document to stdout with the actual counts per edge kind
    (``qdrant_counts``, ``qdrant_counts_by_scope``), the chunk totals, the
    expected counts (``vault_expected``), the differences actual minus
    expected (``deltas``) and the collection names. With ``--details`` the
    document additionally contains a ``per_note`` breakdown of the expected
    counts.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Vault (für Erwartungswerte)")
    ap.add_argument("--prefix", default=os.environ.get("COLLECTION_PREFIX", "mindnet"), help="Collection-Prefix")
    ap.add_argument("--details", action="store_true", help="Detail-Listen ausgeben")
    args = ap.parse_args()

    client = QdrantClient(
        url=os.environ.get("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.environ.get("QDRANT_API_KEY") or None,
    )
    notes_col, chunks_col, edges_col = _names(args.prefix)

    # Load everything from Qdrant.
    # NOTE(review): `notes` is not read by any counter below; the scroll is
    # kept so that behaviour (including the failure on a missing notes
    # collection) stays unchanged.
    notes = _scroll_all(client, notes_col, with_payload=True, with_vectors=False)
    chunks = _scroll_all(client, chunks_col, with_payload=True, with_vectors=False)
    edges = _scroll_all(client, edges_col, with_payload=True, with_vectors=False)

    # --- Actual counters
    cnt_kind = Counter()
    cnt_scope = Counter()
    by_note_chunks: Dict[str, int] = defaultdict(int)
    chunk_wikilinks_total = 0

    for point in chunks:
        pl = point.payload or {}
        by_note_chunks[pl.get("note_id")] += 1
        wl = pl.get("wikilinks") or []
        # Only list-valued "wikilinks" payloads are counted.
        if isinstance(wl, list):
            chunk_wikilinks_total += len(wl)

    for point in edges:
        pl = point.payload or {}
        # The edge kind is read from "kind", falling back to "edge_type".
        kind = pl.get("kind") or pl.get("edge_type") or "?"
        scope = pl.get("scope") or "?"
        cnt_kind[kind] += 1
        cnt_scope[f"{kind}:{scope}"] += 1

    total_chunks = sum(by_note_chunks.values())

    # --- Expected counters
    wl_per_note = _wikilinks_per_note(args.vault)
    # backlink: unique wikilinks per note, summed over all vault notes
    backlink_expected = sum(len(set(v)) for v in wl_per_note.values())

    # next/prev: one edge between each pair of consecutive chunks of a note
    next_expected = sum(max(c - 1, 0) for c in by_note_chunks.values())
    prev_expected = next_expected  # symmetric edges

    belongs_to_expected = total_chunks
    references_expected = chunk_wikilinks_total  # from the chunk payloads

    # --- Result
    result = {
        "qdrant_counts": dict(cnt_kind),
        "qdrant_counts_by_scope": dict(cnt_scope),
        "chunks_total": total_chunks,
        "by_note_chunks": dict(by_note_chunks),
        "vault_expected": {
            "belongs_to": belongs_to_expected,
            "next": next_expected,
            "prev": prev_expected,
            "references": references_expected,
            "backlink": backlink_expected,
        },
        "deltas": {
            "belongs_to": cnt_kind.get("belongs_to", 0) - belongs_to_expected,
            "next": cnt_kind.get("next", 0) - next_expected,
            "prev": cnt_kind.get("prev", 0) - prev_expected,
            "references": cnt_kind.get("references", 0) - references_expected,
            "backlink": cnt_kind.get("backlink", 0) - backlink_expected,
        },
        "collections": {
            "notes": notes_col, "chunks": chunks_col, "edges": edges_col
        },
    }

    if args.details:
        # Previously a no-op; the output without --details is unchanged.
        result["per_note"] = _per_note_details(by_note_chunks, wl_per_note)

    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||
|
||
# Run the audit only when this module is executed as a script
# (e.g. ``python3 -m scripts.audit_edges_vs_expectations``), not on import.
if __name__ == "__main__":
    main()
|