scripts/audit_vault_vs_qdrant.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
cb30dbb23c
commit
41d43c2bb6
201
scripts/audit_vault_vs_qdrant.py
Normal file
201
scripts/audit_vault_vs_qdrant.py
Normal file
|
|
@ -0,0 +1,201 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Name: scripts/audit_vault_vs_qdrant.py
|
||||||
|
Version: v1.0.0 (2025-09-05)
|
||||||
|
Kurzbeschreibung:
|
||||||
|
Prüft die Konsistenz zwischen Obsidian-Vault und Qdrant:
|
||||||
|
- Zählt Markdown-Dateien mit gültiger Frontmatter (title, id, type, status, created).
|
||||||
|
- Zählt Wikilink-Vorkommen im Vault (regex wie in derive_edges.py).
|
||||||
|
- Liest Zählungen aus Qdrant (Notes/Chunks/Edges je kind).
|
||||||
|
- Vergleicht erwartete Wikilink-Anzahl (Vault) vs. tatsächlich importierte Edges (Qdrant).
|
||||||
|
- Listet Auffälligkeiten pro Note (z. B. Wikilinks im Vault, aber keine references in Qdrant).
|
||||||
|
|
||||||
|
Aufruf (aus Projekt-Root, im venv):
|
||||||
|
python3 -m scripts.audit_vault_vs_qdrant --vault ./vault --prefix mindnet
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
--vault Pfad zum Vault (z. B. ./vault)
|
||||||
|
--prefix Collection-Prefix in Qdrant (Default: mindnet)
|
||||||
|
--limit Max. Punkte pro Scroll-Seite aus Qdrant (Default: 1000)
|
||||||
|
|
||||||
|
Voraussetzungen:
|
||||||
|
- Aktives Python venv mit installiertem qdrant-client.
|
||||||
|
- Zugriff auf Qdrant per ENV (QDRANT_URL, QDRANT_API_KEY optional).
|
||||||
|
|
||||||
|
Hinweise:
|
||||||
|
- Der Wikilink-Regex entspricht dem in app/core/derive_edges.py verwendeten Muster. (Quelle: derive_edges.py) # :contentReference[oaicite:3]{index=3}
|
||||||
|
- Pflicht-Frontmatter wird wie in app/core/parser.py geprüft. (Quelle: parser.py) # :contentReference[oaicite:4]{index=4}
|
||||||
|
- Collection-Namen & 1D-Edge-Vektoren folgen app/core/qdrant.py / qdrant_points.py. (Quellen: qdrant.py, qdrant_points.py) #
|
||||||
|
|
||||||
|
Changelog:
|
||||||
|
v1.0.0: Erste Version.
|
||||||
|
|
||||||
|
Autor:
|
||||||
|
mindnet – Datenimporte & Sync
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse, os, glob, re, json
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from typing import Dict, List, Tuple, Optional
|
||||||
|
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
|
# --- Wikilink pattern, identical to the one used in app/core/derive_edges.py
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")

# --- Mandatory frontmatter keys, mirroring app/core/parser.validate_required_frontmatter
REQUIRED = ("title","id","type","status","created")


def has_required_frontmatter(front: Dict) -> bool:
    """Return True if *front* carries every required key with a non-empty value."""
    for key in REQUIRED:
        if front.get(key, "") in (None, ""):
            return False
    return True
||||||
|
|
||||||
|
def read_front_and_body(path: str) -> Tuple[Dict, str]:
    """Read a markdown file and split it into (frontmatter dict, body text).

    Minimal re-implementation of parser.read_markdown: strips a leading BOM,
    normalizes to NFKC and LF line endings, then parses an optional leading
    ``---`` YAML frontmatter block. Malformed or non-mapping frontmatter
    degrades to an empty dict instead of raising.
    """
    import yaml, unicodedata

    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read().lstrip("\ufeff")
    text = unicodedata.normalize("NFKC", text).replace("\r\n", "\n").replace("\r", "\n")

    # No opening delimiter -> the whole file is body.
    if not text.startswith("---\n"):
        return {}, text
    close = text.find("\n---", 4)
    if close == -1:
        return {}, text

    yaml_part = text[4:close].strip()
    body_part = text[close + 4:].lstrip("\n")
    try:
        parsed = yaml.safe_load(yaml_part) or {}
    except Exception:
        parsed = {}
    if not isinstance(parsed, dict):
        parsed = {}
    return parsed, body_part
||||||
|
|
||||||
|
def slug_file(path: str) -> str:
    """Derive an ASCII slug from a file's basename (mirrors the importer's slugging)."""
    import unicodedata

    name = os.path.basename(path)
    if name.endswith(".md"):
        name = name[:-3]
    # Strip diacritics: decompose (NFKD), then drop the combining marks.
    decomposed = unicodedata.normalize("NFKD", name)
    ascii_ish = "".join(c for c in decomposed if not unicodedata.combining(c))
    slug = re.sub(r"[^a-z0-9\-]+", "", ascii_ish.lower().replace(" ", "-"))
    return re.sub(r"-{2,}", "-", slug).strip("-")
||||||
|
|
||||||
|
def collect_vault_stats(vault_root: str) -> Tuple[List[Dict], int]:
    """Scan the vault and collect per-note statistics.

    Walks all ``*.md`` files under *vault_root* (skipping Obsidian metadata,
    frontmatter backups and import staging folders), keeps only notes with
    the required frontmatter, and counts wikilink occurrences per note.

    Args:
        vault_root: Path to the vault root directory.

    Returns:
        ``(notes, wikilink_total)`` where *notes* is a list of dicts with
        note_id/title/path/wikilink_count/file_slug and *wikilink_total* is
        the number of wikilink occurrences across all accepted notes.
    """
    # glob already returns a list; copying it via a comprehension was redundant.
    files = glob.glob(os.path.join(vault_root, "**", "*.md"), recursive=True)
    notes: List[Dict] = []
    wikilink_total = 0
    # Folders that are never part of the importable vault content.
    excluded = ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")
    for p in files:
        pn = p.replace("\\", "/")  # normalize Windows separators for the checks below
        if any(ex in pn for ex in excluded):
            continue
        fm, body = read_front_and_body(p)
        if not has_required_frontmatter(fm):
            # Notes without valid frontmatter are never imported, so skip them here too.
            continue
        nid = fm.get("id")
        # Fallback mirrors the importer: basename without its extension.
        title = fm.get("title") or os.path.basename(p).rsplit(".", 1)[0]
        relpath = os.path.relpath(p, vault_root).replace("\\", "/")
        links = list(WIKILINK_RE.finditer(body))
        wikilink_total += len(links)
        notes.append({
            "note_id": nid,
            "title": title,
            "path": relpath,
            "wikilink_count": len(links),
            "file_slug": slug_file(p),
        })
    return notes, wikilink_total
||||||
|
|
||||||
|
def qdrant_client_from_env() -> QdrantClient:
    """Build a QdrantClient from environment variables.

    Reads ``QDRANT_URL`` (defaults to a local instance at
    http://127.0.0.1:6333) and ``QDRANT_API_KEY`` (an empty value is
    treated as "no key").
    """
    # NOTE: the redundant function-level `import os` was removed;
    # `os` is already imported at module level.
    url = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
    api_key = os.getenv("QDRANT_API_KEY") or None  # "" -> None
    return QdrantClient(url=url, api_key=api_key)
||||||
|
|
||||||
|
def scroll_all(client: QdrantClient, collection: str, with_payload=True, limit=1000):
    """Yield every point in *collection*, paging through Qdrant's scroll API.

    Requests *limit* points per page and follows the returned offset cursor
    until Qdrant signals the end with a ``None`` offset. Vectors are never
    fetched; payloads only when *with_payload* is true.
    """
    offset = None
    while True:
        page, offset = client.scroll(
            collection_name=collection,
            with_payload=with_payload,
            with_vectors=False,
            limit=limit,
            offset=offset,
        )
        yield from page
        if offset is None:
            break
||||||
|
|
||||||
|
def collect_qdrant_stats(prefix: str, limit: int = 1000) -> Dict:
    """Collect point counts and edge statistics from Qdrant.

    Scrolls the ``<prefix>_notes`` / ``_chunks`` / ``_edges`` collections
    and returns a dict with:
      - ``counts``: point count per collection,
      - ``kinds``: edge totals per ``kind`` payload value,
      - ``unresolved``: edges with ``status == "unresolved"`` per kind,
      - ``per_note_refs``: per-source-note count of ``references`` edges
        (later compared against wikilink counts from the vault).

    Args:
        prefix: Qdrant collection prefix (collections are f"{prefix}_notes" etc.).
        limit: Page size for each scroll request.
    """
    client = qdrant_client_from_env()
    cols = {
        "notes": f"{prefix}_notes",
        "chunks": f"{prefix}_chunks",
        "edges": f"{prefix}_edges",
    }

    counts = {}
    # Plain point counts; payloads are not needed just for counting.
    for key in ("notes", "chunks"):
        counts[key] = sum(1 for _ in scroll_all(client, cols[key], with_payload=False, limit=limit))

    # Single pass over the edges collection: the original scrolled it twice
    # (once to count, once for payload stats). One scroll yields the total,
    # the kind distribution, unresolved edges and per-note reference counts.
    kinds = Counter()
    unresolved = Counter()
    per_note_refs = defaultdict(int)
    edge_total = 0
    for point in scroll_all(client, cols["edges"], with_payload=True, limit=limit):
        edge_total += 1
        payload = point.payload or {}
        kind = payload.get("kind")
        if kind:
            kinds[kind] += 1
        if payload.get("status") == "unresolved":
            unresolved[kind] += 1
        # The per-note comparison only looks at full-text wikilink edges.
        if kind == "references":
            source = payload.get("source_id")
            if source:
                per_note_refs[source] += 1
    counts["edges"] = edge_total

    return {"collections": cols, "counts": counts, "kinds": kinds, "unresolved": unresolved, "per_note_refs": dict(per_note_refs)}
||||||
|
|
||||||
|
def main():
    """CLI entry point: compare vault statistics against Qdrant and print a JSON report."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--vault", required=True, help="Pfad zum Vault (z. B. ./vault)")
    arg_parser.add_argument("--prefix", default="mindnet", help="Qdrant Collection-Prefix")
    arg_parser.add_argument("--limit", type=int, default=1000, help="Scroll-Limit je Seite")
    args = arg_parser.parse_args()

    notes, wikilink_total = collect_vault_stats(args.vault)
    qstats = collect_qdrant_stats(args.prefix, args.limit)

    notes_by_id = {note["note_id"]: note for note in notes}

    # Per-note comparison: wikilinks found in the vault vs. "references"
    # edges actually present in Qdrant.
    deltas = []
    for nid, note in notes_by_id.items():
        expected = note["wikilink_count"]
        actual = qstats["per_note_refs"].get(nid, 0)
        if expected == actual:
            continue
        deltas.append({
            "note_id": nid,
            "title": note["title"],
            "path": note["path"],
            "wikilinks_in_vault": expected,
            "references_in_qdrant": actual,
            "delta": actual - expected,
        })

    report = {
        "vault": {
            "notes_with_required_frontmatter": len(notes),
            "wikilink_occurrences_total": wikilink_total,
        },
        "qdrant": {
            "collections": qstats["collections"],
            "counts": qstats["counts"],
            "edge_kinds": qstats["kinds"],
            "unresolved_by_kind": qstats["unresolved"],
        },
        "mismatch_notes": deltas[:50],  # cap the report at the first 50 mismatches
        "mismatch_total": len(deltas),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user