tests/verify_chunks_integrity.py added
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
e58b9c8ada
commit
589d021744
197
tests/verify_chunks_integrity.py
Normal file
197
tests/verify_chunks_integrity.py
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: tests/verify_chunks_integrity.py
Version: 1.0.0
Date: 2025-09-10

Purpose
-------
Verifies the textual integrity of the stored chunks:
1) Reconstructs the note body from its chunks (sort order: seq → chunk_index → trailing # in chunk_id).
2) Compares it with the note `fulltext` stored in Qdrant (if present).
3) Optionally: also compares against the body of the matching Markdown file in the vault.

Output
------
- One JSON line per note with:
  note_id, chunks_count, chunks_with_text, match_fulltext, match_vault (optional), issues
- Finally a summary with counts; exit code 1 on errors (if --fail-on-mismatch is set).

Usage
-----
# Against Qdrant only (fulltext vs. chunks)
python3 tests/verify_chunks_integrity.py --prefix mindnet

# Additionally compare against the vault (body of the .md file)
python3 tests/verify_chunks_integrity.py --prefix mindnet --vault ./test_vault

# Whitespace-tolerant (default): trimming/normalization enabled
python3 tests/verify_chunks_integrity.py --prefix mindnet --vault ./test_vault

# Strict (no trimming):
python3 tests/verify_chunks_integrity.py --prefix mindnet --strict --fail-on-mismatch
"""
from __future__ import annotations

import argparse
import json
import os
from typing import Any, Dict, List, Optional, Tuple

# NOTE(review): `yaml` appears unused in this module (frontmatter is split
# manually in read_vault_body) — verify before removing.
import yaml
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client

# --------------------------- Helpers ---------------------------
||||
|
||||
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*."""
    return tuple(f"{prefix}_{kind}" for kind in ("notes", "chunks", "edges"))
|
||||
|
||||
def scroll_all(client, collection: str, flt: Optional[rest.Filter] = None, limit: int = 256) -> List[Any]:
    """Collect every point of *collection* via paged ``scroll`` calls.

    Pages through the collection (page size *limit*, optional filter *flt*)
    until the server returns an empty page or an exhausted cursor.
    """
    points: List[Any] = []
    cursor = None
    while True:
        page, cursor = client.scroll(
            collection_name=collection,
            with_payload=True,
            with_vectors=False,
            limit=limit,
            offset=cursor,
            scroll_filter=flt,
        )
        if not page:
            return points
        points.extend(page)
        if cursor is None:
            return points
|
||||
|
||||
def sort_key_for_chunk_payload(pl: Dict[str, Any]) -> Tuple[int, int, int]:
    """Sort key for a chunk payload: (seq, chunk_index, trailing number of chunk_id).

    Missing/falsy fields default to 0; an unparsable suffix after the last
    '#' in chunk_id also maps to 0.
    """
    seq = int(pl.get("seq") or 0)
    index = int(pl.get("chunk_index") or 0)
    chunk_id = pl.get("chunk_id") or ""
    tail = 0
    if isinstance(chunk_id, str) and "#" in chunk_id:
        suffix = chunk_id.rsplit("#", 1)[-1]
        try:
            tail = int(suffix)
        except Exception:
            tail = 0
    return (seq, index, tail)
|
||||
|
||||
def reconstruct_from_chunks(chunks_points: List[Any]) -> Tuple[str, int, int]:
    """Return (text, total_chunks, chunks_with_text) rebuilt from chunk points.

    Chunks are ordered by (seq, chunk_index, trailing '#<n>' of chunk_id);
    their texts are joined with newlines and the result is stripped.
    """
    def _key(point) -> Tuple[int, int, int]:
        # Same ordering as sort_key_for_chunk_payload, inlined here.
        pl = point.payload or {}
        tail = 0
        cid = pl.get("chunk_id") or ""
        if isinstance(cid, str) and "#" in cid:
            try:
                tail = int(cid.rsplit("#", 1)[-1])
            except Exception:
                tail = 0
        return (int(pl.get("seq") or 0), int(pl.get("chunk_index") or 0), tail)

    ordered = sorted(chunks_points, key=_key)
    parts: List[str] = []
    with_text = 0
    for point in ordered:
        pl = point.payload or {}
        text = pl.get("text") or pl.get("content") or pl.get("raw") or ""
        if isinstance(text, str) and text:
            with_text += 1
            parts.append(text)
    return ("\n".join(parts).strip(), len(ordered), with_text)
|
||||
|
||||
def normalize(s: str, strict: bool = False) -> str:
    """Normalize *s* for text comparison.

    Line endings are unified to '\\n'. In strict mode nothing else is
    changed; otherwise each line's trailing whitespace and the string's
    blank edges are stripped (whitespace-tolerant comparison).
    """
    if s is None:
        return ""
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    if strict:
        return unified
    trimmed = "\n".join(line.rstrip() for line in unified.strip().split("\n"))
    return trimmed.strip()
|
||||
|
||||
def read_vault_body(vault_root: str, rel_path: str) -> Optional[str]:
    """Read the Markdown body (without YAML frontmatter) of a vault note.

    Parameters
    ----------
    vault_root: root directory of the vault.
    rel_path:   note path relative to the vault root (backslashes tolerated).

    Returns the body text after the frontmatter block, the full file text
    when the frontmatter has no closing delimiter, or None when *rel_path*
    is empty or does not point to an existing regular file.
    """
    if not rel_path:
        return None
    p = os.path.join(vault_root, rel_path.replace("\\", "/").lstrip("/"))
    # isfile (not exists): a directory at this path must not crash open().
    if not os.path.isfile(p):
        return None
    with open(p, "r", encoding="utf-8") as f:
        s = f.read()
    if s.startswith("---"):
        try:
            # Split off the YAML frontmatter at the first closing delimiter.
            fm_txt, body = s.split("\n---\n", 1)
        except ValueError:
            # No closing '---': treat the whole file as body.
            return s
    else:
        body = s
    return body
|
||||
|
||||
# --------------------------- Main ------------------------------
|
||||
|
||||
def main():
    """CLI entry point: verify chunk/text integrity for every note.

    Emits one JSON line per note and a final summary line on stdout;
    raises SystemExit(1) when --fail-on-mismatch is set and at least one
    mismatch was found.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
    ap.add_argument("--vault", help="Optional: Vault-Wurzelordner für Abgleich gegen .md")
    ap.add_argument("--strict", action="store_true", help="Strikter Vergleich (kein Trimmen/Normalisieren)")
    ap.add_argument("--fail-on-mismatch", action="store_true", help="Exit 1, wenn ein Mismatch gefunden wurde")
    args = ap.parse_args()

    # Config from environment; --prefix overrides COLLECTION_PREFIX.
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)

    notes_col, chunks_col, _ = collections(cfg.prefix)
    notes = scroll_all(client, notes_col)

    mismatches = 0
    total = 0

    for n in notes:
        total += 1
        pl = n.payload or {}
        nid = pl.get("note_id")
        # Normalize path separators so vault lookup works cross-platform.
        rel_path = (pl.get("path") or "").replace("\\", "/").lstrip("/")
        fulltext = (pl.get("fulltext") or "").strip()

        # Fetch all chunks belonging to this note and rebuild its body.
        flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
        chunks = scroll_all(client, chunks_col, flt)
        recon, cnt, have = reconstruct_from_chunks(chunks)

        norm_recon = normalize(recon, strict=args.strict)
        norm_full = normalize(fulltext, strict=args.strict)

        # An empty stored fulltext never counts as a match.
        match_full = (bool(norm_full) and (norm_recon == norm_full))
        match_vault = None

        issues: List[str] = []
        if cnt == 0:
            issues.append("no_chunks")
        if have < cnt:
            issues.append(f"missing_text_in_chunks:{cnt-have}/{cnt}")
        if norm_full == "":
            issues.append("note_fulltext_empty")
        if not match_full:
            issues.append("reconstructed_vs_fulltext_mismatch")

        # Optional second check against the Markdown body in the vault.
        if args.vault:
            body = read_vault_body(args.vault, rel_path) or ""
            norm_body = normalize(body, strict=args.strict)
            match_vault = (norm_body == norm_recon)
            if not match_vault:
                issues.append("reconstructed_vs_vault_body_mismatch")

        # One JSON result line per note.
        obj = {
            "note_id": nid,
            "path": rel_path,
            "chunks_count": cnt,
            "chunks_with_text": have,
            "match_fulltext": match_full,
            "match_vault": match_vault,
            "issues": issues,
        }
        print(json.dumps(obj, ensure_ascii=False))
        # Only reconstruction mismatches count towards the failure total.
        if ("reconstructed_vs_fulltext_mismatch" in issues) or ("reconstructed_vs_vault_body_mismatch" in issues):
            mismatches += 1

    summary = {"summary": "OK" if mismatches == 0 else "DIFFS", "mismatch_count": mismatches, "notes": total}
    print(json.dumps(summary, ensure_ascii=False))
    if args.fail_on_mismatch and mismatches:
        raise SystemExit(1)
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user