#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Provenance: recovered from git patch 589d021744d9ff612d143fdb2caa1bf86560281a
# (author: Lars, 2025-09-30), subject "tests/verify_chunks_integrity.py hinzugefügt"
# (i.e. "tests/verify_chunks_integrity.py added").
"""
Script: tests/verify_chunks_integrity.py
Version: 1.0.0
Date: 2025-09-10

Purpose
-------
Verifies the text integrity of the stored chunks:
  1) Reconstructs the body from the chunks (sort order: seq → chunk_index → # in chunk_id).
  2) Compares it with the note `fulltext` stored in Qdrant (if present).
  3) Optional: additionally compares it with the body of the matching Markdown file in the vault.

Output
------
- One JSON line per note with:
    note_id, path, chunks_count, chunks_with_text, match_fulltext, match_vault (optional), issues
- Finally a summary with counts; exit code 1 on mismatches (only if --fail-on-mismatch is set).

Usage
-----
  # Qdrant only (fulltext vs. chunks)
  python3 tests/verify_chunks_integrity.py --prefix mindnet

  # Additionally compare against the vault (body of the .md file)
  python3 tests/verify_chunks_integrity.py --prefix mindnet --vault ./test_vault

  # Whitespace-tolerant (default): trimming/normalisation enabled
  python3 tests/verify_chunks_integrity.py --prefix mindnet --vault ./test_vault

  # Strict (no trimming):
  python3 tests/verify_chunks_integrity.py --prefix mindnet --strict --fail-on-mismatch
"""
from __future__ import annotations

import argparse
import json
import os
from typing import Any, Dict, List, Optional, Tuple

import yaml
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client

# --------------------------- Helpers ---------------------------


def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names derived from ``prefix``."""
    return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"


def scroll_all(client, collection: str, flt: Optional[rest.Filter] = None, limit: int = 256) -> List[Any]:
    """Collect every point of ``collection`` by paging through ``client.scroll``.

    Args:
        client: Object exposing a ``scroll(...)`` method that returns a
            ``(points, next_offset)`` pair.
        collection: Name of the collection to read.
        flt: Optional filter passed through as ``scroll_filter``.
        limit: Page size requested per ``scroll`` call.

    Returns:
        All points returned across the pages (payload requested, vectors not).
    """
    out: List[Any] = []
    nextp = None
    while True:
        pts, nextp = client.scroll(
            collection_name=collection,
            with_payload=True,
            with_vectors=False,
            limit=limit,
            offset=nextp,
            scroll_filter=flt,
        )
        if not pts:
            break
        out.extend(pts)
        # A missing next offset means the last page has been consumed.
        if nextp is None:
            break
    return out


def _to_int(value: Any) -> int:
    """Coerce ``value`` to ``int``; missing or non-numeric values become 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0


def sort_key_for_chunk_payload(pl: Dict[str, Any]) -> Tuple[int, int, int]:
    """Build the sort key ``(seq, chunk_index, n)`` for one chunk payload.

    ``n`` is the integer after the last ``#`` in ``chunk_id``. Every component
    falls back to 0 when it is missing or not numeric, so a single malformed
    payload cannot abort sorting of the whole chunk list.
    """
    seq = _to_int(pl.get("seq"))
    chunk_index = _to_int(pl.get("chunk_index"))
    suffix = 0
    cid = pl.get("chunk_id") or ""
    if isinstance(cid, str) and "#" in cid:
        suffix = _to_int(cid.rsplit("#", 1)[-1])
    return (seq, chunk_index, suffix)


def reconstruct_from_chunks(chunks_points: List[Any]) -> Tuple[str, int, int]:
    """Return ``(text, total_chunks, chunks_with_text)``.

    Chunks are ordered with ``sort_key_for_chunk_payload``; the text of each
    chunk is taken from the first non-empty payload field among ``text``,
    ``content`` and ``raw``. The texts are joined with newlines and the result
    is stripped of leading/trailing whitespace.
    """
    chunks_sorted = sorted(chunks_points, key=lambda p: sort_key_for_chunk_payload(p.payload or {}))
    texts: List[str] = []
    have = 0
    for p in chunks_sorted:
        pl = p.payload or {}
        t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
        # Only non-empty strings count as chunk text; other value types are skipped.
        if isinstance(t, str) and t:
            have += 1
            texts.append(t)
    return ("\n".join(texts).strip(), len(chunks_sorted), have)


def normalize(s: Optional[str], strict: bool = False) -> str:
    """Normalise ``s`` for comparison.

    Line endings are always unified to ``\\n``. Unless ``strict`` is set, the
    text is additionally stripped at both ends and trailing whitespace is
    removed from every line. ``None`` is treated as the empty string.
    """
    if s is None:
        return ""
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    if strict:
        return s
    # Tolerant comparison: trim trailing whitespace, drop surplus blank lines at the edges.
    lines = [ln.rstrip() for ln in s.strip().split("\n")]
    return "\n".join(lines).strip()


def read_vault_body(vault_root: str, rel_path: str) -> Optional[str]:
    """Read the Markdown body (text after the front matter) of a vault file.

    Args:
        vault_root: Root directory of the vault.
        rel_path: Note path relative to ``vault_root``; backslashes and a
            leading slash are tolerated.

    Returns:
        The text after the front matter, the whole file content if it has no
        (terminated) front matter, or ``None`` if ``rel_path`` is empty or does
        not point to a regular file.
    """
    if not rel_path:
        return None
    p = os.path.join(vault_root, rel_path.replace("\\", "/").lstrip("/"))
    # isfile (not exists): a directory at this path must not reach open().
    if not os.path.isfile(p):
        return None
    with open(p, "r", encoding="utf-8") as f:
        s = f.read()
    if not s.startswith("---"):
        return s
    _, sep, body = s.partition("\n---\n")
    if sep:
        return body
    # Front matter closed by '---' at end of file without a trailing newline:
    # the note has front matter only, so its body is empty.
    if s.endswith("\n---"):
        return ""
    # Unterminated front matter: return the content unchanged.
    return s


# --------------------------- Main ------------------------------


def main() -> None:
    """Check every note of the notes collection and print one JSON line per note.

    Prints a final JSON summary and raises ``SystemExit(1)`` when
    ``--fail-on-mismatch`` is given and at least one note mismatched.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
    ap.add_argument("--vault", help="Optional: Vault-Wurzelordner für Abgleich gegen .md")
    ap.add_argument("--strict", action="store_true", help="Strikter Vergleich (kein Trimmen/Normalisieren)")
    ap.add_argument("--fail-on-mismatch", action="store_true", help="Exit 1, wenn ein Mismatch gefunden wurde")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)

    notes_col, chunks_col, _ = collections(cfg.prefix)
    notes = scroll_all(client, notes_col)

    mismatches = 0
    total = 0

    for n in notes:
        total += 1
        pl = n.payload or {}
        nid = pl.get("note_id")
        rel_path = (pl.get("path") or "").replace("\\", "/").lstrip("/")
        # NOTE: fulltext (here) and the reconstructed text (in
        # reconstruct_from_chunks) are stripped before normalize(), so even with
        # --strict their leading/trailing whitespace is ignored, while the vault
        # body is compared unstripped in strict mode.
        fulltext = (pl.get("fulltext") or "").strip()

        issues: List[str] = []
        if nid is None:
            # Without a note_id no chunk filter can be built; report the note
            # instead of aborting the run and treat it as having no chunks.
            issues.append("missing_note_id")
            chunks: List[Any] = []
        else:
            flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
            chunks = scroll_all(client, chunks_col, flt)
        recon, cnt, have = reconstruct_from_chunks(chunks)

        norm_recon = normalize(recon, strict=args.strict)
        norm_full = normalize(fulltext, strict=args.strict)

        # An empty fulltext never counts as a match.
        match_full = bool(norm_full) and (norm_recon == norm_full)
        match_vault = None

        if cnt == 0:
            issues.append("no_chunks")
        if have < cnt:
            issues.append(f"missing_text_in_chunks:{cnt-have}/{cnt}")
        if norm_full == "":
            issues.append("note_fulltext_empty")
        if not match_full:
            issues.append("reconstructed_vs_fulltext_mismatch")

        if args.vault:
            # A missing vault file is compared as an empty body.
            body = read_vault_body(args.vault, rel_path) or ""
            norm_body = normalize(body, strict=args.strict)
            match_vault = (norm_body == norm_recon)
            if not match_vault:
                issues.append("reconstructed_vs_vault_body_mismatch")

        obj = {
            "note_id": nid,
            "path": rel_path,
            "chunks_count": cnt,
            "chunks_with_text": have,
            "match_fulltext": match_full,
            "match_vault": match_vault,
            "issues": issues,
        }
        print(json.dumps(obj, ensure_ascii=False))
        # A note counts once, even if both comparisons failed.
        if ("reconstructed_vs_fulltext_mismatch" in issues) or ("reconstructed_vs_vault_body_mismatch" in issues):
            mismatches += 1

    summary = {"summary": "OK" if mismatches == 0 else "DIFFS", "mismatch_count": mismatches, "notes": total}
    print(json.dumps(summary, ensure_ascii=False))
    if args.fail_on_mismatch and mismatches:
        raise SystemExit(1)


if __name__ == "__main__":
    main()