diff --git a/tests/check_chunks_vs_text.py b/tests/check_chunks_vs_text.py new file mode 100644 index 0000000..11959a5 --- /dev/null +++ b/tests/check_chunks_vs_text.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +tests/check_chunks_window_vs_text.py +Prüft pro Note und global: + - ob window == text (sollte bei Overlap normalerweise NEIN sein) + - ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window) + - Größen & einfache Statistiken + +Nutzung: + python3 tests/check_chunks_window_vs_text.py --prefix mindnet [--vault ./test_vault] +""" +from __future__ import annotations +import os, sys, json, argparse +from typing import List, Dict, Any, Tuple, Optional + +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +from app.core.qdrant import QdrantConfig, get_client +from qdrant_client.http import models as rest + +def collections(prefix: str) -> Tuple[str, str, str]: + return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" + +def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]: + f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + out = [] + next_page = None + while True: + pts, next_page = client.scroll( + collection_name=chunks_col, + with_payload=True, + with_vectors=False, + limit=256, + offset=next_page, + scroll_filter=f, + ) + if not pts: + break + out.extend([p.payload or {} for p in pts]) + if next_page is None: + break + # sortieren: seq -> chunk_index -> chunk_id + def keyer(pl): + return (pl.get("seq", 1<<30), pl.get("chunk_index", 1<<30), str(pl.get("chunk_id", ""))) + return sorted(out, key=keyer) + +def list_note_ids(client, notes_col: str) -> List[str]: + out = [] + next_page = None + while True: + pts, next_page = client.scroll( + collection_name=notes_col, + with_payload=True, + with_vectors=False, + limit=256, + offset=next_page, + ) + if not pts: + break + for p in pts: + pl = p.payload or {} + nid = pl.get("note_id") + if isinstance(nid, str): + out.append(nid) + if next_page is None: + break + return sorted(set(out)) + +def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int: + """ + Länge des längsten Suffix von a_suffix, das Prefix von b_prefix ist (bruteforce, begrenzt). + """ + n = min(len(a_suffix), len(b_prefix), max_probe) + if n <= 0: + return 0 + a = a_suffix[-n:] + b = b_prefix[:n] + # steigere den Match von 0..n + for k in range(n, 0, -1): + if a[-k:] == b[:k]: + return k + return 0 + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--prefix", required=True) + ap.add_argument("--vault", help="optional, nur für Vergleichsausgaben") + args = ap.parse_args() + + cfg = QdrantConfig.from_env() + cfg.prefix = args.prefix + client = get_client(cfg) + + notes_col, chunks_col, _ = collections(cfg.prefix) + note_ids = list_note_ids(client, notes_col) + if not note_ids: + print(json.dumps({"error": "no notes found in qdrant for this prefix"})) + return + + global_total = 0 + global_ident = 0 + global_left_ov_sum = 0 + detail = [] + + for nid in note_ids: + chunks = scroll_chunks_by_note(client, chunks_col, nid) + if not chunks: + detail.append({"note_id": nid, "chunks": 0}) + continue + ident = 0 + left_ov_sum = 0 + for i, ch in enumerate(chunks): + txt = (ch.get("text") or "").replace("\r\n", "\n") + win = (ch.get("window") or "").replace("\r\n", "\n") + if txt == win: + ident += 1 + if i > 0: + prev_txt = (chunks[i-1].get("text") or "").replace("\r\n", "\n") + left_ov = common_overlap_len(prev_txt, win, max_probe=1024) + left_ov_sum += left_ov + global_total += len(chunks) + global_ident += ident + global_left_ov_sum += left_ov_sum + detail.append({ + "note_id": nid, + "chunks": len(chunks), + "identical_text_window": ident, + "identical_share": round(ident / max(1, len(chunks)), 3), + "avg_left_overlap_est": round(left_ov_sum / max(1, len(chunks)-1), 2) if len(chunks) > 1 else 0.0 + }) + + out = { + "notes": len(note_ids), + "chunks_total": global_total, + "identical_total": global_ident, + "identical_share_global": round(global_ident / max(1, global_total), 3), + "avg_left_overlap_est_global": round(global_left_ov_sum / max(1, global_total - len(note_ids)), 2) if global_total > len(note_ids) else 0.0, + "details": detail[:100] # capped + } + print(json.dumps(out, ensure_ascii=False, indent=2)) + +if __name__ == "__main__": + main()