#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ tests/check_chunks_window_vs_text.py Prüft pro Note und global: - ob window == text (sollte bei Overlap normalerweise NEIN sein) - ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window) - Größen & einfache Statistiken Nutzung: python3 tests/check_chunks_window_vs_text.py --prefix mindnet [--vault ./test_vault] """ from __future__ import annotations import os, sys, json, argparse from typing import List, Dict, Any, Tuple, Optional PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) from app.core.qdrant import QdrantConfig, get_client from qdrant_client.http import models as rest def collections(prefix: str) -> Tuple[str, str, str]: return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]: f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) out = [] next_page = None while True: pts, next_page = client.scroll( collection_name=chunks_col, with_payload=True, with_vectors=False, limit=256, offset=next_page, scroll_filter=f, ) if not pts: break out.extend([p.payload or {} for p in pts]) if next_page is None: break # sortieren: seq -> chunk_index -> chunk_id def keyer(pl): return (pl.get("seq", 1<<30), pl.get("chunk_index", 1<<30), str(pl.get("chunk_id", ""))) return sorted(out, key=keyer) def list_note_ids(client, notes_col: str) -> List[str]: out = [] next_page = None while True: pts, next_page = client.scroll( collection_name=notes_col, with_payload=True, with_vectors=False, limit=256, offset=next_page, ) if not pts: break for p in pts: pl = p.payload or {} nid = pl.get("note_id") if isinstance(nid, str): out.append(nid) if next_page is None: break return sorted(set(out)) def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int: """ Länge des längsten Suffix von a_suffix, das Prefix von b_prefix ist (bruteforce, begrenzt). """ n = min(len(a_suffix), len(b_prefix), max_probe) if n <= 0: return 0 a = a_suffix[-n:] b = b_prefix[:n] # steigere den Match von 0..n for k in range(n, 0, -1): if a[-k:] == b[:k]: return k return 0 def main(): ap = argparse.ArgumentParser() ap.add_argument("--prefix", required=True) ap.add_argument("--vault", help="optional, nur für Vergleichsausgaben") args = ap.parse_args() cfg = QdrantConfig.from_env() cfg.prefix = args.prefix client = get_client(cfg) notes_col, chunks_col, _ = collections(cfg.prefix) note_ids = list_note_ids(client, notes_col) if not note_ids: print(json.dumps({"error": "no notes found in qdrant for this prefix"})) return global_total = 0 global_ident = 0 global_left_ov_sum = 0 detail = [] for nid in note_ids: chunks = scroll_chunks_by_note(client, chunks_col, nid) if not chunks: detail.append({"note_id": nid, "chunks": 0}) continue ident = 0 left_ov_sum = 0 for i, ch in enumerate(chunks): txt = (ch.get("text") or "").replace("\r\n", "\n") win = (ch.get("window") or "").replace("\r\n", "\n") if txt == win: ident += 1 if i > 0: prev_txt = (chunks[i-1].get("text") or "").replace("\r\n", "\n") left_ov = common_overlap_len(prev_txt, win, max_probe=1024) left_ov_sum += left_ov global_total += len(chunks) global_ident += ident global_left_ov_sum += left_ov_sum detail.append({ "note_id": nid, "chunks": len(chunks), "identical_text_window": ident, "identical_share": round(ident / max(1, len(chunks)), 3), "avg_left_overlap_est": round(left_ov_sum / max(1, len(chunks)-1), 2) if len(chunks) > 1 else 0.0 }) out = { "notes": len(note_ids), "chunks_total": global_total, "identical_total": global_ident, "identical_share_global": round(global_ident / max(1, global_total), 3), "avg_left_overlap_est_global": round(global_left_ov_sum / max(1, global_total - len(note_ids)), 2) if global_total > len(note_ids) else 0.0, "details": detail[:100] # capped } print(json.dumps(out, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()