tests/check_chunks_window_vs_text.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
This commit is contained in:
parent
d27324f8dd
commit
4fcf22131c
146
tests/check_chunks_window_vs_text.py
Normal file
146
tests/check_chunks_window_vs_text.py
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
tests/check_chunks_window_vs_text.py
|
||||
Prüft pro Note und global:
|
||||
- ob window == text (sollte bei Overlap normalerweise NEIN sein)
|
||||
- ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window)
|
||||
- Größen & einfache Statistiken
|
||||
|
||||
Nutzung:
|
||||
python3 tests/check_chunks_window_vs_text.py --prefix mindnet [--vault ./test_vault]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os, sys, json, argparse
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
# Make the repository root importable when this script is executed directly:
# tests/ sits one level below the project root, so go up one directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    # Prepend so the local package wins over any installed copy.
    sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) Qdrant collection names for *prefix*."""
    notes, chunks, edges = (f"{prefix}_{kind}" for kind in ("notes", "chunks", "edges"))
    return notes, chunks, edges
|
||||
|
||||
def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]:
    """Fetch all chunk payloads belonging to *note_id* from the chunks collection.

    Pages through Qdrant's scroll API (256 points per page, vectors omitted)
    and returns the payloads ordered by seq, then chunk_index, then chunk_id.
    """
    note_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict[str, Any]] = []
    page_offset = None
    while True:
        points, page_offset = client.scroll(
            collection_name=chunks_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=page_offset,
            scroll_filter=note_filter,
        )
        if not points:
            break
        payloads.extend(pt.payload or {} for pt in points)
        if page_offset is None:
            # Qdrant signals the final page with a None next-page offset.
            break

    # Sort key: seq -> chunk_index -> chunk_id; missing numeric keys sort last.
    def sort_key(payload):
        return (
            payload.get("seq", 1 << 30),
            payload.get("chunk_index", 1 << 30),
            str(payload.get("chunk_id", "")),
        )

    return sorted(payloads, key=sort_key)
|
||||
|
||||
def list_note_ids(client, notes_col: str) -> List[str]:
    """Collect the distinct string `note_id` values stored in the notes collection.

    Pages through the whole collection (256 points per page, vectors omitted);
    returns the ids de-duplicated and sorted.
    """
    ids: List[str] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not points:
            break
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            # Skip payloads without a usable string id.
            if isinstance(note_id, str):
                ids.append(note_id)
        if cursor is None:
            # Final page reached.
            break
    return sorted(set(ids))
|
||||
|
||||
def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int:
    """Return the length of the longest suffix of *a_suffix* that is also a
    prefix of *b_prefix*, probing at most *max_probe* characters.

    Brute force: tries the longest candidate first and shrinks until the
    compared pieces match, so the first hit is the maximum overlap.
    """
    limit = min(len(a_suffix), len(b_prefix), max_probe)
    if limit <= 0:
        return 0
    tail = a_suffix[-limit:]
    head = b_prefix[:limit]
    for candidate in range(limit, 0, -1):
        if tail[-candidate:] == head[:candidate]:
            return candidate
    return 0
|
||||
|
||||
def main():
    """CLI entry point: compare chunk `text` vs `window` payloads per note.

    Prints a JSON report with per-note and global statistics: how often
    text == window (should normally be rare when overlap is configured) and
    an estimate of the left overlap between neighbouring chunks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", required=True)
    parser.add_argument("--vault", help="optional, nur für Vergleichsausgaben")
    args = parser.parse_args()

    cfg = QdrantConfig.from_env()
    cfg.prefix = args.prefix
    client = get_client(cfg)

    notes_col, chunks_col, _ = collections(cfg.prefix)
    note_ids = list_note_ids(client, notes_col)
    if not note_ids:
        print(json.dumps({"error": "no notes found in qdrant for this prefix"}))
        return

    total_chunks = 0
    total_identical = 0
    total_left_overlap = 0
    per_note = []

    for note_id in note_ids:
        chunks = scroll_chunks_by_note(client, chunks_col, note_id)
        if not chunks:
            # Notes without chunks are reported but excluded from the stats.
            per_note.append({"note_id": note_id, "chunks": 0})
            continue

        identical = 0
        left_overlap_sum = 0
        for idx, chunk in enumerate(chunks):
            # Normalise line endings so CRLF/LF differences don't skew the comparison.
            text = (chunk.get("text") or "").replace("\r\n", "\n")
            window = (chunk.get("window") or "").replace("\r\n", "\n")
            if text == window:
                identical += 1
            if idx > 0:
                # Estimate left overlap: suffix of previous text vs prefix of this window.
                prev_text = (chunks[idx - 1].get("text") or "").replace("\r\n", "\n")
                left_overlap_sum += common_overlap_len(prev_text, window, max_probe=1024)

        total_chunks += len(chunks)
        total_identical += identical
        total_left_overlap += left_overlap_sum
        per_note.append({
            "note_id": note_id,
            "chunks": len(chunks),
            "identical_text_window": identical,
            "identical_share": round(identical / max(1, len(chunks)), 3),
            # Overlap is only defined between neighbours, hence len-1 pairs.
            "avg_left_overlap_est": round(left_overlap_sum / max(1, len(chunks) - 1), 2) if len(chunks) > 1 else 0.0,
        })

    report = {
        "notes": len(note_ids),
        "chunks_total": total_chunks,
        "identical_total": total_identical,
        "identical_share_global": round(total_identical / max(1, total_chunks), 3),
        # Global pair count = chunks minus one per note.
        "avg_left_overlap_est_global": round(total_left_overlap / max(1, total_chunks - len(note_ids)), 2) if total_chunks > len(note_ids) else 0.0,
        "details": per_note[:100],  # capped to keep the output readable
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this check as a standalone script.
    main()
|
||||
Loading…
Reference in New Issue
Block a user