# mindnet/tests/check_chunks_vs_text.py
# Commit: Lars d27324f8dd — "tests/check_chunks_vs_text.py hinzugefügt" (file added)
# CI: Deploy mindnet to llm-node / deploy (push) — successful in 3s
# 2025-10-01 15:23:08 +02:00 — 147 lines, 4.8 KiB, Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/check_chunks_vs_text.py
Prüft pro Note und global:
- ob window == text (sollte bei Overlap normalerweise NEIN sein)
- ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window)
- Größen & einfache Statistiken
Nutzung:
python3 tests/check_chunks_vs_text.py --prefix mindnet [--vault ./test_vault]
"""
from __future__ import annotations
import os, sys, json, argparse
from typing import List, Dict, Any, Tuple, Optional
# Make the project root importable when this script is run directly
# (tests/ lives one level below the repository root).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from app.core.qdrant import QdrantConfig, get_client
from qdrant_client.http import models as rest
def collections(prefix: str) -> Tuple[str, str, str]:
    """Derive the (notes, chunks, edges) Qdrant collection names for *prefix*."""
    notes, chunks, edges = (f"{prefix}_{suffix}" for suffix in ("notes", "chunks", "edges"))
    return notes, chunks, edges
def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]:
    """Collect all chunk payloads belonging to one note, in stable order.

    Pages through the chunks collection with Qdrant's scroll API, filtered
    on ``note_id``, then sorts the payloads by (seq, chunk_index, chunk_id).
    """
    note_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict[str, Any]] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=chunks_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
            scroll_filter=note_filter,
        )
        if not points:
            break
        payloads.extend(point.payload or {} for point in points)
        if cursor is None:
            break

    # Payloads lacking seq/chunk_index sort last via the 1<<30 sentinel.
    def sort_key(payload):
        return (
            payload.get("seq", 1 << 30),
            payload.get("chunk_index", 1 << 30),
            str(payload.get("chunk_id", "")),
        )

    return sorted(payloads, key=sort_key)
def list_note_ids(client, notes_col: str) -> List[str]:
    """Return every distinct string ``note_id`` found in the notes collection, sorted.

    Non-string or missing ``note_id`` payload values are skipped.
    """
    seen = set()
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not points:
            break
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            if isinstance(note_id, str):
                seen.add(note_id)
        if cursor is None:
            break
    return sorted(seen)
def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int:
    """Length of the longest suffix of *a_suffix* that is also a prefix of *b_prefix*.

    Brute force, bounded: only the last/first *max_probe* characters of the
    two strings are probed. Returns 0 when there is no overlap.
    """
    limit = min(len(a_suffix), len(b_prefix), max_probe)
    if limit <= 0:
        return 0
    tail = a_suffix[-limit:]
    head = b_prefix[:limit]
    # Try the longest candidate first and shrink until something matches.
    length = limit
    while length > 0:
        if tail[-length:] == head[:length]:
            return length
        length -= 1
    return 0
def main():
    """Compare each chunk's ``text`` against its ``window`` per note and globally.

    Scans every note in Qdrant, counts chunks whose window equals their text,
    estimates left-hand overlap lengths between consecutive chunks, and
    prints a JSON summary to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", required=True)
    parser.add_argument("--vault", help="optional, nur für Vergleichsausgaben")
    args = parser.parse_args()

    cfg = QdrantConfig.from_env()
    cfg.prefix = args.prefix
    client = get_client(cfg)
    notes_col, chunks_col, _ = collections(cfg.prefix)

    note_ids = list_note_ids(client, notes_col)
    if not note_ids:
        print(json.dumps({"error": "no notes found in qdrant for this prefix"}))
        return

    total_chunks = 0
    total_identical = 0
    total_left_overlap = 0
    details = []
    for note_id in note_ids:
        chunks = scroll_chunks_by_note(client, chunks_col, note_id)
        if not chunks:
            details.append({"note_id": note_id, "chunks": 0})
            continue

        identical = 0
        left_overlap_sum = 0
        prev_text = None
        for chunk in chunks:
            # Normalize line endings so CRLF/LF differences don't mask matches.
            text = (chunk.get("text") or "").replace("\r\n", "\n")
            window = (chunk.get("window") or "").replace("\r\n", "\n")
            if text == window:
                identical += 1
            if prev_text is not None:
                left_overlap_sum += common_overlap_len(prev_text, window, max_probe=1024)
            prev_text = text

        total_chunks += len(chunks)
        total_identical += identical
        total_left_overlap += left_overlap_sum
        details.append({
            "note_id": note_id,
            "chunks": len(chunks),
            "identical_text_window": identical,
            "identical_share": round(identical / max(1, len(chunks)), 3),
            # n chunks yield n-1 adjacent pairs to average over.
            "avg_left_overlap_est": round(left_overlap_sum / max(1, len(chunks) - 1), 2) if len(chunks) > 1 else 0.0,
        })

    # Globally there are (chunks - notes) adjacent pairs across all notes.
    pair_count = total_chunks - len(note_ids)
    report = {
        "notes": len(note_ids),
        "chunks_total": total_chunks,
        "identical_total": total_identical,
        "identical_share_global": round(total_identical / max(1, total_chunks), 3),
        "avg_left_overlap_est_global": round(total_left_overlap / max(1, pair_count), 2) if total_chunks > len(note_ids) else 0.0,
        "details": details[:100]  # capped
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
# Script entry point when executed directly.
if __name__ == "__main__":
    main()