# mindnet/tests/check_chunks_vs_text.py
# Commit: Lars d27324f8dd — "tests/check_chunks_vs_text.py hinzugefügt" (file added)
# CI: Deploy mindnet to llm-node / deploy (push) — successful in 3s
# 2025-10-01 15:23:08 +02:00 — 147 lines, 4.8 KiB, Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/check_chunks_vs_text.py
Prüft pro Note und global:
- ob window == text (sollte bei Overlap normalerweise NEIN sein)
- ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window)
- Größen & einfache Statistiken
Nutzung:
python3 tests/check_chunks_vs_text.py --prefix mindnet [--vault ./test_vault]
"""
from __future__ import annotations
import os, sys, json, argparse
from typing import List, Dict, Any, Tuple, Optional
# Make the project root importable when this script is run directly
# (tests/ lives one level below the repository root).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from app.core.qdrant import QdrantConfig, get_client
from qdrant_client.http import models as rest
def collections(prefix: str) -> Tuple[str, str, str]:
    """Derive the (notes, chunks, edges) Qdrant collection names for *prefix*."""
    notes, chunks, edges = (f"{prefix}_{suffix}" for suffix in ("notes", "chunks", "edges"))
    return notes, chunks, edges
def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]:
    """Collect all chunk payloads belonging to one note, in stable order.

    Pages through the chunks collection with Qdrant's scroll API, filtered
    on ``note_id``, then sorts the payloads by (seq, chunk_index, chunk_id).
    """
    note_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict[str, Any]] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=chunks_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
            scroll_filter=note_filter,
        )
        if not points:
            break
        payloads.extend(point.payload or {} for point in points)
        if cursor is None:
            break

    # Payloads lacking seq/chunk_index sort last via the 1<<30 sentinel.
    def sort_key(payload):
        return (
            payload.get("seq", 1 << 30),
            payload.get("chunk_index", 1 << 30),
            str(payload.get("chunk_id", "")),
        )

    return sorted(payloads, key=sort_key)
def list_note_ids(client, notes_col: str) -> List[str]:
    """Return every distinct string ``note_id`` found in the notes collection, sorted.

    Non-string or missing ``note_id`` payload values are skipped.
    """
    seen = set()
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not points:
            break
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            if isinstance(note_id, str):
                seen.add(note_id)
        if cursor is None:
            break
    return sorted(seen)
def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int:
    """Length of the longest suffix of *a_suffix* that is also a prefix of *b_prefix*.

    Brute force, bounded: only the last/first *max_probe* characters of the
    two strings are probed. Returns 0 when there is no overlap.
    """
    limit = min(len(a_suffix), len(b_prefix), max_probe)
    if limit <= 0:
        return 0
    tail = a_suffix[-limit:]
    head = b_prefix[:limit]
    # Try the longest candidate first and shrink until something matches.
    length = limit
    while length > 0:
        if tail[-length:] == head[:length]:
            return length
        length -= 1
    return 0
def main():
    """Compare each chunk's ``text`` against its ``window`` per note and globally.

    Scans every note in Qdrant, counts chunks whose window equals their text,
    estimates left-hand overlap lengths between consecutive chunks, and
    prints a JSON summary to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", required=True)
    parser.add_argument("--vault", help="optional, nur für Vergleichsausgaben")
    args = parser.parse_args()

    cfg = QdrantConfig.from_env()
    cfg.prefix = args.prefix
    client = get_client(cfg)
    notes_col, chunks_col, _ = collections(cfg.prefix)

    note_ids = list_note_ids(client, notes_col)
    if not note_ids:
        print(json.dumps({"error": "no notes found in qdrant for this prefix"}))
        return

    total_chunks = 0
    total_identical = 0
    total_left_overlap = 0
    details = []
    for note_id in note_ids:
        chunks = scroll_chunks_by_note(client, chunks_col, note_id)
        if not chunks:
            details.append({"note_id": note_id, "chunks": 0})
            continue

        identical = 0
        left_overlap_sum = 0
        prev_text = None
        for chunk in chunks:
            # Normalize line endings so CRLF/LF differences don't mask matches.
            text = (chunk.get("text") or "").replace("\r\n", "\n")
            window = (chunk.get("window") or "").replace("\r\n", "\n")
            if text == window:
                identical += 1
            if prev_text is not None:
                left_overlap_sum += common_overlap_len(prev_text, window, max_probe=1024)
            prev_text = text

        total_chunks += len(chunks)
        total_identical += identical
        total_left_overlap += left_overlap_sum
        details.append({
            "note_id": note_id,
            "chunks": len(chunks),
            "identical_text_window": identical,
            "identical_share": round(identical / max(1, len(chunks)), 3),
            # n chunks yield n-1 adjacent pairs to average over.
            "avg_left_overlap_est": round(left_overlap_sum / max(1, len(chunks) - 1), 2) if len(chunks) > 1 else 0.0,
        })

    # Globally there are (chunks - notes) adjacent pairs across all notes.
    pair_count = total_chunks - len(note_ids)
    report = {
        "notes": len(note_ids),
        "chunks_total": total_chunks,
        "identical_total": total_identical,
        "identical_share_global": round(total_identical / max(1, total_chunks), 3),
        "avg_left_overlap_est_global": round(total_left_overlap / max(1, pair_count), 2) if total_chunks > len(note_ids) else 0.0,
        "details": details[:100]  # capped
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
# Script entry point when executed directly.
if __name__ == "__main__":
    main()