tests/check_chunks_window_vs_text.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s
This commit is contained in:
parent
d27324f8dd
commit
4fcf22131c
146
tests/check_chunks_window_vs_text.py
Normal file
146
tests/check_chunks_window_vs_text.py
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
tests/check_chunks_window_vs_text.py
|
||||
Prüft pro Note und global:
|
||||
- ob window == text (sollte bei Overlap normalerweise NEIN sein)
|
||||
- ungefähre linke Overlap-Länge (Suffix von prev.text vs. Prefix von cur.window)
|
||||
- Größen & einfache Statistiken
|
||||
|
||||
Nutzung:
|
||||
python3 tests/check_chunks_window_vs_text.py --prefix mindnet [--vault ./test_vault]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os, sys, json, argparse
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
# Make the repository root importable when this script is executed directly:
# tests/ sits one level below the project root, so go up one directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    # Prepend so the local package wins over any installed copy.
    sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) Qdrant collection names for *prefix*."""
    notes, chunks, edges = (f"{prefix}_{kind}" for kind in ("notes", "chunks", "edges"))
    return notes, chunks, edges
|
||||
|
||||
def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]:
    """Fetch all chunk payloads belonging to *note_id* from the chunks collection.

    Pages through Qdrant's scroll API (256 points per page, vectors omitted)
    and returns the payloads ordered by seq, then chunk_index, then chunk_id.
    """
    note_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict[str, Any]] = []
    page_offset = None
    while True:
        points, page_offset = client.scroll(
            collection_name=chunks_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=page_offset,
            scroll_filter=note_filter,
        )
        if not points:
            break
        payloads.extend(pt.payload or {} for pt in points)
        if page_offset is None:
            # Qdrant signals the final page with a None next-page offset.
            break

    # Sort key: seq -> chunk_index -> chunk_id; missing numeric keys sort last.
    def sort_key(payload):
        return (
            payload.get("seq", 1 << 30),
            payload.get("chunk_index", 1 << 30),
            str(payload.get("chunk_id", "")),
        )

    return sorted(payloads, key=sort_key)
|
||||
|
||||
def list_note_ids(client, notes_col: str) -> List[str]:
    """Collect the distinct string `note_id` values stored in the notes collection.

    Pages through the whole collection (256 points per page, vectors omitted);
    returns the ids de-duplicated and sorted.
    """
    ids: List[str] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not points:
            break
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            # Skip payloads without a usable string id.
            if isinstance(note_id, str):
                ids.append(note_id)
        if cursor is None:
            # Final page reached.
            break
    return sorted(set(ids))
|
||||
|
||||
def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int:
    """Return the length of the longest suffix of *a_suffix* that is also a
    prefix of *b_prefix*, probing at most *max_probe* characters.

    Brute force: tries the longest candidate first and shrinks until the
    compared pieces match, so the first hit is the maximum overlap.
    """
    limit = min(len(a_suffix), len(b_prefix), max_probe)
    if limit <= 0:
        return 0
    tail = a_suffix[-limit:]
    head = b_prefix[:limit]
    for candidate in range(limit, 0, -1):
        if tail[-candidate:] == head[:candidate]:
            return candidate
    return 0
|
||||
|
||||
def main():
    """CLI entry point: compare chunk `text` vs `window` payloads per note.

    Prints a JSON report with per-note and global statistics: how often
    text == window (should normally be rare when overlap is configured) and
    an estimate of the left overlap between neighbouring chunks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", required=True)
    parser.add_argument("--vault", help="optional, nur für Vergleichsausgaben")
    args = parser.parse_args()

    cfg = QdrantConfig.from_env()
    cfg.prefix = args.prefix
    client = get_client(cfg)

    notes_col, chunks_col, _ = collections(cfg.prefix)
    note_ids = list_note_ids(client, notes_col)
    if not note_ids:
        print(json.dumps({"error": "no notes found in qdrant for this prefix"}))
        return

    total_chunks = 0
    total_identical = 0
    total_left_overlap = 0
    per_note = []

    for note_id in note_ids:
        chunks = scroll_chunks_by_note(client, chunks_col, note_id)
        if not chunks:
            # Notes without chunks are reported but excluded from the stats.
            per_note.append({"note_id": note_id, "chunks": 0})
            continue

        identical = 0
        left_overlap_sum = 0
        for idx, chunk in enumerate(chunks):
            # Normalise line endings so CRLF/LF differences don't skew the comparison.
            text = (chunk.get("text") or "").replace("\r\n", "\n")
            window = (chunk.get("window") or "").replace("\r\n", "\n")
            if text == window:
                identical += 1
            if idx > 0:
                # Estimate left overlap: suffix of previous text vs prefix of this window.
                prev_text = (chunks[idx - 1].get("text") or "").replace("\r\n", "\n")
                left_overlap_sum += common_overlap_len(prev_text, window, max_probe=1024)

        total_chunks += len(chunks)
        total_identical += identical
        total_left_overlap += left_overlap_sum
        per_note.append({
            "note_id": note_id,
            "chunks": len(chunks),
            "identical_text_window": identical,
            "identical_share": round(identical / max(1, len(chunks)), 3),
            # Overlap is only defined between neighbours, hence len-1 pairs.
            "avg_left_overlap_est": round(left_overlap_sum / max(1, len(chunks) - 1), 2) if len(chunks) > 1 else 0.0,
        })

    report = {
        "notes": len(note_ids),
        "chunks_total": total_chunks,
        "identical_total": total_identical,
        "identical_share_global": round(total_identical / max(1, total_chunks), 3),
        # Global pair count = chunks minus one per note.
        "avg_left_overlap_est_global": round(total_left_overlap / max(1, total_chunks - len(note_ids)), 2) if total_chunks > len(note_ids) else 0.0,
        "details": per_note[:100],  # capped to keep the output readable
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this check as a standalone script.
    main()
|
||||
Loading…
Reference in New Issue
Block a user