#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
tests/check_chunks_window_vs_text.py

Checks per note and globally:

- whether window == text (with overlap this should normally be NO)
- the approximate left overlap length (suffix of prev.text vs. prefix of cur.window)
- sizes & simple statistics

Usage:
    python3 tests/check_chunks_window_vs_text.py --prefix mindnet [--vault ./test_vault]
"""
|
|
from __future__ import annotations

import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional, Tuple

# Make the project root importable when this script is run from tests/.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
|
|
|
|
from app.core.qdrant import QdrantConfig, get_client
|
|
from qdrant_client.http import models as rest
|
|
|
|
def collections(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*."""
    notes, chunks, edges = (f"{prefix}_{kind}" for kind in ("notes", "chunks", "edges"))
    return notes, chunks, edges
|
|
|
|
def scroll_chunks_by_note(client, chunks_col: str, note_id: str) -> List[Dict[str, Any]]:
    """Fetch every chunk payload belonging to *note_id*, in reading order.

    Pages through the collection with Qdrant's scroll API (256 points per
    page) and sorts the payloads by seq, then chunk_index, then chunk_id;
    payloads missing a key sort last via the 1<<30 sentinel.
    """
    note_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    payloads: List[Dict[str, Any]] = []
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=chunks_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
            scroll_filter=note_filter,
        )
        if not points:
            break
        payloads.extend(point.payload or {} for point in points)
        if cursor is None:
            break

    def sort_key(pl):
        # seq -> chunk_index -> chunk_id; unknown fields go last.
        return (pl.get("seq", 1 << 30), pl.get("chunk_index", 1 << 30), str(pl.get("chunk_id", "")))

    return sorted(payloads, key=sort_key)
|
|
|
|
def list_note_ids(client, notes_col: str) -> List[str]:
    """Return the sorted, de-duplicated note_id strings stored in *notes_col*."""
    seen = set()
    cursor = None
    while True:
        points, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not points:
            break
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            # Only accept proper string ids; anything else is skipped.
            if isinstance(note_id, str):
                seen.add(note_id)
        if cursor is None:
            break
    return sorted(seen)
|
|
|
|
def common_overlap_len(a_suffix: str, b_prefix: str, max_probe: int = 256) -> int:
    """Length of the longest suffix of *a_suffix* that is a prefix of *b_prefix*.

    Brute force, with the probe window capped at *max_probe* characters taken
    from the end of *a_suffix* and the start of *b_prefix*.
    """
    limit = min(len(a_suffix), len(b_prefix), max_probe)
    if limit <= 0:
        return 0
    tail = a_suffix[-limit:]
    head = b_prefix[:limit]
    # Try the longest candidate first and shrink until the ends line up.
    for k in range(limit, 0, -1):
        if tail[-k:] == head[:k]:
            return k
    return 0
|
|
|
|
def main():
    """CLI entry point: print window-vs-text overlap statistics as JSON.

    Reports, per note and globally, how many chunks have window == text and
    an estimated average left-overlap between consecutive chunks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", required=True)
    parser.add_argument("--vault", help="optional, nur für Vergleichsausgaben")
    args = parser.parse_args()

    cfg = QdrantConfig.from_env()
    cfg.prefix = args.prefix
    client = get_client(cfg)

    notes_col, chunks_col, _ = collections(cfg.prefix)
    note_ids = list_note_ids(client, notes_col)
    if not note_ids:
        print(json.dumps({"error": "no notes found in qdrant for this prefix"}))
        return

    total_chunks = 0
    total_identical = 0
    total_left_overlap = 0
    per_note = []

    for note_id in note_ids:
        chunks = scroll_chunks_by_note(client, chunks_col, note_id)
        if not chunks:
            per_note.append({"note_id": note_id, "chunks": 0})
            continue

        identical = 0
        left_overlap = 0
        prev_text = None
        for payload in chunks:
            # Normalize line endings so CRLF/LF differences don't mask a match.
            text = (payload.get("text") or "").replace("\r\n", "\n")
            window = (payload.get("window") or "").replace("\r\n", "\n")
            if text == window:
                identical += 1
            if prev_text is not None:
                left_overlap += common_overlap_len(prev_text, window, max_probe=1024)
            prev_text = text

        n = len(chunks)
        total_chunks += n
        total_identical += identical
        total_left_overlap += left_overlap
        per_note.append({
            "note_id": note_id,
            "chunks": n,
            "identical_text_window": identical,
            "identical_share": round(identical / max(1, n), 3),
            # n-1 chunk pairs per note; notes with one chunk have no pairs.
            "avg_left_overlap_est": round(left_overlap / max(1, n - 1), 2) if n > 1 else 0.0,
        })

    report = {
        "notes": len(note_ids),
        "chunks_total": total_chunks,
        "identical_total": total_identical,
        "identical_share_global": round(total_identical / max(1, total_chunks), 3),
        # Global pair count: one pair fewer than chunks for every note.
        "avg_left_overlap_est_global": round(total_left_overlap / max(1, total_chunks - len(note_ids)), 2) if total_chunks > len(note_ids) else 0.0,
        "details": per_note[:100]  # capped
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
|
|
|
|
if __name__ == "__main__":
    # Run the report only when invoked as a script, not on import.
    main()
|