mindnet/scripts/verify_chunk_texts.py
Lars e93bab6ea7
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
Fassadenauflösung unter app/core
2025-12-28 11:04:40 +01:00

203 lines
6.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/verify_chunk_texts.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Purpose:
--------
Verifies that chunk texts are stored correctly in Qdrant and that the
note body can be reconstructed from its chunks.

How it works:
-------------
1. Loads all notes from {prefix}_notes
2. For each note:
   - Loads the associated chunks
   - Checks that all chunks contain text (text/content/raw)
   - Reconstructs the body from the chunks (sorted by chunk_index/chunk_id)
   - Computes coverage: matched text segments / fulltext length
3. Validates against notes.payload.fulltext

Interpreting the result:
------------------------
- Output: JSON with validation results
  * summary: overall statistics
  * per_note: per-note details
  * coverage: average coverage
- Exit code 0: all checks passed
- Exit code 1: problems found

Usage:
------
- Validation after imports
- Diagnosing text-reconstruction problems
- Quality control

Notes:
------
- Coverage tolerance for overlaps (default: >= 0.90)
- Checked text fields: text (preferred), content, raw

Invocation:
-----------
python3 -m scripts.verify_chunk_texts --prefix mindnet
python3 -m scripts.verify_chunk_texts --note-id concept-alpha --min-coverage 0.95

Parameters:
-----------
--prefix TEXT        Collection prefix (default: ENV COLLECTION_PREFIX)
--note-id ID         Check only one specific note (optional)
--min-coverage F     Minimum coverage (default: 0.90)

Environment variables:
----------------------
QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY, COLLECTION_PREFIX

Changes:
--------
v2.1.0 (2025-12-15): Documentation updated
v1.0.0 (2025-09-09): Initial release
"""
from __future__ import annotations
import argparse, json, os, re, sys
from typing import Dict, List, Tuple, Optional
from qdrant_client.http import models as rest
from app.core.database.qdrant import QdrantConfig, get_client, collection_names
def _chunk_sort_key(p: Dict, pid: str) -> Tuple[int,int,str]:
# Primär: payload.chunk_index, sekundär: Nummer am Ende der ID (#cNN oder #NN), sonst 0
ci = p.get("chunk_index")
n = 0
m = re.search(r'#c?(\d+)$', pid or "")
if m:
try:
n = int(m.group(1))
except Exception:
n = 0
return (ci if isinstance(ci, int) else 1_000_000 + n, n, pid)
def _choose_text(payload: Dict) -> Optional[str]:
for k in ("text", "content", "raw"):
v = payload.get(k)
if isinstance(v, str) and v.strip():
return v
return None
def _coverage(fulltext: str, pieces: List[str]) -> float:
"""Berechnet die Abdeckungsquote der Stücke im Fulltext (sequenzielles Matching)."""
if not fulltext:
return 0.0 if pieces else 1.0
cursor = 0
covered = 0
ft = fulltext
for piece in pieces:
if not piece:
continue
# Tolerant gegen Whitespace-Unterschiede: normalisieren nur \r\n→\n
p = piece.replace("\r\n", "\n").replace("\r", "\n")
idx = ft.find(p, cursor)
if idx == -1:
# Versuche ein paar Heuristiken: trimmen
p2 = p.strip()
if p2 and len(p2) > 8:
idx = ft.find(p2, cursor)
if idx != -1:
covered += len(p)
cursor = idx + len(p)
# sonst: nicht abgedeckt
return covered / max(1, len(ft))
def _scroll_all(cli, collection: str, flt) -> List[Dict]:
    """Scroll through *collection* (with optional filter) and return every
    point as ``{"id": ..., "payload": ...}``; payload defaults to {}."""
    out: List[Dict] = []
    offset = None
    while True:
        pts, offset = cli.scroll(collection_name=collection, scroll_filter=flt,
                                 with_payload=True, with_vectors=False,
                                 limit=256, offset=offset)
        if not pts:
            break
        for p in pts:
            out.append({"id": p.id, "payload": p.payload or {}})
        if offset is None:
            break
    return out


def main() -> None:
    """Verify that every note's chunks carry text and that the note body can
    be reconstructed from them.

    Prints a JSON report to stdout and — as promised by the module header —
    exits with code 1 when any note fails validation (missing chunk texts or
    coverage below ``--min-coverage``), 0 otherwise.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: ENV COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--note-id", help="Nur eine bestimmte Note prüfen")
    ap.add_argument("--min-coverage", type=float, default=0.90, help="Mindestabdeckung durch Chunks (Default: 0.90)")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    notes_col, chunks_col, _ = collection_names(cfg.prefix)
    cli = get_client(cfg)

    # Fetch notes (optionally restricted to a single note_id).
    notes_filter = None
    if args.note_id:
        notes_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))])
    notes = _scroll_all(cli, notes_col, notes_filter)

    results: List[Dict] = []
    total_missing_text = 0
    total_notes_ok = 0
    for n in notes:
        pl = n["payload"]
        nid = pl.get("note_id") or pl.get("id") or n.get("id")
        fulltext = pl.get("fulltext") or ""

        # Fetch this note's chunks and order them for body reconstruction.
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
        chunks = _scroll_all(cli, chunks_col, f)
        chunks.sort(key=lambda c: _chunk_sort_key(c["payload"], c["id"]))

        texts: List[str] = []
        missing_text = 0
        for c in chunks:
            t = _choose_text(c["payload"])
            if t is None:
                missing_text += 1
                texts.append("")
            else:
                texts.append(t)

        cov = _coverage(fulltext, texts)
        # A note passes when every chunk carries text and coverage meets the
        # threshold; notes without a fulltext cannot be coverage-checked and
        # are only gated on missing texts.
        ok = (missing_text == 0) and (cov >= args.min_coverage or not fulltext)
        if ok:
            total_notes_ok += 1
        total_missing_text += missing_text
        results.append({
            "note_id": nid,
            "title": pl.get("title"),
            "chunks": len(chunks),
            "missing_chunk_texts": missing_text,
            "coverage": round(cov, 4),
            "has_fulltext": bool(fulltext),
            "ok": ok,
        })

    out = {
        "collections": {"notes": notes_col, "chunks": chunks_col},
        "notes_checked": len(notes),
        "notes_ok": total_notes_ok,
        "total_missing_chunk_texts": total_missing_text,
        "min_coverage": args.min_coverage,
        "details": results,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
    # BUGFIX: the module docstring documents exit code 1 on problems, but the
    # script previously always exited 0 regardless of findings.
    sys.exit(0 if total_notes_ok == len(notes) else 1)


if __name__ == "__main__":
    main()