# NOTE(review): the following lines are Gitea web-UI scrape residue (CI status,
# line-count / file-size chrome), not part of the source file — kept as a comment:
#   All checks were successful
#   Deploy mindnet to llm-node / deploy (push) Successful in 4s
#   203 lines | 6.5 KiB | Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
FILE: scripts/verify_chunk_texts.py
|
|
VERSION: 2.1.0 (2025-12-15)
|
|
STATUS: Active
|
|
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
|
|
|
|
Zweck:
|
|
-------
|
|
Verifiziert, dass Chunk-Texte in Qdrant korrekt gespeichert sind und
|
|
der Note-Body aus Chunks rekonstruiert werden kann.
|
|
|
|
Funktionsweise:
|
|
---------------
|
|
1. Lädt alle Notes aus {prefix}_notes
|
|
2. Für jede Note:
|
|
- Lädt zugehörige Chunks
|
|
- Prüft, dass alle Chunks Text enthalten (text/content/raw)
|
|
- Rekonstruiert Body aus Chunks (sortiert nach chunk_index/chunk_id)
|
|
- Berechnet Coverage: gefundene Textsegmente / Fulltext-Länge
|
|
3. Validiert gegen notes.payload.fulltext
|
|
|
|
Ergebnis-Interpretation:
|
|
------------------------
|
|
- Ausgabe: JSON mit Validierungs-Ergebnissen
|
|
* summary: Gesamtstatistiken
|
|
* per_note: Details pro Note
|
|
* coverage: Durchschnittliche Coverage
|
|
- Exit-Code 0: Alle Prüfungen bestanden
|
|
- Exit-Code 1: Probleme gefunden
|
|
|
|
Verwendung:
|
|
-----------
|
|
- Validierung nach Importen
|
|
- Diagnose von Text-Rekonstruktionsproblemen
|
|
- Qualitätskontrolle
|
|
|
|
Hinweise:
|
|
---------
|
|
- Coverage-Toleranz für Overlaps (Standard: >= 0.90)
|
|
- Prüft Text-Felder: text (bevorzugt), content, raw
|
|
|
|
Aufruf:
|
|
-------
|
|
python3 -m scripts.verify_chunk_texts --prefix mindnet
|
|
python3 -m scripts.verify_chunk_texts --note-id concept-alpha --min-coverage 0.95
|
|
|
|
Parameter:
|
|
----------
|
|
--prefix TEXT Collection-Präfix (Default: ENV COLLECTION_PREFIX)
|
|
--note-id ID Nur eine bestimmte Note prüfen (optional)
|
|
--min-coverage F Minimale Coverage (Default: 0.90)
|
|
|
|
Umgebungsvariablen:
|
|
-------------------
|
|
QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY, COLLECTION_PREFIX
|
|
|
|
Änderungen:
|
|
-----------
|
|
v2.1.0 (2025-12-15): Dokumentation aktualisiert
|
|
v1.0.0 (2025-09-09): Initial Release
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse, json, os, re, sys
|
|
from typing import Dict, List, Tuple, Optional
|
|
|
|
from qdrant_client.http import models as rest
|
|
|
|
from app.core.qdrant import QdrantConfig, get_client, collection_names
|
|
|
|
def _chunk_sort_key(p: Dict, pid: str) -> Tuple[int,int,str]:
|
|
# Primär: payload.chunk_index, sekundär: Nummer am Ende der ID (#cNN oder #NN), sonst 0
|
|
ci = p.get("chunk_index")
|
|
n = 0
|
|
m = re.search(r'#c?(\d+)$', pid or "")
|
|
if m:
|
|
try:
|
|
n = int(m.group(1))
|
|
except Exception:
|
|
n = 0
|
|
return (ci if isinstance(ci, int) else 1_000_000 + n, n, pid)
|
|
|
|
def _choose_text(payload: Dict) -> Optional[str]:
|
|
for k in ("text", "content", "raw"):
|
|
v = payload.get(k)
|
|
if isinstance(v, str) and v.strip():
|
|
return v
|
|
return None
|
|
|
|
def _coverage(fulltext: str, pieces: List[str]) -> float:
|
|
"""Berechnet die Abdeckungsquote der Stücke im Fulltext (sequenzielles Matching)."""
|
|
if not fulltext:
|
|
return 0.0 if pieces else 1.0
|
|
cursor = 0
|
|
covered = 0
|
|
ft = fulltext
|
|
for piece in pieces:
|
|
if not piece:
|
|
continue
|
|
# Tolerant gegen Whitespace-Unterschiede: normalisieren nur \r\n→\n
|
|
p = piece.replace("\r\n", "\n").replace("\r", "\n")
|
|
idx = ft.find(p, cursor)
|
|
if idx == -1:
|
|
# Versuche ein paar Heuristiken: trimmen
|
|
p2 = p.strip()
|
|
if p2 and len(p2) > 8:
|
|
idx = ft.find(p2, cursor)
|
|
if idx != -1:
|
|
covered += len(p)
|
|
cursor = idx + len(p)
|
|
# sonst: nicht abgedeckt
|
|
return covered / max(1, len(ft))
|
|
|
|
def _scroll_all(cli, collection: str, flt) -> List[Dict]:
    """Scroll every point of *collection* matching *flt*; payloads only, no vectors."""
    out: List[Dict] = []
    off = None
    while True:
        pts, off = cli.scroll(collection_name=collection, scroll_filter=flt,
                              with_payload=True, with_vectors=False, limit=256, offset=off)
        if not pts:
            break
        for p in pts:
            out.append({"id": p.id, "payload": p.payload or {}})
        if off is None:
            break
    return out


def main() -> None:
    """Verify chunk texts against note fulltexts and print a JSON report.

    Exits with status 0 when every checked note passes, 1 otherwise —
    honoring the exit-code contract documented in the module header
    (the previous implementation always exited 0).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: ENV COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--note-id", help="Nur eine bestimmte Note prüfen")
    ap.add_argument("--min-coverage", type=float, default=0.90, help="Mindestabdeckung durch Chunks (Default: 0.90)")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    notes_col, chunks_col, _ = collection_names(cfg.prefix)
    cli = get_client(cfg)

    # Fetch notes (optionally restricted to a single note_id).
    notes_filter = None
    if args.note_id:
        notes_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))])
    notes = _scroll_all(cli, notes_col, notes_filter)

    results: List[Dict] = []
    total_missing_text = 0
    total_notes_ok = 0
    for n in notes:
        pl = n["payload"]
        nid = pl.get("note_id") or pl.get("id") or n.get("id")
        fulltext = pl.get("fulltext") or ""

        # Fetch this note's chunks and order them deterministically.
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
        chunks = _scroll_all(cli, chunks_col, f)
        chunks.sort(key=lambda c: _chunk_sort_key(c["payload"], c["id"]))

        # Collect chunk texts; an empty placeholder keeps positions aligned.
        texts: List[str] = []
        missing_text = 0
        for c in chunks:
            t = _choose_text(c["payload"])
            if t is None:
                missing_text += 1
                texts.append("")
            else:
                texts.append(t)

        cov = _coverage(fulltext, texts)
        # A note passes when every chunk has text and (if a fulltext exists)
        # the reconstructed coverage reaches the threshold.
        ok = (missing_text == 0) and (cov >= args.min_coverage or not fulltext)
        if ok:
            total_notes_ok += 1
        total_missing_text += missing_text
        results.append({
            "note_id": nid,
            "title": pl.get("title"),
            "chunks": len(chunks),
            "missing_chunk_texts": missing_text,
            "coverage": round(cov, 4),
            "has_fulltext": bool(fulltext),
            "ok": ok
        })

    out = {
        "collections": {"notes": notes_col, "chunks": chunks_col},
        "notes_checked": len(notes),
        "notes_ok": total_notes_ok,
        "total_missing_chunk_texts": total_missing_text,
        "min_coverage": args.min_coverage,
        "details": results
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))
    # FIX: exit 1 when any note failed its checks (documented contract).
    sys.exit(0 if total_notes_ok == len(notes) else 1)
|
|
|
|
# Script entry point: run the verification when executed directly.
if __name__ == "__main__":
    main()
|