#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/verify_chunk_texts.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Purpose:
--------
Verifies that chunk texts are stored correctly in Qdrant and that the note
body can be reconstructed from the chunks.

How it works:
-------------
1. Loads all notes from {prefix}_notes
2. For every note:
   - Loads the associated chunks
   - Checks that every chunk contains text (text/content/raw)
   - Orders the chunks by chunk_index / chunk_id
   - Computes coverage: matched chunk text length / fulltext length
3. Validates against notes.payload.fulltext

Interpreting the result:
------------------------
- Output: JSON document with the validation results
  * collections: names of the notes and chunks collections
  * notes_checked / notes_ok: number of notes inspected / passing
  * total_missing_chunk_texts: chunks without any usable text field
  * min_coverage: the threshold that was applied
  * details: one entry per note (chunk count, coverage, ok flag, ...)
- Exit code 0: all checks passed
- Exit code 1: problems found

Usage:
------
- Validation after imports
- Diagnosing text reconstruction problems
- Quality control

Notes:
------
- Coverage tolerance for overlaps (default: >= 0.90)
- Text fields checked: text (preferred), content, raw

Invocation:
-----------
python3 -m scripts.verify_chunk_texts --prefix mindnet
python3 -m scripts.verify_chunk_texts --note-id concept-alpha --min-coverage 0.95

Parameters:
-----------
--prefix TEXT       Collection prefix (default: ENV COLLECTION_PREFIX)
--note-id ID        Check only one specific note (optional)
--min-coverage F    Minimum coverage (default: 0.90)

Environment variables:
----------------------
QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY, COLLECTION_PREFIX

Changes:
--------
v2.1.0 (2025-12-15): Documentation updated
v1.0.0 (2025-09-09): Initial release
"""
from __future__ import annotations

import argparse
import json
import os
import re
import sys
from typing import Any, Dict, List, Optional, Tuple

# Trailing chunk number of an ID, written either as "#cNN" or "#NN".
_ID_SUFFIX_RE = re.compile(r'#c?(\d+)$')

# Payload keys that may carry the chunk text, in order of preference.
_TEXT_KEYS = ("text", "content", "raw")

# Number of points requested per scroll page.
_SCROLL_PAGE_SIZE = 256


def _chunk_sort_key(p: Dict, pid: str) -> Tuple[int, int, str]:
    """Build the sort key that puts the chunks of one note into reading order.

    Args:
        p: Chunk payload; an integer ``chunk_index`` is the primary criterion.
        pid: Chunk identifier. A trailing ``#cNN`` / ``#NN`` number is used as
            the secondary criterion. Non-string values (for example integer
            point IDs) are converted with ``str`` instead of raising.

    Returns:
        ``(primary, suffix_number, id_as_text)``. Chunks without an integer
        ``chunk_index`` get ``1_000_000 + suffix_number`` as primary value so
        they sort after indexed chunks while keeping their relative order.
    """
    pid_text = "" if pid is None else str(pid)
    match = _ID_SUFFIX_RE.search(pid_text)
    suffix = int(match.group(1)) if match else 0
    index = p.get("chunk_index")
    primary = index if isinstance(index, int) else 1_000_000 + suffix
    # The ID is always text here so that keys stay comparable with each other.
    return (primary, suffix, pid_text)


def _choose_text(payload: Dict) -> Optional[str]:
    """Return the first non-blank string among ``text``, ``content``, ``raw``.

    Returns ``None`` when none of these keys holds a string with
    non-whitespace content.
    """
    for key in _TEXT_KEYS:
        value = payload.get(key)
        if isinstance(value, str) and value.strip():
            return value
    return None


def _normalize_newlines(text: str) -> str:
    """Convert CRLF and lone CR line endings to LF."""
    return text.replace("\r\n", "\n").replace("\r", "\n")


def _coverage(fulltext: str, pieces: List[str]) -> float:
    """Compute how much of ``fulltext`` is covered by ``pieces`` in sequence.

    Each piece is searched at or after the end of the previous match. If the
    exact piece is not found, its whitespace-trimmed form is tried, provided
    it is longer than 8 characters. Line endings of both the fulltext and the
    pieces are normalised to LF before matching.

    Args:
        fulltext: Reference text of the note.
        pieces: Chunk texts in reading order; empty entries are skipped.

    Returns:
        Matched characters divided by the length of the normalised fulltext.
        For an empty fulltext: ``1.0`` if ``pieces`` is empty, else ``0.0``.
    """
    if not fulltext:
        return 0.0 if pieces else 1.0

    # Normalise the reference text the same way as the pieces; otherwise a
    # fulltext with CRLF line endings could never match the normalised pieces.
    ft = _normalize_newlines(fulltext)
    cursor = 0
    covered = 0
    for piece in pieces:
        if not piece:
            continue
        matched = _normalize_newlines(piece)
        # Searching from the cursor means a piece that begins before the end
        # of the previous match (an overlap) is not found and stays uncounted.
        idx = ft.find(matched, cursor)
        if idx == -1:
            trimmed = matched.strip()
            if len(trimmed) > 8:
                matched = trimmed
                idx = ft.find(trimmed, cursor)
        if idx != -1:
            # Count and advance by what was actually matched; using the
            # untrimmed length would over-count and skip following text.
            covered += len(matched)
            cursor = idx + len(matched)
    return covered / max(1, len(ft))


def _scroll_all(cli: Any, collection_name: str, scroll_filter: Any = None) -> List[Dict[str, Any]]:
    """Page through a collection and return ``{"id", "payload"}`` records.

    Args:
        cli: Client object exposing ``scroll(...)`` that returns
            ``(points, next_offset)``.
        collection_name: Collection to read.
        scroll_filter: Optional filter passed through to ``scroll``.

    Returns:
        One dict per point; a missing payload is replaced by ``{}``.
    """
    records: List[Dict[str, Any]] = []
    offset = None
    while True:
        points, offset = cli.scroll(
            collection_name=collection_name,
            scroll_filter=scroll_filter,
            with_payload=True,
            with_vectors=False,
            limit=_SCROLL_PAGE_SIZE,
            offset=offset,
        )
        if not points:
            break
        records.extend({"id": pt.id, "payload": pt.payload or {}} for pt in points)
        if offset is None:
            break
    return records


def _evaluate_note(
    note_id: Any,
    payload: Dict[str, Any],
    chunks: List[Dict[str, Any]],
    min_coverage: float,
) -> Dict[str, Any]:
    """Validate the chunks of a single note and build its report entry.

    Args:
        note_id: Identifier reported for the note.
        payload: Note payload; ``fulltext`` and ``title`` are read from it.
        chunks: ``{"id", "payload"}`` records of the note, in any order.
        min_coverage: Threshold the coverage must reach when a fulltext exists.

    Returns:
        Dict with ``note_id``, ``title``, ``chunks``, ``missing_chunk_texts``,
        ``coverage`` (rounded to 4 digits), ``has_fulltext`` and ``ok``.
    """
    fulltext = payload.get("fulltext") or ""

    def sort_key(chunk: Dict[str, Any]) -> Tuple[int, int, str]:
        # Prefer the payload's chunk_id: the "#cNN" suffix the sort key looks
        # for cannot occur in a Qdrant point ID (unsigned int or UUID).
        # NOTE(review): assumes chunk payloads carry "chunk_id" - confirm.
        chunk_payload = chunk["payload"]
        return _chunk_sort_key(chunk_payload, chunk_payload.get("chunk_id") or chunk["id"])

    ordered = sorted(chunks, key=sort_key)
    chosen = [_choose_text(chunk["payload"]) for chunk in ordered]
    missing_text = sum(text is None for text in chosen)
    cov = _coverage(fulltext, [text or "" for text in chosen])
    # Without a fulltext there is nothing to compare against, so only the
    # presence of chunk texts decides.
    ok = missing_text == 0 and (not fulltext or cov >= min_coverage)
    return {
        "note_id": note_id,
        "title": payload.get("title"),
        "chunks": len(chunks),
        "missing_chunk_texts": missing_text,
        "coverage": round(cov, 4),
        "has_fulltext": bool(fulltext),
        "ok": ok,
    }


def main() -> None:
    """Run the verification, print the JSON report and set the exit code.

    Raises:
        SystemExit: With code 1 when at least one checked note is not ok.
    """
    # Imported here rather than at module level so the pure helpers above can
    # be imported (and tested) without the Qdrant client or the app package.
    from qdrant_client.http import models as rest
    from app.core.database.qdrant import QdrantConfig, get_client, collection_names

    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: ENV COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--note-id", help="Nur eine bestimmte Note prüfen")
    ap.add_argument("--min-coverage", type=float, default=0.90, help="Mindestabdeckung durch Chunks (Default: 0.90)")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    notes_col, chunks_col, _ = collection_names(cfg.prefix)
    cli = get_client(cfg)

    def by_note_id(value: Any) -> Any:
        """Filter matching points whose payload ``note_id`` equals ``value``."""
        return rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=value))])

    # Fetch notes (optionally restricted to a single note_id).
    notes_filter = by_note_id(args.note_id) if args.note_id else None
    notes = _scroll_all(cli, notes_col, notes_filter)

    results: List[Dict[str, Any]] = []
    for note in notes:
        payload = note["payload"]
        nid = payload.get("note_id") or payload.get("id") or note.get("id")
        chunks = _scroll_all(cli, chunks_col, by_note_id(nid))
        results.append(_evaluate_note(nid, payload, chunks, args.min_coverage))

    total_notes_ok = sum(1 for entry in results if entry["ok"])
    total_missing_text = sum(entry["missing_chunk_texts"] for entry in results)

    out = {
        "collections": {"notes": notes_col, "chunks": chunks_col},
        "notes_checked": len(notes),
        "notes_ok": total_notes_ok,
        "total_missing_chunk_texts": total_missing_text,
        "min_coverage": args.min_coverage,
        "details": results,
    }
    print(json.dumps(out, ensure_ascii=False, indent=2))

    # Documented contract: exit code 1 as soon as any note failed its checks.
    if total_notes_ok != len(notes):
        sys.exit(1)


if __name__ == "__main__":
    main()