diff --git a/scripts/prune_qdrant_vs_vault.py b/scripts/prune_qdrant_vs_vault.py new file mode 100644 index 0000000..59e0905 --- /dev/null +++ b/scripts/prune_qdrant_vs_vault.py @@ -0,0 +1,232 @@ +# scripts/prune_qdrant_vs_vault.py +# ----------------------------------------------------------------------------- +# Name: prune_qdrant_vs_vault.py +# Version: 1.0.0 (2025-09-08) +# Zweck: Entfernt verwaiste Qdrant-Einträge (notes/chunks/edges), wenn +# die zugehörigen Markdown-Dateien im Vault nicht mehr existieren. +# +# Was es macht: +# - Liest alle note_id aus dem Vault (Frontmatter: id / note_id). +# - Liest alle note_id aus Qdrant (mindnet_notes). +# - Bildet die Differenz (nur in Qdrant vorhandene, im Vault fehlende). +# - Löscht für jede verwaiste note_id: +# * Notes-Point(s) +# * Alle Chunks der Note +# * Alle Edges, die auf diese Note referenzieren +# (source_id == note_id ODER target_id == note_id ODER +# source_note_id == note_id ODER target_note_id == note_id) +# +# Hinweise: +# - Kein globaler Delete. Nur betroffene note_id. +# - Dry-Run standardmäßig; tatsächliches Löschen erst mit --apply. +# - Interaktive Bestätigung (abschaltbar mit --yes). +# +# Aufruf: +# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet +# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply +# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply --yes +# +# Voraussetzungen: +# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate +# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py +# +# Änderungen: +# - 1.0.0: Erster Release. 
# -----------------------------------------------------------------------------

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client, collection_names
from app.core.parser import parse_markdown_file  # reuses the project's existing parsing/schema


def read_vault_note_ids(vault_dir: Path) -> Set[str]:
    """Collect every note_id found in the vault's Markdown frontmatter.

    Scans *vault_dir* recursively for ``*.md`` files and reads the
    frontmatter keys ``id`` / ``note_id`` (first non-empty one wins).
    Files that fail to parse are skipped silently — the pruner must stay
    robust against stray or malformed files in the vault.
    """
    note_ids: Set[str] = set()
    for path in vault_dir.rglob("*.md"):
        try:
            parsed = parse_markdown_file(str(path))
            # parse_markdown_file may return an object or a dict, depending on
            # the project parser's return type — handle both shapes.
            fm = parsed.frontmatter if hasattr(parsed, "frontmatter") else (parsed.get("frontmatter") or {})
            nid = fm.get("id") or fm.get("note_id")
            if nid:
                note_ids.add(str(nid))
        except Exception:
            # Deliberate best effort: one broken file must not abort the prune.
            continue
    return note_ids


def qdrant_note_ids(client: QdrantClient, notes_col: str) -> Set[str]:
    """Return all distinct payload ``note_id`` values stored in *notes_col*.

    Pages through the collection with ``scroll`` (256 points per page,
    payload only, no vectors) until the server signals the end by
    returning a ``None`` next-page offset.
    """
    ids: Set[str] = set()
    offset = None
    while True:
        points, next_offset = client.scroll(
            collection_name=notes_col,
            offset=offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for pt in points:
            nid = (pt.payload or {}).get("note_id")
            if nid:
                ids.add(str(nid))
        if next_offset is None:
            break
        offset = next_offset
    return ids


def _match_any(keys: List[str], value: str) -> rest.Filter:
    """Build an OR-filter: payload[key] == *value* for at least one of *keys*."""
    return rest.Filter(
        should=[rest.FieldCondition(key=k, match=rest.MatchValue(value=value)) for k in keys]
    )


def _count_by_filter(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Exact number of points in *collection* matching *flt* (no deletion)."""
    return client.count(collection_name=collection, count_filter=flt, exact=True).count


def delete_by_filter(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Delete all points in *collection* matching *flt*; return how many.

    Collects the matching point IDs via scroll first (gives an exact,
    inspectable count), then removes them in a single batched delete.
    """
    # NOTE: Qdrant point IDs may be unsigned ints OR UUID strings, so the
    # list is intentionally untyped beyond "point id".
    to_delete: List = []
    offset = None
    while True:
        points, next_offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        to_delete.extend(pt.id for pt in points)
        if next_offset is None:
            break
        offset = next_offset
    if not to_delete:
        return 0
    client.delete(
        collection_name=collection,
        points_selector=rest.PointIdsList(points=to_delete),
        wait=True,
    )
    return len(to_delete)


def prune_for_note(
    client: QdrantClient,
    cols: Dict[str, str],
    note_id: str,
    dry_run: bool = True,
) -> Dict[str, int]:
    """Remove (or, in dry-run, count) everything referencing *note_id*.

    Touches three collections:
      * notes  — points whose payload ``note_id`` matches
      * chunks — points whose payload ``note_id`` matches
      * edges  — points referencing the note via ``source_id``,
                 ``target_id``, ``source_note_id`` or ``target_note_id``

    Returns per-collection counts of points deleted (or that would be
    deleted when *dry_run* is true).
    """
    # Notes and chunks share the same payload key.
    f_note = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    # One OR-filter instead of four separate passes: an edge carrying e.g.
    # both source_id and source_note_id is counted/deleted exactly once.
    f_edges = _match_any(
        ["source_id", "target_id", "source_note_id", "target_note_id"], note_id
    )

    if dry_run:
        # Count only what *would* be deleted; exact server-side count.
        return {
            "notes": _count_by_filter(client, cols["notes"], f_note),
            "chunks": _count_by_filter(client, cols["chunks"], f_note),
            "edges": _count_by_filter(client, cols["edges"], f_edges),
        }

    return {
        "notes": delete_by_filter(client, cols["notes"], f_note),
        "chunks": delete_by_filter(client, cols["chunks"], f_note),
        "edges": delete_by_filter(client, cols["edges"], f_edges),
    }


def main() -> None:
    """CLI entry point: diff vault vs. Qdrant, then prune orphaned note data.

    Dry-run by default; deletes only with ``--apply``. Asks for interactive
    confirmation unless ``--yes`` is given.
    """
    ap = argparse.ArgumentParser(description="Prune Qdrant data for notes missing in the Vault.")
    ap.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Root).")
    ap.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet).")
    ap.add_argument("--apply", action="store_true", help="Ohne diesen Schalter wird nur ein Dry-Run gemacht.")
    ap.add_argument("--yes", action="store_true", help="Ohne Nachfrage ausführen.")
    args = ap.parse_args()

    vault_root = Path(args.vault).resolve()
    if not vault_root.exists():
        print(f"Vault-Verzeichnis nicht gefunden: {vault_root}", file=sys.stderr)
        sys.exit(1)

    cfg = QdrantConfig()
    client = get_client(cfg)
    cols = collection_names(args.prefix)

    vault_ids = read_vault_note_ids(vault_root)
    qdrant_ids = qdrant_note_ids(client, cols["notes"])
    # Orphans: note_ids present in Qdrant but gone from the vault.
    orphans = sorted(qdrant_ids - vault_ids)

    preview = {
        "mode": "APPLY" if args.apply else "DRY-RUN",
        "prefix": args.prefix,
        "vault_root": str(vault_root),
        "collections": cols,
        "counts": {
            "vault_note_ids": len(vault_ids),
            "qdrant_note_ids": len(qdrant_ids),
            "orphans": len(orphans),
        },
        "orphans_sample": orphans[:20],
    }
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    if not orphans:
        print("Keine verwaisten Einträge gefunden. Nichts zu tun.")
        return

    if not args.yes:
        resp = input("\nFortfahren mit dem oben angezeigten Modus? (yes/no): ").strip().lower()
        if resp not in ("y", "yes", "j", "ja"):
            print("Abgebrochen.")
            return

    total_stats = {"notes": 0, "chunks": 0, "edges": 0}
    for nid in orphans:
        stats = prune_for_note(client, cols, nid, dry_run=not args.apply)
        for key in total_stats:
            total_stats[key] += stats[key]

    # In dry-run mode nothing was actually deleted; label the result honestly
    # instead of always printing "deleted".
    label = "deleted" if args.apply else "would_delete"
    print(json.dumps({label: total_stats}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()