scripts/prune_qdrant_vs_vault.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-08 17:37:08 +02:00
parent 1b833f76ce
commit 77732445ea

View File

@ -0,0 +1,232 @@
# scripts/prune_qdrant_vs_vault.py
# -----------------------------------------------------------------------------
# Name: prune_qdrant_vs_vault.py
# Version: 1.0.0 (2025-09-08)
# Zweck: Entfernt verwaiste Qdrant-Einträge (notes/chunks/edges), wenn
# die zugehörigen Markdown-Dateien im Vault nicht mehr existieren.
#
# Was es macht:
# - Liest alle note_id aus dem Vault (Frontmatter: id / note_id).
# - Liest alle note_id aus Qdrant (mindnet_notes).
# - Bildet die Differenz (nur in Qdrant vorhandene, im Vault fehlende).
# - Löscht für jede verwaiste note_id:
# * Notes-Point(s)
# * Alle Chunks der Note
# * Alle Edges, die auf diese Note referenzieren
# (source_id == note_id ODER target_id == note_id ODER
# source_note_id == note_id ODER target_note_id == note_id)
#
# Hinweise:
# - Kein globaler Delete. Nur betroffene note_id.
# - Dry-Run standardmäßig; tatsächliches Löschen erst mit --apply.
# - Interaktive Bestätigung (abschaltbar mit --yes).
#
# Aufruf:
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply --yes
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.parser import parse_markdown_file  # uses the project's existing parsing/schema
from app.core.qdrant import QdrantConfig, collection_names, get_client
def read_vault_note_ids(vault_dir: Path) -> Set[str]:
    """Collect every note id declared in the vault's Markdown frontmatter.

    Recursively walks *vault_dir* for ``*.md`` files, parses each one and
    reads the frontmatter key ``id`` (falling back to ``note_id``).
    Unparsable files are skipped silently so one broken note cannot abort
    the whole prune run.
    """
    found: Set[str] = set()
    for md_path in vault_dir.rglob("*.md"):
        try:
            doc = parse_markdown_file(str(md_path))
            # parse_markdown_file may return an object or a dict-like result.
            if hasattr(doc, "frontmatter"):
                meta = doc.frontmatter
            else:
                meta = doc.get("frontmatter") or {}
            note_id = meta.get("id") or meta.get("note_id")
            if note_id:
                found.add(str(note_id))
        except Exception:
            # Deliberately best-effort: keep going past broken files.
            continue
    return found
def qdrant_note_ids(client: QdrantClient, notes_col: str) -> Set[str]:
    """Return every ``note_id`` payload value stored in the notes collection.

    Pages through the collection with ``scroll`` (256 points per page,
    payload only, no vectors) until Qdrant reports no further offset.
    """
    seen: Set[str] = set()
    page_offset = None
    while True:
        points, page_offset = client.scroll(
            collection_name=notes_col,
            scroll_filter=None,
            offset=page_offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            if note_id:
                seen.add(str(note_id))
        if page_offset is None:
            break
    return seen
def delete_by_filter(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Delete all points in *collection* matching *flt*; return the count.

    First collects the matching point IDs via ``scroll`` (giving an exact
    count and making the delete an explicit ID batch), then issues a single
    ``delete`` with ``wait=True`` so the operation is confirmed before
    returning.

    Returns 0 without issuing a delete when nothing matches.
    """
    # Fix: point IDs in Qdrant may be unsigned ints OR UUID strings, not just ints.
    to_delete: List[Union[int, str]] = []
    offset = None
    while True:
        pts, next_offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        to_delete.extend(pt.id for pt in pts)
        if next_offset is None:
            break
        offset = next_offset
    if not to_delete:
        return 0
    client.delete(
        collection_name=collection,
        points_selector=rest.PointIdsList(points=to_delete),
        wait=True,
    )
    return len(to_delete)
def prune_for_note(
    client: QdrantClient,
    cols: Dict[str, str],
    note_id: str,
    dry_run: bool = True,
) -> Dict[str, int]:
    """Count (dry run) or delete all Qdrant data belonging to *note_id*.

    Covers three collections: the note point(s) themselves, all chunks of
    the note, and all edges referencing the note under any of its four
    reference keys (``source_id``, ``target_id``, ``source_note_id``,
    ``target_note_id``).

    Returns a dict ``{"notes": n, "chunks": n, "edges": n}`` with the
    number of points counted (dry run) or actually deleted.
    """
    # Notes and chunks are both keyed by payload.note_id, so one filter
    # serves both collections (the original built two identical filters).
    by_note_id = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    # Edges may reference the note under any of these payload keys.
    edge_filters = [
        rest.Filter(
            must=[rest.FieldCondition(key=key, match=rest.MatchValue(value=note_id))]
        )
        for key in ("source_id", "target_id", "source_note_id", "target_note_id")
    ]
    if dry_run:
        # Count only what *would* be deleted; nothing is touched.
        return {
            "notes": _count_matches(client, cols["notes"], by_note_id),
            "chunks": _count_matches(client, cols["chunks"], by_note_id),
            "edges": sum(_count_matches(client, cols["edges"], f) for f in edge_filters),
        }
    # Actual deletion.
    return {
        "notes": delete_by_filter(client, cols["notes"], by_note_id),
        "chunks": delete_by_filter(client, cols["chunks"], by_note_id),
        "edges": sum(delete_by_filter(client, cols["edges"], f) for f in edge_filters),
    }


def _count_matches(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Return how many points in *collection* match *flt* (scroll-based count)."""
    total = 0
    offset = None
    while True:
        pts, next_offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        total += len(pts)
        if next_offset is None:
            break
        offset = next_offset
    return total
def main():
    """CLI entry point: report orphaned Qdrant note ids and optionally prune them."""
    parser = argparse.ArgumentParser(
        description="Prune Qdrant data for notes missing in the Vault."
    )
    parser.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Root).")
    parser.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet).")
    parser.add_argument("--apply", action="store_true", help="Ohne diesen Schalter wird nur ein Dry-Run gemacht.")
    parser.add_argument("--yes", action="store_true", help="Ohne Nachfrage ausführen.")
    opts = parser.parse_args()

    vault_root = Path(opts.vault).resolve()
    if not vault_root.exists():
        print(f"Vault-Verzeichnis nicht gefunden: {vault_root}", file=sys.stderr)
        sys.exit(1)

    client = get_client(QdrantConfig())
    cols = collection_names(opts.prefix)

    # Orphans = ids known to Qdrant but no longer present in the vault.
    vault_ids = read_vault_note_ids(vault_root)
    qdrant_ids = qdrant_note_ids(client, cols["notes"])
    orphans = sorted(qdrant_ids - vault_ids)

    # Show what would happen before touching anything.
    preview = {
        "mode": "APPLY" if opts.apply else "DRY-RUN",
        "prefix": opts.prefix,
        "vault_root": str(vault_root),
        "collections": cols,
        "counts": {
            "vault_note_ids": len(vault_ids),
            "qdrant_note_ids": len(qdrant_ids),
            "orphans": len(orphans),
        },
        "orphans_sample": orphans[:20],
    }
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    if not orphans:
        print("Keine verwaisten Einträge gefunden. Nichts zu tun.")
        return

    # Interactive confirmation unless --yes was given.
    if not opts.yes:
        answer = input("\nFortfahren mit dem oben angezeigten Modus? (yes/no): ").strip().lower()
        if answer not in ("y", "yes", "j", "ja"):
            print("Abgebrochen.")
            return

    totals = {"notes": 0, "chunks": 0, "edges": 0}
    for orphan_id in orphans:
        result = prune_for_note(client, cols, orphan_id, dry_run=not opts.apply)
        for key in totals:
            totals[key] += result[key]
    print(json.dumps({"deleted": totals}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()