mindnet/scripts/prune_qdrant_vs_vault.py
Lars 77732445ea
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
scripts/prune_qdrant_vs_vault.py hinzugefügt
2025-09-08 17:37:08 +02:00

233 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# scripts/prune_qdrant_vs_vault.py
# -----------------------------------------------------------------------------
# Name: prune_qdrant_vs_vault.py
# Version: 1.0.0 (2025-09-08)
# Zweck: Entfernt verwaiste Qdrant-Einträge (notes/chunks/edges), wenn
# die zugehörigen Markdown-Dateien im Vault nicht mehr existieren.
#
# Was es macht:
# - Liest alle note_id aus dem Vault (Frontmatter: id / note_id).
# - Liest alle note_id aus Qdrant (mindnet_notes).
# - Bildet die Differenz (nur in Qdrant vorhandene, im Vault fehlende).
# - Löscht für jede verwaiste note_id:
# * Notes-Point(s)
# * Alle Chunks der Note
# * Alle Edges, die auf diese Note referenzieren
# (source_id == note_id ODER target_id == note_id ODER
# source_note_id == note_id ODER target_note_id == note_id)
#
# Hinweise:
# - Kein globaler Delete. Nur betroffene note_id.
# - Dry-Run standardmäßig; tatsächliches Löschen erst mit --apply.
# - Interaktive Bestätigung (abschaltbar mit --yes).
#
# Aufruf:
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply --yes
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client, collection_names
from app.core.parser import parse_markdown_file # nutzt euer bestehendes Parsing/Schema
def read_vault_note_ids(vault_dir: Path) -> Set[str]:
    """Collect every note id declared in the vault's Markdown frontmatter.

    Recursively walks *vault_dir* for ``*.md`` files, parses each file and
    reads the ``id`` (or, as a fallback, ``note_id``) frontmatter key.
    Files that cannot be parsed are skipped silently, so a single broken
    note cannot abort the whole prune run.
    """
    found: Set[str] = set()
    for md_path in vault_dir.rglob("*.md"):
        try:
            parsed = parse_markdown_file(str(md_path))
            # The parser may hand back an object carrying a ``frontmatter``
            # attribute or a plain dict with a ``frontmatter`` key.
            if hasattr(parsed, "frontmatter"):
                fm = parsed.frontmatter
            else:
                fm = parsed.get("frontmatter") or {}
            candidate = fm.get("id") or fm.get("note_id")
            if candidate:
                found.add(str(candidate))
        except Exception:
            # Best-effort by design: skip unreadable files and carry on.
            continue
    return found
def qdrant_note_ids(client: QdrantClient, notes_col: str) -> Set[str]:
    """Return every distinct ``note_id`` payload value stored in *notes_col*.

    Pages through the whole collection with the scroll API (256 points per
    page, payload only, no vectors) until Qdrant reports no next offset.
    """
    seen: Set[str] = set()
    page_offset = None
    while True:
        points, page_offset = client.scroll(
            collection_name=notes_col,
            scroll_filter=None,
            offset=page_offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            payload = point.payload or {}
            note_id = payload.get("note_id")
            if note_id:
                seen.add(str(note_id))
        # A missing next offset signals the final page.
        if page_offset is None:
            break
    return seen
def delete_by_filter(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Delete all points in *collection* matching *flt* and return the count.

    First collects the matching point ids via the scroll API (this gives a
    reliable count up front), then removes them in a single batched delete
    with ``wait=True`` so the operation is confirmed before reporting.

    Args:
        client: Connected Qdrant client.
        collection: Name of the collection to prune.
        flt: Payload filter selecting the points to remove.

    Returns:
        Number of points deleted (0 when nothing matched).
    """
    # Qdrant point ids may be unsigned ints *or* UUID strings, so the
    # element type is deliberately left open (the previous ``List[int]``
    # annotation was incorrect for string/UUID ids).
    doomed: List = []
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        doomed.extend(point.id for point in points)
        if offset is None:
            break
    if not doomed:
        return 0
    client.delete(
        collection_name=collection,
        points_selector=rest.PointIdsList(points=doomed),
        wait=True,
    )
    return len(doomed)
def prune_for_note(
    client: QdrantClient,
    cols: Dict[str, str],
    note_id: str,
    dry_run: bool = True,
) -> Dict[str, int]:
    """Remove (or, in dry-run mode, count) all Qdrant data for *note_id*.

    Covers three collections:
      * notes  -- points whose payload ``note_id`` matches
      * chunks -- points whose payload ``note_id`` matches
      * edges  -- points referencing the note via ``source_id``,
                  ``target_id``, ``source_note_id`` or ``target_note_id``

    Args:
        client: Connected Qdrant client.
        cols: Mapping with the ``notes``/``chunks``/``edges`` collection names.
        note_id: The orphaned note id to prune.
        dry_run: When True (default), nothing is deleted; only counts are
            reported.

    Returns:
        Per-collection counts of points deleted (or that would be deleted).
    """
    stats = {"notes": 0, "chunks": 0, "edges": 0}

    # Notes and chunks use the exact same payload filter; the original code
    # constructed it twice -- build it once and reuse it.
    f_note_payload = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    # Edges may reference the note through four different payload keys;
    # one filter per key, results are summed.
    edge_filters = [
        rest.Filter(must=[rest.FieldCondition(key=key, match=rest.MatchValue(value=note_id))])
        for key in ("source_id", "target_id", "source_note_id", "target_note_id")
    ]

    if dry_run:
        def count(flt: rest.Filter, col: str) -> int:
            # Count what *would* be deleted by scrolling the same filter.
            total = 0
            offset = None
            while True:
                pts, offset = client.scroll(
                    collection_name=col,
                    scroll_filter=flt,
                    offset=offset,
                    limit=256,
                    with_payload=False,
                    with_vectors=False,
                )
                total += len(pts)
                if offset is None:
                    break
            return total

        stats["notes"] = count(f_note_payload, cols["notes"])
        stats["chunks"] = count(f_note_payload, cols["chunks"])
        stats["edges"] = sum(count(f, cols["edges"]) for f in edge_filters)
        return stats

    # Actual deletion.
    stats["notes"] = delete_by_filter(client, cols["notes"], f_note_payload)
    stats["chunks"] = delete_by_filter(client, cols["chunks"], f_note_payload)
    stats["edges"] = sum(delete_by_filter(client, cols["edges"], f) for f in edge_filters)
    return stats
def main():
    """CLI entry point: compare vault vs. Qdrant and prune orphaned note data.

    Dry-run by default; ``--apply`` performs the deletion, ``--yes`` skips
    the interactive confirmation. Prints a JSON preview and a JSON summary.
    """
    ap = argparse.ArgumentParser(description="Prune Qdrant data for notes missing in the Vault.")
    ap.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Root).")
    ap.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet).")
    ap.add_argument("--apply", action="store_true", help="Ohne diesen Schalter wird nur ein Dry-Run gemacht.")
    ap.add_argument("--yes", action="store_true", help="Ohne Nachfrage ausführen.")
    args = ap.parse_args()

    vault_root = Path(args.vault).resolve()
    if not vault_root.exists():
        print(f"Vault-Verzeichnis nicht gefunden: {vault_root}", file=sys.stderr)
        sys.exit(1)

    cfg = QdrantConfig()
    client = get_client(cfg)
    cols = collection_names(args.prefix)

    vault_ids = read_vault_note_ids(vault_root)
    qdrant_ids = qdrant_note_ids(client, cols["notes"])
    # Orphans: ids that exist in Qdrant but no longer in the vault.
    orphans = sorted(qdrant_ids - vault_ids)

    preview = {
        "mode": "APPLY" if args.apply else "DRY-RUN",
        "prefix": args.prefix,
        "vault_root": str(vault_root),
        "collections": cols,
        "counts": {
            "vault_note_ids": len(vault_ids),
            "qdrant_note_ids": len(qdrant_ids),
            "orphans": len(orphans),
        },
        "orphans_sample": orphans[:20],
    }
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    if not orphans:
        print("Keine verwaisten Einträge gefunden. Nichts zu tun.")
        return
    if not args.yes:
        resp = input("\nFortfahren mit dem oben angezeigten Modus? (yes/no): ").strip().lower()
        if resp not in ("y", "yes", "j", "ja"):
            print("Abgebrochen.")
            return

    total_stats = {"notes": 0, "chunks": 0, "edges": 0}
    for nid in orphans:
        s = prune_for_note(client, cols, nid, dry_run=(not args.apply))
        for key in total_stats:
            total_stats[key] += s[key]

    # BUG FIX: the summary previously printed {"deleted": ...} even in
    # dry-run mode, which misreports what actually happened. Label the
    # counts honestly depending on the mode.
    summary_key = "deleted" if args.apply else "would_delete"
    print(json.dumps({summary_key: total_stats}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()