scripts/prune_qdrant_vs_vault.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-08 17:37:08 +02:00
parent 1b833f76ce
commit 77732445ea

View File

@ -0,0 +1,232 @@
# scripts/prune_qdrant_vs_vault.py
# -----------------------------------------------------------------------------
# Name: prune_qdrant_vs_vault.py
# Version: 1.0.0 (2025-09-08)
# Zweck: Entfernt verwaiste Qdrant-Einträge (notes/chunks/edges), wenn
# die zugehörigen Markdown-Dateien im Vault nicht mehr existieren.
#
# Was es macht:
# - Liest alle note_id aus dem Vault (Frontmatter: id / note_id).
# - Liest alle note_id aus Qdrant (mindnet_notes).
# - Bildet die Differenz (nur in Qdrant vorhandene, im Vault fehlende).
# - Löscht für jede verwaiste note_id:
# * Notes-Point(s)
# * Alle Chunks der Note
# * Alle Edges, die auf diese Note referenzieren
# (source_id == note_id ODER target_id == note_id ODER
# source_note_id == note_id ODER target_note_id == note_id)
#
# Hinweise:
# - Kein globaler Delete. Nur betroffene note_id.
# - Dry-Run standardmäßig; tatsächliches Löschen erst mit --apply.
# - Interaktive Bestätigung (abschaltbar mit --yes).
#
# Aufruf:
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply
# python3 -m scripts.prune_qdrant_vs_vault --vault ./vault --prefix mindnet --apply --yes
#
# Voraussetzungen:
# - Ausführung im aktivierten venv empfohlen: source .venv/bin/activate
# - Qdrant läuft lokal (oder URL/API-Key in ENV), siehe app/core/qdrant.py
#
# Änderungen:
# - 1.0.0: Erster Release.
# -----------------------------------------------------------------------------
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.parser import parse_markdown_file  # uses the project's existing parsing/schema
from app.core.qdrant import QdrantConfig, collection_names, get_client
def read_vault_note_ids(vault_dir: Path) -> Set[str]:
    """Collect every note id declared in the vault's Markdown frontmatter.

    Recursively walks *vault_dir* for ``*.md`` files, parses each one and
    reads the frontmatter key ``id`` (falling back to ``note_id``).
    Unparsable files are skipped silently so one broken note cannot abort
    the whole prune run.
    """
    found: Set[str] = set()
    for md_path in vault_dir.rglob("*.md"):
        try:
            doc = parse_markdown_file(str(md_path))
            # parse_markdown_file may return an object or a dict-like result.
            if hasattr(doc, "frontmatter"):
                meta = doc.frontmatter
            else:
                meta = doc.get("frontmatter") or {}
            note_id = meta.get("id") or meta.get("note_id")
            if note_id:
                found.add(str(note_id))
        except Exception:
            # Deliberately best-effort: keep going past broken files.
            continue
    return found
def qdrant_note_ids(client: QdrantClient, notes_col: str) -> Set[str]:
    """Return every ``note_id`` payload value stored in the notes collection.

    Pages through the collection with ``scroll`` (256 points per page,
    payload only, no vectors) until Qdrant reports no further offset.
    """
    seen: Set[str] = set()
    page_offset = None
    while True:
        points, page_offset = client.scroll(
            collection_name=notes_col,
            scroll_filter=None,
            offset=page_offset,
            limit=256,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            note_id = (point.payload or {}).get("note_id")
            if note_id:
                seen.add(str(note_id))
        if page_offset is None:
            break
    return seen
def delete_by_filter(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Delete all points in *collection* matching *flt*; return the count.

    First collects the matching point IDs via ``scroll`` (giving an exact
    count and making the delete an explicit ID batch), then issues a single
    ``delete`` with ``wait=True`` so the operation is confirmed before
    returning.

    Returns 0 without issuing a delete when nothing matches.
    """
    # Fix: point IDs in Qdrant may be unsigned ints OR UUID strings, not just ints.
    to_delete: List[Union[int, str]] = []
    offset = None
    while True:
        pts, next_offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        to_delete.extend(pt.id for pt in pts)
        if next_offset is None:
            break
        offset = next_offset
    if not to_delete:
        return 0
    client.delete(
        collection_name=collection,
        points_selector=rest.PointIdsList(points=to_delete),
        wait=True,
    )
    return len(to_delete)
def prune_for_note(
    client: QdrantClient,
    cols: Dict[str, str],
    note_id: str,
    dry_run: bool = True,
) -> Dict[str, int]:
    """Count (dry run) or delete all Qdrant data belonging to *note_id*.

    Covers three collections: the note point(s) themselves, all chunks of
    the note, and all edges referencing the note under any of its four
    reference keys (``source_id``, ``target_id``, ``source_note_id``,
    ``target_note_id``).

    Returns a dict ``{"notes": n, "chunks": n, "edges": n}`` with the
    number of points counted (dry run) or actually deleted.
    """
    # Notes and chunks are both keyed by payload.note_id, so one filter
    # serves both collections (the original built two identical filters).
    by_note_id = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    # Edges may reference the note under any of these payload keys.
    edge_filters = [
        rest.Filter(
            must=[rest.FieldCondition(key=key, match=rest.MatchValue(value=note_id))]
        )
        for key in ("source_id", "target_id", "source_note_id", "target_note_id")
    ]
    if dry_run:
        # Count only what *would* be deleted; nothing is touched.
        return {
            "notes": _count_matches(client, cols["notes"], by_note_id),
            "chunks": _count_matches(client, cols["chunks"], by_note_id),
            "edges": sum(_count_matches(client, cols["edges"], f) for f in edge_filters),
        }
    # Actual deletion.
    return {
        "notes": delete_by_filter(client, cols["notes"], by_note_id),
        "chunks": delete_by_filter(client, cols["chunks"], by_note_id),
        "edges": sum(delete_by_filter(client, cols["edges"], f) for f in edge_filters),
    }


def _count_matches(client: QdrantClient, collection: str, flt: rest.Filter) -> int:
    """Return how many points in *collection* match *flt* (scroll-based count)."""
    total = 0
    offset = None
    while True:
        pts, next_offset = client.scroll(
            collection_name=collection,
            scroll_filter=flt,
            offset=offset,
            limit=256,
            with_payload=False,
            with_vectors=False,
        )
        total += len(pts)
        if next_offset is None:
            break
        offset = next_offset
    return total
def main():
    """CLI entry point: report orphaned Qdrant note ids and optionally prune them."""
    parser = argparse.ArgumentParser(
        description="Prune Qdrant data for notes missing in the Vault."
    )
    parser.add_argument("--vault", required=True, help="Pfad zum Vault-Verzeichnis (Root).")
    parser.add_argument("--prefix", default="mindnet", help="Collections-Präfix (Default: mindnet).")
    parser.add_argument("--apply", action="store_true", help="Ohne diesen Schalter wird nur ein Dry-Run gemacht.")
    parser.add_argument("--yes", action="store_true", help="Ohne Nachfrage ausführen.")
    opts = parser.parse_args()

    vault_root = Path(opts.vault).resolve()
    if not vault_root.exists():
        print(f"Vault-Verzeichnis nicht gefunden: {vault_root}", file=sys.stderr)
        sys.exit(1)

    client = get_client(QdrantConfig())
    cols = collection_names(opts.prefix)

    # Orphans = ids known to Qdrant but no longer present in the vault.
    vault_ids = read_vault_note_ids(vault_root)
    qdrant_ids = qdrant_note_ids(client, cols["notes"])
    orphans = sorted(qdrant_ids - vault_ids)

    # Show what would happen before touching anything.
    preview = {
        "mode": "APPLY" if opts.apply else "DRY-RUN",
        "prefix": opts.prefix,
        "vault_root": str(vault_root),
        "collections": cols,
        "counts": {
            "vault_note_ids": len(vault_ids),
            "qdrant_note_ids": len(qdrant_ids),
            "orphans": len(orphans),
        },
        "orphans_sample": orphans[:20],
    }
    print(json.dumps(preview, ensure_ascii=False, indent=2))

    if not orphans:
        print("Keine verwaisten Einträge gefunden. Nichts zu tun.")
        return

    # Interactive confirmation unless --yes was given.
    if not opts.yes:
        answer = input("\nFortfahren mit dem oben angezeigten Modus? (yes/no): ").strip().lower()
        if answer not in ("y", "yes", "j", "ja"):
            print("Abgebrochen.")
            return

    totals = {"notes": 0, "chunks": 0, "edges": 0}
    for orphan_id in orphans:
        result = prune_for_note(client, cols, orphan_id, dry_run=not opts.apply)
        for key in totals:
            totals[key] += result[key]
    print(json.dumps({"deleted": totals}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()