#!/usr/bin/env python3
# scripts/resolve_unresolved_references.py
"""
resolve_unresolved_references.py — resolve unresolved wikilinks in Qdrant after the fact

Version: 1.0.0 (2025-09-05)

Purpose
-------
- Finds edges in {prefix}_edges with payload.status == "unresolved" and tries
  to resolve the target node against the notes already present in
  {prefix}_notes.
- Updates the edges (sets target_id, removes status, sets resolution) and
  creates - ONLY for note-level 'references' - the symmetric 'backlink' edge.

Why?
----
- During the first import, links may point to notes that do not exist (yet).
- Once the target note exists later on, this script can repair those edges.

Usage
-----
    # Dry run (default):
    python3 -m scripts.resolve_unresolved_references --prefix mindnet

    # Apply:
    python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply

    # Optional: only touch X edges
    python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500

Parameters
----------
--prefix : Collection prefix (default: from env COLLECTION_PREFIX or "mindnet")
--apply  : Actually write the changes (without --apply = dry run)
--limit  : Max. number of unresolved edges handled in this run
           (default: no limit; must be >= 0)
--batch  : Upsert batch size (default: 512; must be >= 1)

Prerequisites / notes
---------------------
- Please run inside the activated venv (your environment: `.venv`).
- Qdrant URL/key/prefix/vector dim are read from ENV as usual
  (see app/core/qdrant.py).
- Uses the existing utilities:
    - app/core/qdrant.py         (client/collections)
    - app/core/qdrant_points.py  (points_for_edges/upsert_batch)
    - app/core/derive_edges.py   (build_note_index/resolve_target)

Change history
--------------
1.0.0  Initial release.
"""
from __future__ import annotations

import argparse
import json
from itertools import islice
from typing import Any, Dict, List, Tuple, Iterable

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
from app.core.qdrant_points import points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, resolve_target

# Page size used for every scroll request against Qdrant.
_SCROLL_PAGE_SIZE = 1024

# Metadata fields a backlink inherits from the edge it mirrors (never 'status').
_BACKLINK_META_KEYS = ("raw", "alias", "heading", "resolution")


def _scroll(client: QdrantClient, **kwargs):
    """
    Call ``client.scroll(**kwargs)`` and normalise the result.

    Different client versions return different shapes:
        newer: (points, next_offset)
        older: (points, next_page_offset, _)

    Returns:
        A ``(points, next_offset)`` pair. A non-tuple result is treated
        conservatively as the point list with no further page.
    """
    res = client.scroll(**kwargs)
    if not isinstance(res, tuple):
        # Very old clients -> handle conservatively: no pagination info.
        return res, None
    # Covers both the 2-tuple and the older 3-tuple signature.
    return res[0], res[1]


def _iter_points(client: QdrantClient, collection_name: str, scroll_filter=None):
    """
    Yield every point of ``collection_name``, following scroll pagination.

    Points are fetched with payload and without vectors. ``scroll_filter`` is
    only forwarded when given, so an unfiltered scan sends exactly the same
    request as a plain scroll. Iteration stops when a page reports no
    (truthy) next offset.
    """
    request: Dict[str, Any] = {
        "collection_name": collection_name,
        "with_payload": True,
        "with_vectors": False,
        "limit": _SCROLL_PAGE_SIZE,
    }
    if scroll_filter is not None:
        request["scroll_filter"] = scroll_filter

    next_off = None
    while True:
        pts, next_off = _scroll(client, offset=next_off, **request)
        yield from pts or []
        if not next_off:
            break


def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]:
    """
    Return the payloads of all points in ``notes_col`` that carry a ``note_id``.

    Points without payload or without a truthy ``note_id`` are skipped.
    """
    notes: List[Dict[str, Any]] = []
    for point in _iter_points(client, notes_col):
        payload = getattr(point, "payload", {}) or {}
        # Expected fields: note_id, title, path etc. (according to the schema).
        if payload.get("note_id"):
            notes.append(payload)
    return notes


def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]:
    """
    Yield all edge records with payload.status == 'unresolved' whose
    'target_label' is a string.
    """
    unresolved = rest.Filter(
        must=[
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
        ]
    )
    for point in _iter_points(client, edges_col, scroll_filter=unresolved):
        payload = getattr(point, "payload", {}) or {}
        if isinstance(payload.get("target_label"), str):
            yield point


def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a 'backlink' edge payload source <- target (note level).

    The backlink runs from ``target_note_id`` back to ``source_note_id``.
    Metadata fields present in ``extra`` ('raw', 'alias', 'heading',
    'resolution') are copied over as-is; 'status' is never copied.
    """
    edge: Dict[str, Any] = {
        "kind": "backlink",
        "source_id": target_note_id,
        "target_id": source_note_id,
    }
    edge.update({key: extra[key] for key in _BACKLINK_META_KEYS if key in extra})
    return edge


def _resolve_edges(records: Iterable[Any], idx: Any, limit: int = 0) -> Tuple[List[dict], List[dict], int]:
    """
    Try to resolve each unresolved edge record against the note index.

    Args:
        records: Edge records exposing a ``payload`` mapping.
        idx: Note index as produced by ``build_note_index``.
        limit: Max. number of records to look at; 0 means all.

    Returns:
        ``(to_fix, backlinks, scanned)``: the updated edge payloads, the
        backlink payloads to create, and the number of records looked at.
    """
    to_fix: List[dict] = []
    backlinks: List[dict] = []
    scanned = 0

    if limit:
        records = islice(records, limit)

    for rec in records:
        scanned += 1
        pl = dict(rec.payload or {})
        kind = pl.get("kind") or "references"
        src = pl.get("source_id")
        tgt_label = pl.get("target_label") or pl.get("target_id")  # fallback
        if not tgt_label:
            # Nothing to look up; str(None) would otherwise be resolved as
            # the literal label "None".
            continue

        # Target resolution; presumably returns (note_id, how) with a falsy
        # note_id when nothing matches - confirm in app/core/derive_edges.py.
        resolved_id, how = resolve_target(str(tgt_label), idx)
        if not resolved_id:
            continue  # still unresolved

        # Edge update
        new_pl = dict(pl)
        new_pl["target_id"] = resolved_id
        new_pl["resolution"] = how
        new_pl.pop("status", None)

        # Keep the ID stable -> points_for_edges derives the point UUID from
        # edge_id (or a fallback).
        # NOTE(review): for payloads without an edge_id the key generated here
        # may map to a different point than the original unresolved record,
        # which would then remain in the collection - verify against
        # points_for_edges.
        if "edge_id" not in new_pl:
            # Stable key from (kind, src, tgt, optionally seq).
            seq = new_pl.get("seq") or new_pl.get("order") or ""
            new_pl["edge_id"] = f"{kind}:{src}->{resolved_id}#{seq}"

        to_fix.append(new_pl)

        # Only note-level references (not references_at) get a backlink.
        if kind == "references":
            extra = {k: new_pl.get(k) for k in ("raw", "alias", "heading")}
            extra["resolution"] = how
            backlinks.append(_make_backlink(source_note_id=src, target_note_id=resolved_id, extra=extra))

    return to_fix, backlinks, scanned


def _batched(items: List[dict], n: int) -> Iterable[List[dict]]:
    """Yield consecutive slices of ``items`` with at most ``n`` elements (n >= 1)."""
    for start in range(0, len(items), n):
        yield items[start : start + n]


def main():
    """
    CLI entry point: scan unresolved edges, print a JSON summary and - only
    with ``--apply`` - upsert the repaired edges and their backlinks.

    Exits with an argparse usage error when ``--batch`` is < 1 or ``--limit``
    is negative, instead of silently doing nothing or failing mid-run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: Env/COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--apply", action="store_true", help="Änderungen schreiben (ohne Flag = Dry-Run)")
    ap.add_argument("--limit", type=int, default=0, help="Max. Anzahl unaufgelöster Edges bearbeiten (0 = alle)")
    ap.add_argument("--batch", type=int, default=512, help="Upsert-Batchgröße")
    args = ap.parse_args()

    # Validate before touching Qdrant: a negative batch size would make every
    # upsert loop empty (nothing written), zero would crash inside range().
    if args.batch < 1:
        ap.error("--batch must be >= 1")
    if args.limit < 0:
        ap.error("--limit must be >= 0 (0 = all)")

    # Qdrant setup
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    client = get_client(cfg)
    # Among other things this takes care of the 1-D vector collection for edges.
    ensure_collections(client, cfg.prefix, cfg.dim)
    notes_col, _, edges_col = collection_names(cfg.prefix)

    # Load notes & build the index (by_id, by_slug, by_file_slug).
    notes = _load_all_notes(client, notes_col)
    idx = build_note_index(notes)

    # Scan the unresolved edges.
    to_fix, backlinks, processed = _resolve_edges(
        _iter_unresolved_edges(client, edges_col), idx, args.limit
    )

    # Report the result.
    summary = {
        "prefix": cfg.prefix,
        "scanned_unresolved": processed,
        "resolved": len(to_fix),
        "backlinks_to_create": len(backlinks),
        "apply": bool(args.apply),
    }
    print(json.dumps(summary, ensure_ascii=False))

    if not args.apply:
        return

    # Upserts in batches: 1) updates for the repaired edges,
    # 2) backlinks (references only).
    for payloads in (to_fix, backlinks):
        for chunk in _batched(payloads, args.batch):
            # points_for_edges takes care of the edge UUID & dummy vector.
            col, pts = points_for_edges(cfg.prefix, chunk)
            upsert_batch(client, col, pts)


if __name__ == "__main__":
    main()