244 lines
8.6 KiB
Python
244 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
||
# scripts/resolve_unresolved_references.py
|
||
"""
|
||
resolve_unresolved_references.py — Unaufgelöste Wikilinks in Qdrant nachträglich auflösen
|
||
|
||
Version: 1.0.0 (2025-09-05)
|
||
|
||
Zweck
|
||
------
|
||
- Findet Edges in {prefix}_edges mit payload.status=="unresolved" und versucht, den Zielknoten
|
||
anhand bereits vorhandener Notes in {prefix}_notes aufzulösen.
|
||
- Aktualisiert die Edges (setzt target_id, entfernt status, setzt resolution), und erzeugt
|
||
– NUR für Note-Level 'references' – die symmetrische 'backlink'-Kante.
|
||
|
||
Warum?
|
||
------
|
||
- Beim ersten Import können Links auf (noch) nicht existierende Notizen zeigen.
|
||
- Sobald die Zielnotiz später existiert, kann dieses Skript die Kanten reparieren.
|
||
|
||
Aufruf
|
||
------
|
||
# Dry-Run (Standard):
|
||
python3 -m scripts.resolve_unresolved_references --prefix mindnet
|
||
|
||
# Anwenden:
|
||
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply
|
||
|
||
# Optional: nur X Edges anfassen
|
||
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500
|
||
|
||
Parameter
|
||
---------
|
||
--prefix : Collection-Prefix (Default: aus Env COLLECTION_PREFIX oder "mindnet")
|
||
--apply : Änderungen tatsächlich schreiben (ohne --apply = Dry-Run)
|
||
--limit : Max. Anzahl unaufgelöster Edges, die in diesem Lauf bearbeitet werden (Default: keine Begrenzung)
|
||
--batch : Upsert-Batchgröße (Default: 512)
|
||
|
||
Voraussetzungen / Hinweise
|
||
--------------------------
|
||
- Bitte im aktivierten venv laufen lassen (deine Umgebung: `.venv`).
|
||
- Qdrant-URL/Key/Prefix/Vektor-Dim werden wie üblich aus ENV gelesen (siehe app/core/qdrant.py). # noqa
|
||
- Nutzt die vorhandenen Utilities:
|
||
- app/core/qdrant.py (Client/Collections)
|
||
- app/core/qdrant_points.py (points_for_edges/upsert_batch)
|
||
- app/core/derive_edges.py (build_note_index/resolve_target)
|
||
|
||
Änderungshistorie
|
||
-----------------
|
||
1.0.0 Erstveröffentlichung.
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
from typing import Any, Dict, List, Tuple, Iterable
|
||
|
||
from qdrant_client import QdrantClient
|
||
from qdrant_client.http import models as rest
|
||
|
||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names # :contentReference[oaicite:3]{index=3}
|
||
from app.core.qdrant_points import points_for_edges, upsert_batch # :contentReference[oaicite:4]{index=4}
|
||
from app.core.derive_edges import build_note_index, resolve_target # :contentReference[oaicite:5]{index=5}
|
||
|
||
|
||
def _scroll(client: QdrantClient, **kwargs):
    """Call ``client.scroll(**kwargs)`` and normalise its result to a pair.

    The result shapes handled here are:

    * a tuple ``(points, next_offset)``,
    * a longer tuple ``(points, next_page_offset, ...)`` whose extra items
      are ignored,
    * any non-tuple value, which is treated as the points themselves with
      no continuation offset.

    Returns:
        ``(points, next_offset)``; ``next_offset`` is ``None`` when the
        client returned a non-tuple value.
    """
    result = client.scroll(**kwargs)
    if not isinstance(result, tuple):
        # Non-tuple result: treat conservatively as a single, final page.
        return result, None
    # Only the first two items matter, whatever the tuple length.
    return result[0], result[1]
|
||
|
||
|
||
def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]:
    """Collect the payloads of all points in ``notes_col`` that carry a ``note_id``.

    The collection is scrolled in pages of 1024 points (payload only, no
    vectors) until the scroll reports no further offset.

    Args:
        client: Qdrant client used for scrolling.
        notes_col: Name of the notes collection.

    Returns:
        The payload dicts of every point whose payload has a truthy
        ``note_id``; points without one are skipped.
    """
    collected: List[Dict[str, Any]] = []
    cursor = None
    while True:
        page, cursor = _scroll(
            client,
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=cursor,
        )
        # A missing or empty payload is normalised to {} so .get() is safe.
        payloads = (getattr(point, "payload", {}) or {} for point in page or [])
        collected.extend(payload for payload in payloads if payload.get("note_id"))
        if not cursor:
            return collected
|
||
|
||
|
||
def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]:
    """Yield edge records with ``status == 'unresolved'`` and a string ``target_label``.

    The status condition is passed to the scroll call as a filter; the
    check that ``target_label`` is a string is done here on each returned
    payload. Records are fetched lazily in pages of 1024 (payload only).

    Args:
        client: Qdrant client used for scrolling.
        edges_col: Name of the edges collection.

    Yields:
        The matching records, page by page.
    """
    status_is_unresolved = rest.FieldCondition(
        key="status",
        match=rest.MatchValue(value="unresolved"),
    )
    unresolved_only = rest.Filter(must=[status_is_unresolved])

    cursor = None
    while True:
        page, cursor = _scroll(
            client,
            collection_name=edges_col,
            scroll_filter=unresolved_only,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=cursor,
        )
        for record in page or []:
            payload = getattr(record, "payload", {}) or {}
            if not isinstance(payload.get("target_label"), str):
                continue
            yield record
        if not cursor:
            return
|
||
|
||
|
||
def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
    """Build the payload of a note-level 'backlink' edge (source <- target).

    The direction is reversed relative to the original reference: the
    backlink's ``source_id`` is ``target_note_id`` and its ``target_id`` is
    ``source_note_id``.

    Args:
        source_note_id: Source note of the original reference.
        target_note_id: Resolved target note of the original reference.
        extra: Metadata of the original edge. Only the keys ``raw``,
            ``alias``, ``heading`` and ``resolution`` are carried over, and
            only when present (a present key with value ``None`` is copied
            as is). ``status`` is never copied.

    Returns:
        A new payload dict for the backlink edge.
    """
    carried_over = {
        key: extra[key]
        for key in ("raw", "alias", "heading", "resolution")
        if key in extra
    }
    return {
        "kind": "backlink",
        "source_id": target_note_id,
        "target_id": source_note_id,
        **carried_over,
    }
|
||
|
||
|
||
def main():
    """Command-line entry point: resolve unresolved wikilink edges in Qdrant.

    Steps:
      1. Parse ``--prefix``, ``--apply``, ``--limit`` and ``--batch``.
      2. Load all note payloads and build the lookup index from them.
      3. Scan edges with ``status == 'unresolved'`` and try to resolve each
         target label against the index.
      4. Print a one-line JSON summary to stdout.
      5. With ``--apply`` only: upsert the repaired edges and, for edges of
         kind 'references', the matching 'backlink' edges, in batches.

    All edges are read before anything is written, so the scan never sees
    its own updates.

    Exits with status 2 (argparse usage error) when ``--batch`` is below 1
    or ``--limit`` is negative.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: Env/COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--apply", action="store_true", help="Änderungen schreiben (ohne Flag = Dry-Run)")
    ap.add_argument("--limit", type=int, default=0, help="Max. Anzahl unaufgelöster Edges bearbeiten (0 = alle)")
    ap.add_argument("--batch", type=int, default=512, help="Upsert-Batchgröße")
    args = ap.parse_args()

    # Validate before touching Qdrant. A batch size of 0 would make range()
    # raise ValueError at write time, and a negative one would make the
    # batching loop yield nothing, so --apply would silently write nothing.
    if args.batch < 1:
        ap.error("--batch muss >= 1 sein")
    # A negative limit is truthy and always <= processed, which would abort
    # the scan before the first edge.
    if args.limit < 0:
        ap.error("--limit darf nicht negativ sein (0 = alle)")

    # Qdrant setup; an explicit --prefix overrides the environment value.
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
    notes_col, _, edges_col = collection_names(cfg.prefix)

    # Load notes and build the index used for target resolution.
    notes = _load_all_notes(client, notes_col)
    idx = build_note_index(notes)

    to_fix: List[dict] = []
    backlinks: List[dict] = []
    processed = 0
    resolved = 0

    for rec in _iter_unresolved_edges(client, edges_col):
        if args.limit and processed >= args.limit:
            break
        processed += 1

        pl = dict(rec.payload or {})
        kind = pl.get("kind") or "references"
        src = pl.get("source_id")
        tgt_label = pl.get("target_label") or pl.get("target_id")  # fallback
        if not tgt_label:
            # No usable label: leave the edge unresolved rather than looking
            # up the literal string "None".
            continue

        resolved_id, how = resolve_target(str(tgt_label), idx)
        if not resolved_id:
            continue  # still unresolved

        # Repaired payload: set the target, record how it was resolved, and
        # drop the 'unresolved' status marker.
        new_pl = dict(pl)
        new_pl["target_id"] = resolved_id
        new_pl["resolution"] = how
        new_pl.pop("status", None)
        if "edge_id" not in new_pl:
            # Deterministic key from (kind, source, target, optional sequence).
            # NOTE(review): the original author's comment says points_for_edges
            # derives the point UUID from edge_id. If so, an edge stored
            # without edge_id may be upserted as a NEW point while the old
            # unresolved one remains. Confirm against app/core/qdrant_points.py.
            seq = new_pl.get("seq") or new_pl.get("order") or ""
            new_pl["edge_id"] = f"{kind}:{src}->{resolved_id}#{seq}"

        to_fix.append(new_pl)
        resolved += 1

        # Backlinks only for note-level 'references'. An edge without a
        # source_id cannot have a meaningful reverse edge, so it is skipped.
        if kind == "references" and src:
            # NOTE(review): .get() puts all three keys into `extra` even when
            # absent from the edge, so the backlink payload can carry explicit
            # None values. Kept as in the original; confirm this is intended.
            extra = {k: new_pl.get(k) for k in ("raw", "alias", "heading")}
            extra["resolution"] = how
            backlinks.append(_make_backlink(source_note_id=src, target_note_id=resolved_id, extra=extra))

    # Report the result (also in dry-run mode).
    summary = {
        "prefix": cfg.prefix,
        "scanned_unresolved": processed,
        "resolved": resolved,
        "backlinks_to_create": len(backlinks),
        "apply": bool(args.apply),
    }
    print(json.dumps(summary, ensure_ascii=False))

    if not args.apply:
        return

    def _batched(items: List[dict], n: int) -> Iterable[List[dict]]:
        """Yield consecutive slices of ``items`` with at most ``n`` entries."""
        for i in range(0, len(items), n):
            yield items[i : i + n]

    # 1) Updates for the repaired edges.
    for chunk in _batched(to_fix, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)
        upsert_batch(client, col, pts)

    # 2) Backlinks (only for 'references').
    for chunk in _batched(backlinks, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)
        upsert_batch(client, col, pts)
|
||
|
||
|
||
# Run the CLI only when executed as a script or via ``python -m``, not on import.
if __name__ == "__main__":
    main()
|