mindnet/scripts/resolve_unresolved_references.py
Lars df33293621
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
scripts/resolve_unresolved_references.py aktualisiert
2025-09-05 19:16:27 +02:00

244 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# scripts/resolve_unresolved_references.py
"""
resolve_unresolved_references.py — Unaufgelöste Wikilinks in Qdrant nachträglich auflösen
Version: 1.0.0 (2025-09-05)
Zweck
------
- Findet Edges in {prefix}_edges mit payload.status=="unresolved" und versucht, den Zielknoten
anhand bereits vorhandener Notes in {prefix}_notes aufzulösen.
- Aktualisiert die Edges (setzt target_id, entfernt status, setzt resolution), und erzeugt
NUR für Note-Level 'references' die symmetrische 'backlink'-Kante.
Warum?
------
- Beim ersten Import können Links auf (noch) nicht existierende Notizen zeigen.
- Sobald die Zielnotiz später existiert, kann dieses Skript die Kanten reparieren.
Aufruf
------
# Dry-Run (Standard):
python3 -m scripts.resolve_unresolved_references --prefix mindnet
# Anwenden:
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply
# Optional: nur X Edges anfassen
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500
Parameter
---------
--prefix : Collection-Prefix (Default: aus Env COLLECTION_PREFIX oder "mindnet")
--apply : Änderungen tatsächlich schreiben (ohne --apply = Dry-Run)
--limit : Max. Anzahl unaufgelöster Edges, die in diesem Lauf bearbeitet werden (Default: keine Begrenzung)
--batch : Upsert-Batchgröße (Default: 512)
Voraussetzungen / Hinweise
--------------------------
- Bitte im aktivierten venv laufen lassen (deine Umgebung: `.venv`).
- Qdrant-URL/Key/Prefix/Vektor-Dim werden wie üblich aus ENV gelesen (siehe app/core/qdrant.py). # noqa
- Nutzt die vorhandenen Utilities:
- app/core/qdrant.py (Client/Collections)
- app/core/qdrant_points.py (points_for_edges/upsert_batch)
- app/core/derive_edges.py (build_note_index/resolve_target)
Änderungshistorie
-----------------
1.0.0 Erstveröffentlichung.
"""
from __future__ import annotations
import argparse
import json
from typing import Any, Dict, List, Tuple, Iterable
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names # :contentReference[oaicite:3]{index=3}
from app.core.qdrant_points import points_for_edges, upsert_batch # :contentReference[oaicite:4]{index=4}
from app.core.derive_edges import build_note_index, resolve_target # :contentReference[oaicite:5]{index=5}
def _scroll(client: QdrantClient, **kwargs):
"""
Wrapper um qdrant_client.scroll() für unterschiedliche Client-Versionen:
neuere: (points, next_offset)
ältere: (points, next_page_offset, _)
"""
res = client.scroll(**kwargs)
if isinstance(res, tuple):
if len(res) == 2:
points, next_off = res
else:
# ältere Signatur: (points, next_off, _)
points, next_off, _ = res[0], res[1], res[2]
else:
# sehr alte Clients -> konservativ behandeln
points, next_off = res, None
return points, next_off
def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]:
    """Page through *notes_col* and collect every payload that has a ``note_id``.

    Returns the raw payload dicts; per the collection schema these are
    expected to carry fields like note_id, title, path, etc.
    """
    collected: List[Dict[str, Any]] = []
    offset = None
    while True:
        points, offset = _scroll(
            client,
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=offset,
        )
        for point in points or []:
            payload = getattr(point, "payload", {}) or {}
            # Skip records without a note_id -- they cannot be indexed.
            if payload.get("note_id"):
                collected.append(payload)
        if not offset:
            break
    return collected
def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]:
    """Yield every edge record with payload.status == 'unresolved' and a
    string-valued 'target_label'."""
    unresolved_filter = rest.Filter(
        must=[
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
        ]
    )
    offset = None
    while True:
        points, offset = _scroll(
            client,
            collection_name=edges_col,
            scroll_filter=unresolved_filter,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=offset,
        )
        for point in points or []:
            payload = getattr(point, "payload", {}) or {}
            # Without a string target_label there is nothing to resolve against.
            if isinstance(payload.get("target_label"), str):
                yield point
        if not offset:
            break
def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
"""
Baue eine 'backlink'-Edge-Payload source <- target (note-level).
"""
e = {
"kind": "backlink",
"source_id": target_note_id,
"target_id": source_note_id,
}
# Metafelder aus dem Original übernehmen (ohne status)
copy_keys = ["raw", "alias", "heading", "resolution"]
for k in copy_keys:
if k in extra:
e[k] = extra[k]
return e
def main():
    """CLI entry point: repair 'unresolved' wikilink edges against existing notes.

    Dry-run by default -- prints a JSON summary and only writes with --apply.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", help="Collection-Prefix (Default: Env/COLLECTION_PREFIX oder 'mindnet')")
    parser.add_argument("--apply", action="store_true", help="Änderungen schreiben (ohne Flag = Dry-Run)")
    parser.add_argument("--limit", type=int, default=0, help="Max. Anzahl unaufgelöster Edges bearbeiten (0 = alle)")
    parser.add_argument("--batch", type=int, default=512, help="Upsert-Batchgröße")
    args = parser.parse_args()

    # Qdrant setup: config from env, optional prefix override via CLI.
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    client = get_client(cfg)
    # Ensures (among other things) the 1-dim dummy-vector collection for edges.
    ensure_collections(client, cfg.prefix, cfg.dim)
    notes_col, _, edges_col = collection_names(cfg.prefix)

    # Load notes and build the lookup index (by_id, by_slug, by_file_slug).
    idx = build_note_index(_load_all_notes(client, notes_col))

    # Scan unresolved edges and try to resolve each target label.
    repaired: List[dict] = []
    backlinks: List[dict] = []
    scanned = 0
    fixed = 0
    for record in _iter_unresolved_edges(client, edges_col):
        if args.limit and scanned >= args.limit:
            break
        scanned += 1
        payload = dict(record.payload or {})
        kind = payload.get("kind") or "references"
        source_id = payload.get("source_id")
        label = payload.get("target_label") or payload.get("target_id")  # fallback
        # Target resolution against the note index.
        resolved_id, how = resolve_target(str(label), idx)
        if not resolved_id:
            continue  # stays unresolved for a later run
        # Edge update: set target, record how it was resolved, drop the flag.
        updated = dict(payload)
        updated["target_id"] = resolved_id
        updated["resolution"] = how
        updated.pop("status", None)
        # Keep the point ID stable: points_for_edges derives a UUID from edge_id.
        if "edge_id" not in updated:
            seq = updated.get("seq") or updated.get("order") or ""
            updated["edge_id"] = f"{kind}:{source_id}->{resolved_id}#{seq}"
        repaired.append(updated)
        fixed += 1
        # Only note-level 'references' (not references_at) get a symmetric backlink.
        if kind == "references":
            extra = {key: updated.get(key) for key in ("raw", "alias", "heading")}
            extra["resolution"] = how
            backlinks.append(_make_backlink(source_note_id=source_id, target_note_id=resolved_id, extra=extra))

    # Report what would be (or is about to be) written.
    summary = {
        "prefix": cfg.prefix,
        "scanned_unresolved": scanned,
        "resolved": fixed,
        "backlinks_to_create": len(backlinks),
        "apply": bool(args.apply),
    }
    print(json.dumps(summary, ensure_ascii=False))
    if not args.apply:
        return

    def _batched(items: List[dict], size: int) -> Iterable[List[dict]]:
        # Yield consecutive slices of at most *size* items.
        for start in range(0, len(items), size):
            yield items[start : start + size]

    # 1) Upsert the repaired edges (points_for_edges adds edge UUID + dummy vector).
    for chunk in _batched(repaired, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)
        upsert_batch(client, col, pts)
    # 2) Upsert the new backlinks (references only).
    for chunk in _batched(backlinks, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)
        upsert_batch(client, col, pts)
# Script entry point (run via `python3 -m scripts.resolve_unresolved_references`).
if __name__ == "__main__":
    main()