From df332936210fe806260fd43323ebcfc683cd2e68 Mon Sep 17 00:00:00 2001 From: Lars Date: Fri, 5 Sep 2025 19:16:27 +0200 Subject: [PATCH] scripts/resolve_unresolved_references.py aktualisiert --- scripts/resolve_unresolved_references.py | 383 +++++++++++++---------- 1 file changed, 209 insertions(+), 174 deletions(-) diff --git a/scripts/resolve_unresolved_references.py b/scripts/resolve_unresolved_references.py index c86c065..970f336 100644 --- a/scripts/resolve_unresolved_references.py +++ b/scripts/resolve_unresolved_references.py @@ -1,208 +1,243 @@ #!/usr/bin/env python3 # scripts/resolve_unresolved_references.py -# -# Zweck -# ----- -# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references") -# indem es: -# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut, -# 2) pro Notiz die Wikilinks im Volltext neu auswertet, -# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet, -# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht. -# -# Aufrufparameter -# --------------- -# --vault /pfad/zum/vault (Erforderlich) -# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung) -# --prefix (Optional) Override COLLECTION_PREFIX -# -# Hinweise -# -------- -# - Dieses Script fasst NUR `references` + `backlink` an (keine `references_at`). -# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug). -# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer. -# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.). -# -# Version -# ------- -# v1.0.0 (2025-09-05) -# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved. -# -# Änderungshinweise -# ----------------- -# - Keine Vorgängerversion (neu). 
-# +""" +resolve_unresolved_references.py — Unaufgelöste Wikilinks in Qdrant nachträglich auflösen +Version: 1.0.0 (2025-09-05) + +Zweck +------ +- Findet Edges in {prefix}_edges mit payload.status=="unresolved" und versucht, den Zielknoten + anhand bereits vorhandener Notes in {prefix}_notes aufzulösen. +- Aktualisiert die Edges (setzt target_id, entfernt status, setzt resolution), und erzeugt + – NUR für Note-Level 'references' – die symmetrische 'backlink'-Kante. + +Warum? +------ +- Beim ersten Import können Links auf (noch) nicht existierende Notizen zeigen. +- Sobald die Zielnotiz später existiert, kann dieses Skript die Kanten reparieren. + +Aufruf +------ + # Dry-Run (Standard): + python3 -m scripts.resolve_unresolved_references --prefix mindnet + + # Anwenden: + python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply + + # Optional: nur X Edges anfassen + python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500 + +Parameter +--------- +--prefix : Collection-Prefix (Default: aus Env COLLECTION_PREFIX oder "mindnet") +--apply : Änderungen tatsächlich schreiben (ohne --apply = Dry-Run) +--limit : Max. Anzahl unaufgelöster Edges, die in diesem Lauf bearbeitet werden (Default: 0 = keine Begrenzung) +--batch : Upsert-Batchgröße (Default: 512) + +Voraussetzungen / Hinweise +-------------------------- +- Bitte im aktivierten venv laufen lassen (deine Umgebung: `.venv`). +- Qdrant-URL/Key/Prefix/Vektor-Dim werden wie üblich aus ENV gelesen (siehe app/core/qdrant.py). # noqa +- Nutzt die vorhandenen Utilities: + - app/core/qdrant.py (Client/Collections) + - app/core/qdrant_points.py (points_for_edges/upsert_batch) + - app/core/derive_edges.py (build_note_index/resolve_target) + +Änderungshistorie +----------------- +1.0.0 Erstveröffentlichung. 
+""" from __future__ import annotations -import argparse, glob, json, os, sys -from typing import List, Tuple, Dict -from dotenv import load_dotenv +import argparse +import json +from typing import Any, Dict, List, Tuple, Iterable + +from qdrant_client import QdrantClient from qdrant_client.http import models as rest -from app.core.parser import read_markdown -from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names -from app.core.qdrant_points import points_for_edges, upsert_batch -from app.core.derive_edges import build_note_index, derive_wikilink_edges +from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names # :contentReference[oaicite:3]{index=3} +from app.core.qdrant_points import points_for_edges, upsert_batch # :contentReference[oaicite:4]{index=4} +from app.core.derive_edges import build_note_index, resolve_target # :contentReference[oaicite:5]{index=5} -# ---- helpers ---- -def _coerce_parsed(p): - """Erlaubt ParsedNote-Objekt oder (fm, body)-Tuple.""" - if hasattr(p, "frontmatter") and hasattr(p, "body"): - fm = dict(p.frontmatter or {}) - body = p.body or "" - path = getattr(p, "path", None) - return fm, body, path - if isinstance(p, (list, tuple)) and len(p) >= 2: - fm = dict(p[0] or {}) - body = p[1] or "" - return fm, body, None - raise TypeError("Unsupported return type from read_markdown") - -def _slugify_filename(path: str) -> str: - base = os.path.basename(path).rsplit(".", 1)[0] - return base - -def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]: - files = [p for p in glob.glob(os.path.join(vault, "**", "*.md"), recursive=True)] - out: List[Dict] = [] - for abs_path in files: - if any(ex in abs_path.replace("\\","/") for ex in excludes): - continue - parsed = read_markdown(abs_path) - fm, body, p = _coerce_parsed(parsed) - note_id = fm.get("id") or fm.get("note_id") - if not note_id: - continue - rel = p if p else 
os.path.relpath(abs_path, vault) - out.append({ - "note_id": note_id, - "title": fm.get("title") or _slugify_filename(rel), - "path": rel.replace("\\","/"), - "fulltext": body, - }) - return out - -def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]: - keep = [] - for e in edges: - k = e.get("kind") - if k in ("references", "backlink"): - # Für Volltext-refs gibt's keine 'seq' (-> stabiler edge_id Suffix '#') - # Alles andere unverändert lassen. - keep.append(e) - return keep - -def unique_edges(edges: List[dict]) -> List[dict]: - seen = set() - out = [] - for e in edges: - k = e.get("kind","edge") - s = e.get("source_id","") - t = e.get("target_id","") - seq = e.get("seq","") - key = (k,s,t,seq) - if key in seen: - continue - seen.add(key) - out.append(e) - return out - -def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None: +def _scroll(client: QdrantClient, **kwargs): """ - Löscht "unresolved" references (und optional references_at) aus NOTE-Sicht: - - kind=='references' AND source_id==note_id AND status=='unresolved' AND target_label in raw_targets - - kind=='references_at' AND source_id startswith note_id+'#' AND status=='unresolved' AND target_label in raw_targets + Wrapper um qdrant_client.scroll() für unterschiedliche Client-Versionen: + neuere: (points, next_offset) + ältere: (points, next_page_offset, _) """ - if not raw_targets: - return - # references (note-level) - f1 = rest.Filter( + res = client.scroll(**kwargs) + if isinstance(res, tuple): + if len(res) == 2: + points, next_off = res + else: + # ältere Signatur: (points, next_off, _) + points, next_off, _ = res[0], res[1], res[2] + else: + # sehr alte Clients -> konservativ behandeln + points, next_off = res, None + return points, next_off + + +def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]: + notes: List[Dict[str, Any]] = [] + next_off = None + while True: + pts, next_off = _scroll( + client, + 
collection_name=notes_col, + with_payload=True, + with_vectors=False, + limit=1024, + offset=next_off, + ) + for p in pts or []: + pl = getattr(p, "payload", {}) or {} + # Erwartet Felder: note_id, title, path etc. (gemäß Schema) # :contentReference[oaicite:6]{index=6} + if pl.get("note_id"): + notes.append(pl) + if not next_off: + break + return notes + + +def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]: + """ + Liefert alle Edge-Records mit payload.status == 'unresolved' und 'target_label' (string). + """ + f = rest.Filter( must=[ - rest.FieldCondition(key="kind", match=rest.MatchValue(value="references")), - rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)), rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")), - rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)), ] ) - client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=f1), wait=True) + next_off = None + while True: + pts, next_off = _scroll( + client, + collection_name=edges_col, + scroll_filter=f, + with_payload=True, + with_vectors=False, + limit=1024, + offset=next_off, + ) + for p in pts or []: + pl = getattr(p, "payload", {}) or {} + if isinstance(pl.get("target_label"), str): + yield p + if not next_off: + break + + +def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]: + """ + Baue eine 'backlink'-Edge-Payload source <- target (note-level). 
+ """ + e = { + "kind": "backlink", + "source_id": target_note_id, + "target_id": source_note_id, + } + # Metafelder aus dem Original übernehmen (ohne status) + copy_keys = ["raw", "alias", "heading", "resolution"] + for k in copy_keys: + if k in extra: + e[k] = extra[k] + return e - # references_at (chunk-level) – optionales Aufräumen - f2 = rest.Filter( - must=[ - rest.FieldCondition(key="kind", match=rest.MatchValue(value="references_at")), - rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")), # prefix match - rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")), - rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)), - ] - ) - client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=f2), wait=True) def main(): - load_dotenv() ap = argparse.ArgumentParser() - ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault") - ap.add_argument("--apply", action="store_true", help="Schreibt Änderungen (ohne Flag: Dry-Run)") - ap.add_argument("--prefix", help="Override COLLECTION_PREFIX") + ap.add_argument("--prefix", help="Collection-Prefix (Default: Env/COLLECTION_PREFIX oder 'mindnet')") + ap.add_argument("--apply", action="store_true", help="Änderungen schreiben (ohne Flag = Dry-Run)") + ap.add_argument("--limit", type=int, default=0, help="Max. Anzahl unaufgelöster Edges bearbeiten (0 = alle)") + ap.add_argument("--batch", type=int, default=512, help="Upsert-Batchgröße") args = ap.parse_args() + # Qdrant-Setup cfg = QdrantConfig.from_env() if args.prefix: - cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim) + cfg.prefix = args.prefix client = get_client(cfg) - ensure_collections(client, cfg.prefix, cfg.dim) - _, _, edges_col = collection_names(cfg.prefix) + ensure_collections(client, cfg.prefix, cfg.dim) # sorgt u. a. 
für 1D-Vektor-Collection bei Edges :contentReference[oaicite:7]{index=7} + notes_col, _, edges_col = collection_names(cfg.prefix) # :contentReference[oaicite:8]{index=8} - # 1) Stubs sammeln und Index bauen - notes = iter_note_stubs(args.vault) - idx = build_note_index(notes) # (by_id, by_slug, by_file_slug) + # Notes laden & Index bauen + notes = _load_all_notes(client, notes_col) + idx = build_note_index(notes) # (by_id, by_slug, by_file_slug) :contentReference[oaicite:9]{index=9} - # 2) pro Note: Links im Volltext analysieren - upserts_total = 0 - deletes_total = 0 - details = [] - for n in notes: - edges = derive_wikilink_edges(n, [], idx) - # nur references/backlink behalten - edges = filter_only_refs_and_backlinks(edges) - edges = unique_edges(edges) + # Unresolved-Edges scannen + to_fix: List[dict] = [] + backlinks: List[dict] = [] + processed = 0 + resolved = 0 - # Kandidaten für unresolved-Delete (raw labels, die jetzt auflösbar wurden) - raw_targets = [] - for e in edges: - if e.get("kind") == "references": - # resolved haben target_id als echte Note-ID; unresolved hätten "status":"unresolved" - # Für Delete brauchen wir aber die alten raw labels - raw_targets.append(e.get("raw") or e.get("target_label") or "") + for rec in _iter_unresolved_edges(client, edges_col): + if args.limit and processed >= args.limit: + break + processed += 1 - # 3a) Löschen alter unresolved-refs (nur falls wir wirklich updaten wollen) - if args.apply: - if raw_targets: - before = 0 - delete_unresolved_for_note(client, edges_col, n["note_id"], list({r for r in raw_targets if r})) - # (Qdrant gibt hier keine Count-Rückgabe – wir zählen pessimistisch nicht.) 
- deletes_total += 1 # Marker pro Note mit Löschung - # 3b) Upsert der „richtigen“ references + backlink - if args.apply: - col, pts = points_for_edges(cfg.prefix, edges) - upsert_batch(client, col, pts) - upserts_total += len(pts) + pl = dict(rec.payload or {}) + kind = pl.get("kind") or "references" + src = pl.get("source_id") + tgt_label = pl.get("target_label") or pl.get("target_id") # Fallback - details.append({ - "note_id": n["note_id"], - "new_refs": sum(1 for e in edges if e["kind"]=="references"), - "new_backlinks": sum(1 for e in edges if e["kind"]=="backlink"), - }) + # Zielauflösung + resolved_id, how = resolve_target(str(tgt_label), idx) # :contentReference[oaicite:10]{index=10} + if not resolved_id: + continue # weiterhin unresolved - print(json.dumps({ - "apply": bool(args.apply), - "notes_scanned": len(notes), - "edge_upserts": upserts_total, - "notes_with_unresolved_cleanup": deletes_total, + # Edge-Update + new_pl = dict(pl) + new_pl["target_id"] = resolved_id + new_pl["resolution"] = how + if "status" in new_pl: + del new_pl["status"] + # ID stabil lassen -> points_for_edges erzeugt UUID aus edge_id/Fallback :contentReference[oaicite:11]{index=11} + if "edge_id" not in new_pl: + # stabiler Key aus (kind, src, tgt, evtl. 
seq) + seq = new_pl.get("seq") or new_pl.get("order") or "" + new_pl["edge_id"] = f"{kind}:{src}->{resolved_id}#{seq}" + + to_fix.append(new_pl) + resolved += 1 + + # Nur bei Note-Level references (nicht references_at) -> Backlink erzeugen + if kind == "references": + extra = {k: new_pl.get(k) for k in ("raw", "alias", "heading")} + extra["resolution"] = how + backlinks.append(_make_backlink(source_note_id=src, target_note_id=resolved_id, extra=extra)) + + # Ergebnis ausgeben + summary = { "prefix": cfg.prefix, - "summary_sample": details[:5] - }, ensure_ascii=False)) + "scanned_unresolved": processed, + "resolved": resolved, + "backlinks_to_create": len(backlinks), + "apply": bool(args.apply), + } + print(json.dumps(summary, ensure_ascii=False)) + + if not args.apply: + return + + # Upserts (in Batches) + def _batched(items: List[dict], n: int) -> Iterable[List[dict]]: + for i in range(0, len(items), n): + yield items[i : i + n] + + # 1) Updates für reparierte Edges + for chunk in _batched(to_fix, args.batch): + col, pts = points_for_edges(cfg.prefix, chunk) # sorgt für Edge-UUID & Dummy-Vector :contentReference[oaicite:12]{index=12} + upsert_batch(client, col, pts) + + # 2) Backlinks (nur references) + for chunk in _batched(backlinks, args.batch): + col, pts = points_for_edges(cfg.prefix, chunk) + upsert_batch(client, col, pts) + if __name__ == "__main__": main()