diff --git a/scripts/resolve_unresolved_references.py b/scripts/resolve_unresolved_references.py new file mode 100644 index 0000000..c86c065 --- /dev/null +++ b/scripts/resolve_unresolved_references.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# scripts/resolve_unresolved_references.py +# +# Zweck +# ----- +# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references") +# indem es: +# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut, +# 2) pro Notiz die Wikilinks im Volltext neu auswertet, +# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet, +# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht. +# +# Aufrufparameter +# --------------- +# --vault /pfad/zum/vault (Erforderlich) +# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung) +# --prefix (Optional) Override COLLECTION_PREFIX +# +# Hinweise +# -------- +# - Dieses Script schreibt (upsertet) NUR `references` + `backlink`; `references_at` wird nie neu erzeugt, lediglich zugehörige "unresolved" `references_at` werden optional gelöscht. +# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug). +# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer. +# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.). +# +# Version +# ------- +# v1.0.0 (2025-09-05) +# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved. +# +# Änderungshinweise +# ----------------- +# - Keine Vorgängerversion (neu).
#

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from typing import List, Tuple, Dict

from dotenv import load_dotenv
from qdrant_client.http import models as rest

from app.core.parser import read_markdown
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
from app.core.qdrant_points import points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, derive_wikilink_edges

# ---- helpers ----


def _coerce_parsed(p):
    """Normalize the result of ``read_markdown`` to ``(frontmatter, body, path)``.

    Two shapes are accepted:

    * an object exposing ``frontmatter`` and ``body`` attributes; its optional
      ``path`` attribute is passed through (``None`` when absent),
    * a list/tuple whose first two items are the frontmatter mapping and the
      body text; no path is available in that case.

    Falsy frontmatter becomes ``{}`` and a falsy body becomes ``""``.

    Raises:
        TypeError: if ``p`` matches neither shape.
    """
    if hasattr(p, "frontmatter") and hasattr(p, "body"):
        return dict(p.frontmatter or {}), p.body or "", getattr(p, "path", None)
    if isinstance(p, (list, tuple)) and len(p) >= 2:
        return dict(p[0] or {}), p[1] or "", None
    raise TypeError("Unsupported return type from read_markdown")


def _slugify_filename(path: str) -> str:
    """Return the base name of ``path`` with its last ``.ext`` suffix removed.

    Despite the name, no further normalization (lower-casing, replacing
    spaces, ...) is applied here.
    """
    return os.path.basename(path).rsplit(".", 1)[0]


def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]:
    """Collect one stub dict per Markdown note below ``vault``.

    Args:
        vault: Root directory that is searched recursively for ``*.md`` files.
        excludes: Path fragments (forward-slash notation); a file whose
            normalized path contains any of them is skipped.

    Returns:
        A list of dicts with the keys ``note_id``, ``title``, ``path`` and
        ``fulltext``. Notes whose frontmatter has neither ``id`` nor
        ``note_id`` are left out.
    """
    stubs: List[Dict] = []
    for abs_path in glob.glob(os.path.join(vault, "**", "*.md"), recursive=True):
        # Normalize separators once so the exclude fragments also match on Windows.
        normalized = abs_path.replace("\\", "/")
        if any(fragment in normalized for fragment in excludes):
            continue

        fm, body, parsed_path = _coerce_parsed(read_markdown(abs_path))
        note_id = fm.get("id") or fm.get("note_id")
        if not note_id:
            continue

        # Prefer the path reported by the parser and fall back to a
        # vault-relative path. os.fspath() keeps a str unchanged and also
        # accepts path-like objects such as pathlib.Path, which would otherwise
        # break the str.replace() call below.
        # NOTE(review): whether the parser's path is vault-relative is not
        # visible from this file -- confirm against read_markdown.
        if parsed_path:
            rel = os.fspath(parsed_path)
        else:
            rel = os.path.relpath(abs_path, vault)

        stubs.append({
            "note_id": note_id,
            "title": fm.get("title") or _slugify_filename(rel),
            "path": rel.replace("\\", "/"),
            "fulltext": body,
        })
    return stubs


def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]:
    """Return the edges whose ``kind`` is ``references`` or ``backlink``.

    The edge dicts themselves are passed through unchanged and in their
    original order; every other kind is dropped.
    """
    return [e for e in edges if e.get("kind") in ("references", "backlink")]


def unique_edges(edges: List[dict]) -> List[dict]:
    """Drop duplicate edges, keeping the first occurrence of each.

    Two edges count as duplicates when their ``(kind, source_id, target_id,
    seq)`` values are equal; missing keys fall back to ``"edge"`` for ``kind``
    and ``""`` for the others.
    """
    seen = set()
    result = []
    for e in edges:
        key = (
            e.get("kind", "edge"),
            e.get("source_id", ""),
            e.get("target_id", ""),
            e.get("seq", ""),
        )
        if key in seen:
            continue
        seen.add(key)
        result.append(e)
    return result


def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None:
    """Delete "unresolved" reference edges that originate from one note.

    Two delete-by-filter requests are sent to ``edges_col``:

    * note level: ``kind == "references"``, ``source_id == note_id``,
      ``status == "unresolved"`` and ``target_label`` in ``raw_targets``;
    * chunk level (optional clean-up): ``kind == "references_at"``,
      ``source_id`` text-matching ``f"{note_id}#"``, ``status == "unresolved"``
      and ``target_label`` in ``raw_targets``.

    Args:
        client: Qdrant client used to issue the deletes.
        edges_col: Name of the edges collection.
        note_id: ID of the note whose outgoing unresolved edges are removed.
        raw_targets: Raw link labels to clean up. Empty labels are ignored and
            duplicates are sent only once; nothing happens when no label
            remains.
    """
    # De-duplicate while preserving order and drop empty labels so that the
    # guard below does not depend on the caller having sanitized its input.
    labels = list(dict.fromkeys(t for t in raw_targets if t))
    if not labels:
        return

    # references (note level)
    note_level = rest.Filter(
        must=[
            rest.FieldCondition(key="kind", match=rest.MatchValue(value="references")),
            rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
            rest.FieldCondition(key="target_label", match=rest.MatchAny(any=labels)),
        ]
    )
    client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=note_level), wait=True)

    # references_at (chunk level) -- optional clean-up.
    # NOTE(review): MatchText is Qdrant's full-text match, not a strict prefix
    # match. The intent is "source_id starts with '<note_id>#'", but depending
    # on whether a full-text index exists on source_id this behaves as a
    # substring or token match, so a chunk ID of another note that merely
    # contains this text could match as well -- confirm against the collection's
    # index configuration.
    chunk_level = rest.Filter(
        must=[
            rest.FieldCondition(key="kind", match=rest.MatchValue(value="references_at")),
            rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")),
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
            rest.FieldCondition(key="target_label", match=rest.MatchAny(any=labels)),
        ]
    )
    client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=chunk_level), wait=True)


def _resolved_reference_labels(edges: List[dict]) -> List[str]:
    """Return the distinct raw labels of ``references`` edges that resolved.

    Edges carrying ``status == "unresolved"`` are skipped: their old
    unresolved counterparts must stay in place, otherwise the clean-up would
    also remove unresolved ``references_at`` edges that this script never
    re-creates. The label is taken from ``raw`` and falls back to
    ``target_label``; empty labels are ignored.
    """
    labels: Dict[str, None] = {}
    for e in edges:
        if e.get("kind") != "references" or e.get("status") == "unresolved":
            continue
        label = e.get("raw") or e.get("target_label") or ""
        if label:
            labels[label] = None
    return list(labels)


def main():
    """Re-resolve wikilinks for every note of a vault and repair the edges.

    Command line:
        --vault   (required) path to the vault directory
        --apply   write changes; without it nothing is deleted or upserted
        --prefix  override the collection prefix taken from the environment

    For each note the wikilink edges are derived again, reduced to unique
    ``references``/``backlink`` edges and -- only with ``--apply`` -- the old
    unresolved edges for labels that now resolve are deleted before the new
    edges are upserted. A JSON summary is printed at the end.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault")
    ap.add_argument("--apply", action="store_true",
                    help="Schreibt Änderungen (ohne Flag: Dry-Run)")
    ap.add_argument("--prefix", help="Override COLLECTION_PREFIX")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim)
    client = get_client(cfg)
    # NOTE(review): this also runs in dry-run mode; if ensure_collections
    # creates missing collections, a dry run is not entirely side-effect free.
    ensure_collections(client, cfg.prefix, cfg.dim)
    _, _, edges_col = collection_names(cfg.prefix)

    # 1) Collect the note stubs and build the resolving index
    #    (by_id, by_slug, by_file_slug according to the original author).
    notes = iter_note_stubs(args.vault)
    idx = build_note_index(notes)

    # 2) Per note: re-evaluate the links found in the full text.
    upserts_total = 0
    deletes_total = 0
    details = []
    for n in notes:
        edges = derive_wikilink_edges(n, [], idx)
        # Keep only references/backlink edges, each of them once.
        edges = unique_edges(filter_only_refs_and_backlinks(edges))

        # Raw labels that have become resolvable; only for these may the old
        # "unresolved" edges be removed.
        resolved_labels = _resolved_reference_labels(edges)

        if args.apply:
            # 3a) Delete the stale unresolved edges.
            if resolved_labels:
                delete_unresolved_for_note(client, edges_col, n["note_id"], resolved_labels)
                # Qdrant returns no count for a filter delete, so this is a
                # per-note marker ("a delete was issued"), not an edge count.
                deletes_total += 1
            # 3b) Upsert the current references + backlink edges.
            col, pts = points_for_edges(cfg.prefix, edges)
            upsert_batch(client, col, pts)
            upserts_total += len(pts)

        details.append({
            "note_id": n["note_id"],
            "new_refs": sum(1 for e in edges if e["kind"] == "references"),
            "new_backlinks": sum(1 for e in edges if e["kind"] == "backlink"),
        })

    print(json.dumps({
        "apply": bool(args.apply),
        "notes_scanned": len(notes),
        "edge_upserts": upserts_total,
        "notes_with_unresolved_cleanup": deletes_total,
        "prefix": cfg.prefix,
        "summary_sample": details[:5]
    }, ensure_ascii=False))


if __name__ == "__main__":
    main()