#!/usr/bin/env python3
# scripts/resolve_unresolved_references.py
#
# Purpose
# -------
# Repairs "unresolved" wikilinks (edges.kind == "references") after the fact by:
#   1) reading all notes in the vault and building a resolving index (by id/slug/file_slug),
#   2) re-evaluating the wikilinks in each note's full text,
#   3) upserting stable `references` + `backlink` edges for resolvable targets,
#   4) deleting the corresponding "unresolved" `references` edges (and, optionally,
#      "unresolved" `references_at` edges).
#
# Arguments
# ---------
#   --vault /path/to/vault   (required)
#   --apply                  (optional) Without this flag: dry run (summary only)
#   --prefix                 (optional) Override COLLECTION_PREFIX
#
# Notes
# -----
# - This script only upserts `references` + `backlink` edges; `references_at` edges are
#   never written, they are only deleted when they are "unresolved".
# - It uses the same resolver rules as the importer (id, slug(title), file_slug).
# - Edge IDs are stable (kind:source->target#seq) and compatible with the importer.
# - Deleting "unresolved" edges uses Qdrant filters (no "minimum_should" field or similar).
#
# Version
# -------
# v1.0.0 (2025-09-05)
#   - First version: resolve/upsert for references/backlink, targeted cleanup for unresolved.
#
# Changelog
# ---------
# - No previous version (new).
# from __future__ import annotations import argparse, glob, json, os, sys from typing import List, Tuple, Dict from dotenv import load_dotenv from qdrant_client.http import models as rest from app.core.parser import read_markdown from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names from app.core.qdrant_points import points_for_edges, upsert_batch from app.core.derive_edges import build_note_index, derive_wikilink_edges # ---- helpers ---- def _coerce_parsed(p): """Erlaubt ParsedNote-Objekt oder (fm, body)-Tuple.""" if hasattr(p, "frontmatter") and hasattr(p, "body"): fm = dict(p.frontmatter or {}) body = p.body or "" path = getattr(p, "path", None) return fm, body, path if isinstance(p, (list, tuple)) and len(p) >= 2: fm = dict(p[0] or {}) body = p[1] or "" return fm, body, None raise TypeError("Unsupported return type from read_markdown") def _slugify_filename(path: str) -> str: base = os.path.basename(path).rsplit(".", 1)[0] return base def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]: files = [p for p in glob.glob(os.path.join(vault, "**", "*.md"), recursive=True)] out: List[Dict] = [] for abs_path in files: if any(ex in abs_path.replace("\\","/") for ex in excludes): continue parsed = read_markdown(abs_path) fm, body, p = _coerce_parsed(parsed) note_id = fm.get("id") or fm.get("note_id") if not note_id: continue rel = p if p else os.path.relpath(abs_path, vault) out.append({ "note_id": note_id, "title": fm.get("title") or _slugify_filename(rel), "path": rel.replace("\\","/"), "fulltext": body, }) return out def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]: keep = [] for e in edges: k = e.get("kind") if k in ("references", "backlink"): # Für Volltext-refs gibt's keine 'seq' (-> stabiler edge_id Suffix '#') # Alles andere unverändert lassen. 
keep.append(e) return keep def unique_edges(edges: List[dict]) -> List[dict]: seen = set() out = [] for e in edges: k = e.get("kind","edge") s = e.get("source_id","") t = e.get("target_id","") seq = e.get("seq","") key = (k,s,t,seq) if key in seen: continue seen.add(key) out.append(e) return out def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None: """ Löscht "unresolved" references (und optional references_at) aus NOTE-Sicht: - kind=='references' AND source_id==note_id AND status=='unresolved' AND target_label in raw_targets - kind=='references_at' AND source_id startswith note_id+'#' AND status=='unresolved' AND target_label in raw_targets """ if not raw_targets: return # references (note-level) f1 = rest.Filter( must=[ rest.FieldCondition(key="kind", match=rest.MatchValue(value="references")), rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)), rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")), rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)), ] ) client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=f1), wait=True) # references_at (chunk-level) – optionales Aufräumen f2 = rest.Filter( must=[ rest.FieldCondition(key="kind", match=rest.MatchValue(value="references_at")), rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")), # prefix match rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")), rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)), ] ) client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=f2), wait=True) def main(): load_dotenv() ap = argparse.ArgumentParser() ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault") ap.add_argument("--apply", action="store_true", help="Schreibt Änderungen (ohne Flag: Dry-Run)") ap.add_argument("--prefix", help="Override COLLECTION_PREFIX") 
args = ap.parse_args() cfg = QdrantConfig.from_env() if args.prefix: cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim) client = get_client(cfg) ensure_collections(client, cfg.prefix, cfg.dim) _, _, edges_col = collection_names(cfg.prefix) # 1) Stubs sammeln und Index bauen notes = iter_note_stubs(args.vault) idx = build_note_index(notes) # (by_id, by_slug, by_file_slug) # 2) pro Note: Links im Volltext analysieren upserts_total = 0 deletes_total = 0 details = [] for n in notes: edges = derive_wikilink_edges(n, [], idx) # nur references/backlink behalten edges = filter_only_refs_and_backlinks(edges) edges = unique_edges(edges) # Kandidaten für unresolved-Delete (raw labels, die jetzt auflösbar wurden) raw_targets = [] for e in edges: if e.get("kind") == "references": # resolved haben target_id als echte Note-ID; unresolved hätten "status":"unresolved" # Für Delete brauchen wir aber die alten raw labels raw_targets.append(e.get("raw") or e.get("target_label") or "") # 3a) Löschen alter unresolved-refs (nur falls wir wirklich updaten wollen) if args.apply: if raw_targets: before = 0 delete_unresolved_for_note(client, edges_col, n["note_id"], list({r for r in raw_targets if r})) # (Qdrant gibt hier keine Count-Rückgabe – wir zählen pessimistisch nicht.) deletes_total += 1 # Marker pro Note mit Löschung # 3b) Upsert der „richtigen“ references + backlink if args.apply: col, pts = points_for_edges(cfg.prefix, edges) upsert_batch(client, col, pts) upserts_total += len(pts) details.append({ "note_id": n["note_id"], "new_refs": sum(1 for e in edges if e["kind"]=="references"), "new_backlinks": sum(1 for e in edges if e["kind"]=="backlink"), }) print(json.dumps({ "apply": bool(args.apply), "notes_scanned": len(notes), "edge_upserts": upserts_total, "notes_with_unresolved_cleanup": deletes_total, "prefix": cfg.prefix, "summary_sample": details[:5] }, ensure_ascii=False)) if __name__ == "__main__": main()