209 lines
8.1 KiB
Python
209 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
||
# scripts/resolve_unresolved_references.py
|
||
#
|
||
# Zweck
|
||
# -----
|
||
# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references")
|
||
# indem es:
|
||
# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut,
|
||
# 2) pro Notiz die Wikilinks im Volltext neu auswertet,
|
||
# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet,
|
||
# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht.
|
||
#
|
||
# Aufrufparameter
|
||
# ---------------
|
||
# --vault /pfad/zum/vault (Erforderlich)
|
||
# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung)
|
||
# --prefix <name> (Optional) Override COLLECTION_PREFIX
|
||
#
|
||
# Hinweise
|
||
# --------
|
||
# - This script only upserts `references` + `backlink` edges; `references_at` edges are never written, but stale "unresolved" ones are deleted during cleanup.
|
||
# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug).
|
||
# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer.
|
||
# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.).
|
||
#
|
||
# Version
|
||
# -------
|
||
# v1.0.0 (2025-09-05)
|
||
# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved.
|
||
#
|
||
# Änderungshinweise
|
||
# -----------------
|
||
# - Keine Vorgängerversion (neu).
|
||
#
|
||
|
||
from __future__ import annotations
|
||
import argparse, glob, json, os, sys
|
||
from typing import List, Tuple, Dict
|
||
|
||
from dotenv import load_dotenv
|
||
from qdrant_client.http import models as rest
|
||
|
||
from app.core.parser import read_markdown
|
||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
|
||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||
|
||
# ---- helpers ----
|
||
|
||
def _coerce_parsed(p):
    """Normalize a ``read_markdown`` result to a ``(frontmatter, body, path)`` triple.

    Two input shapes are accepted:

    * an object exposing ``frontmatter`` and ``body`` attributes; its optional
      ``path`` attribute is passed through (``None`` when absent),
    * a list/tuple with at least two items, read as ``(frontmatter, body)``;
      the returned path is then always ``None``.

    Falsy frontmatter becomes an empty dict (otherwise it is copied with
    ``dict(...)``) and a falsy body becomes ``""``.

    Raises:
        TypeError: if ``p`` matches neither shape.
    """
    looks_like_note = hasattr(p, "frontmatter") and hasattr(p, "body")
    if looks_like_note:
        return dict(p.frontmatter or {}), p.body or "", getattr(p, "path", None)

    is_pair = isinstance(p, (list, tuple)) and len(p) >= 2
    if not is_pair:
        raise TypeError("Unsupported return type from read_markdown")

    raw_meta, raw_text = p[0], p[1]
    return dict(raw_meta or {}), raw_text or "", None
|
||
|
||
def _slugify_filename(path: str) -> str:
    """Return the file name of ``path`` with everything after its last dot removed.

    Despite the name, no slug normalization (lower-casing, character
    replacement) happens here: only the directory part and the final
    ``.suffix`` are stripped.
    """
    filename = os.path.basename(path)
    stem, dot, _suffix = filename.rpartition(".")
    # Without any dot, rpartition leaves the head empty, so fall back to the full name.
    return stem if dot else filename
|
||
|
||
def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]:
    """Collect one stub dict per Markdown note below ``vault`` that has an id.

    Every ``*.md`` file found recursively under ``vault`` is parsed with
    ``read_markdown``. Files without an ``id`` (or ``note_id``) in their
    frontmatter are skipped, as are files whose vault-relative path contains
    one of the ``excludes`` fragments.

    Args:
        vault: Root directory of the vault.
        excludes: Path fragments (forward slashes) marking files to skip. They
            are matched against ``"/" + <path relative to vault>``, so a
            directory *above* the vault that happens to carry one of these
            names no longer excludes the whole vault.

    Returns:
        A list of dicts with the keys ``note_id``, ``title``, ``path`` and
        ``fulltext``, ordered by sorted file path.
    """
    pattern = os.path.join(vault, "**", "*.md")
    stubs: List[Dict] = []
    # glob returns entries in filesystem-dependent order; sort for reproducible runs.
    for abs_path in sorted(glob.glob(pattern, recursive=True)):
        # Anchor with a leading "/" so fragments like "/.obsidian/" also match
        # directories located directly in the vault root.
        anchored_rel = "/" + os.path.relpath(abs_path, vault).replace("\\", "/")
        if any(fragment in anchored_rel for fragment in excludes):
            continue

        fm, body, parsed_path = _coerce_parsed(read_markdown(abs_path))
        note_id = fm.get("id") or fm.get("note_id")
        if not note_id:
            continue

        # Prefer the path reported by the parser; otherwise use the vault-relative one.
        rel = parsed_path if parsed_path else os.path.relpath(abs_path, vault)
        stubs.append({
            "note_id": note_id,
            "title": fm.get("title") or _slugify_filename(rel),
            "path": rel.replace("\\", "/"),
            "fulltext": body,
        })
    return stubs
|
||
|
||
def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]:
    """Return only the edges whose ``kind`` is ``"references"`` or ``"backlink"``.

    Input order is preserved and the surviving edge dicts are passed through
    untouched (same objects, no copies). Edges of any other kind, or without
    a ``kind`` key, are dropped.
    """
    # Original author's note: full-text references carry no 'seq', so their
    # stable edge id ends in a bare '#'. Nothing is rewritten here.
    wanted_kinds = ("references", "backlink")
    return [edge for edge in edges if edge.get("kind") in wanted_kinds]
|
||
|
||
def unique_edges(edges: List[dict]) -> List[dict]:
    """Drop duplicate edges, keeping the first occurrence of each.

    Two edges count as duplicates when their ``(kind, source_id, target_id,
    seq)`` tuples are equal. Missing keys default to ``"edge"`` for ``kind``
    and ``""`` for the other three. The order of first occurrences is
    preserved and the kept dicts are the original objects.
    """
    first_by_identity = {}
    for edge in edges:
        identity = (
            edge.get("kind", "edge"),
            edge.get("source_id", ""),
            edge.get("target_id", ""),
            edge.get("seq", ""),
        )
        # setdefault stores only the first edge seen per identity; dicts keep
        # insertion order, so first-occurrence order is retained.
        first_by_identity.setdefault(identity, edge)
    return list(first_by_identity.values())
|
||
|
||
def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None:
    """Delete "unresolved" reference edges of one note for the given raw link labels.

    Two filtered deletes are sent to the ``edges_col`` collection, in order:

    1. note level:  ``kind == "references"`` and ``source_id == note_id``
    2. chunk level: ``kind == "references_at"`` and ``source_id`` text-matching
       ``note_id + "#"`` (optional cleanup)

    Both additionally require ``status == "unresolved"`` and ``target_label``
    to be one of ``raw_targets``. Each call waits for completion
    (``wait=True``). Nothing is sent when ``raw_targets`` is empty.
    """
    if not raw_targets:
        return

    def _purge(kind, source_match):
        # Build the filter for one edge kind and delete every matching point.
        unresolved_for_kind = rest.Filter(
            must=[
                rest.FieldCondition(key="kind", match=rest.MatchValue(value=kind)),
                rest.FieldCondition(key="source_id", match=source_match),
                rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
                rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)),
            ]
        )
        client.delete(
            collection_name=edges_col,
            points_selector=rest.FilterSelector(filter=unresolved_for_kind),
            wait=True,
        )

    # Note-level references: exact match on the note id.
    _purge("references", rest.MatchValue(value=note_id))

    # Chunk-level references_at: intended to match "<note_id>#..." source ids.
    # NOTE(review): the original comment calls this a prefix match, but
    # MatchText is a text match, not an anchored prefix test - confirm it
    # cannot hit chunks of other notes whose id contains this note id.
    _purge("references_at", rest.MatchText(text=f"{note_id}#"))
|
||
|
||
def main():
    """CLI entry point: re-resolve wikilinks and repair note-level edges.

    Steps:

    1. Parse ``--vault``, ``--apply`` and ``--prefix`` and build the Qdrant
       client and config (``--prefix`` overrides the configured prefix).
    2. Read all note stubs from the vault and build the resolving index.
    3. For each note, derive its wikilink edges from the full text and keep
       only unique ``references`` / ``backlink`` edges that are NOT marked
       ``status == "unresolved"``.
    4. With ``--apply``: delete the stale "unresolved" edges for the labels
       that resolved now, then upsert the resolved edges. Without it, nothing
       is deleted or upserted (dry run).
    5. Print a one-line JSON summary to stdout.

    Only resolved edges drive cleanup and upsert. Labels that are still
    unresolved must keep their existing "unresolved" edges: their chunk-level
    ``references_at`` entries would otherwise be deleted and never recreated,
    because this script does not regenerate ``references_at``.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault")
    ap.add_argument("--apply", action="store_true", help="Schreibt Änderungen (ohne Flag: Dry-Run)")
    ap.add_argument("--prefix", help="Override COLLECTION_PREFIX")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim)
    client = get_client(cfg)
    # NOTE(review): this also runs in dry-run mode - confirm that
    # ensure_collections has no unwanted write side effects there.
    ensure_collections(client, cfg.prefix, cfg.dim)
    _, _, edges_col = collection_names(cfg.prefix)

    # 1) Collect the note stubs and build the index (by_id, by_slug, by_file_slug).
    notes = iter_note_stubs(args.vault)
    idx = build_note_index(notes)

    # 2) Per note: analyse the links found in the full text.
    upserts_total = 0
    deletes_total = 0
    details = []
    for n in notes:
        edges = derive_wikilink_edges(n, [], idx)
        edges = unique_edges(filter_only_refs_and_backlinks(edges))
        # Keep only edges that resolved; still-unresolved ones stay untouched in the store.
        edges = [e for e in edges if e.get("status") != "unresolved"]

        # Raw labels of references that are resolvable now; these identify the
        # old "unresolved" edges to remove. Empty labels are ignored.
        raw_targets = set()
        for e in edges:
            if e.get("kind") != "references":
                continue
            label = e.get("raw") or e.get("target_label")
            if label:
                raw_targets.add(label)

        if args.apply:
            # 3a) Remove the stale unresolved edges first ...
            if raw_targets:
                delete_unresolved_for_note(client, edges_col, n["note_id"], list(raw_targets))
                # The delete call reports no count here, so this counts notes, not edges.
                deletes_total += 1
            # 3b) ... then upsert the resolved references + backlinks.
            if edges:
                col, pts = points_for_edges(cfg.prefix, edges)
                upsert_batch(client, col, pts)
                upserts_total += len(pts)

        details.append({
            "note_id": n["note_id"],
            "new_refs": sum(1 for e in edges if e["kind"] == "references"),
            "new_backlinks": sum(1 for e in edges if e["kind"] == "backlink"),
        })

    print(json.dumps({
        "apply": bool(args.apply),
        "notes_scanned": len(notes),
        "edge_upserts": upserts_total,
        "notes_with_unresolved_cleanup": deletes_total,
        "prefix": cfg.prefix,
        "summary_sample": details[:5]
    }, ensure_ascii=False))
|
||
|
||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|