mindnet/scripts/resolve_unresolved_references.py
Lars 5478be60c1
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
scripts/resolve_unresolved_references.py hinzugefügt
2025-09-05 12:37:59 +02:00

209 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# scripts/resolve_unresolved_references.py
#
# Zweck
# -----
# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references")
# indem es:
# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut,
# 2) pro Notiz die Wikilinks im Volltext neu auswertet,
# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet,
# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht.
#
# Aufrufparameter
# ---------------
# --vault /pfad/zum/vault (Erforderlich)
# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung)
# --prefix <name> (Optional) Override COLLECTION_PREFIX
#
# Hinweise
# --------
# - Dieses Script fasst NUR `references` + `backlink` an (keine `references_at`).
# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug).
# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer.
# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.).
#
# Version
# -------
# v1.0.0 (2025-09-05)
# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved.
#
# Änderungshinweise
# -----------------
# - Keine Vorgängerversion (neu).
#
from __future__ import annotations
import argparse, glob, json, os, sys
from typing import List, Tuple, Dict
from dotenv import load_dotenv
from qdrant_client.http import models as rest
from app.core.parser import read_markdown
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
from app.core.qdrant_points import points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, derive_wikilink_edges
# ---- helpers ----
def _coerce_parsed(p):
"""Erlaubt ParsedNote-Objekt oder (fm, body)-Tuple."""
if hasattr(p, "frontmatter") and hasattr(p, "body"):
fm = dict(p.frontmatter or {})
body = p.body or ""
path = getattr(p, "path", None)
return fm, body, path
if isinstance(p, (list, tuple)) and len(p) >= 2:
fm = dict(p[0] or {})
body = p[1] or ""
return fm, body, None
raise TypeError("Unsupported return type from read_markdown")
def _slugify_filename(path: str) -> str:
base = os.path.basename(path).rsplit(".", 1)[0]
return base
def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]:
    """Scan *vault* recursively for markdown notes and return lightweight stubs.

    Each stub carries ``note_id``, ``title``, the vault-relative ``path``
    (forward slashes) and the full body text (``fulltext``). Files without
    an ``id``/``note_id`` frontmatter key are skipped, as is anything
    located under one of the *excludes* directory fragments.

    Fixes over the previous version: the glob result is iterated directly
    instead of being copied into a throwaway list, and the parsed-path
    variable no longer shadows the glob loop variable name.
    """
    stubs: List[Dict] = []
    pattern = os.path.join(vault, "**", "*.md")
    for abs_path in glob.glob(pattern, recursive=True):
        # Normalize separators so the exclude fragments match on Windows too.
        if any(ex in abs_path.replace("\\", "/") for ex in excludes):
            continue
        fm, body, parsed_path = _coerce_parsed(read_markdown(abs_path))
        note_id = fm.get("id") or fm.get("note_id")
        if not note_id:
            # Only notes with a stable id can take part in edge resolution.
            continue
        rel = parsed_path if parsed_path else os.path.relpath(abs_path, vault)
        stubs.append({
            "note_id": note_id,
            "title": fm.get("title") or _slugify_filename(rel),
            "path": rel.replace("\\", "/"),
            "fulltext": body,
        })
    return stubs
def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]:
    """Return only the 'references' and 'backlink' edges, order preserved.

    Full-text references carry no 'seq' (stable edge_id suffix '#');
    the edges themselves are passed through unchanged.
    """
    wanted = ("references", "backlink")
    return [edge for edge in edges if edge.get("kind") in wanted]
def unique_edges(edges: List[dict]) -> List[dict]:
    """Drop duplicate edges, keeping the first occurrence of each.

    Edge identity is the (kind, source_id, target_id, seq) tuple, with
    the same fallbacks the importer uses for missing fields.
    """
    deduped: List[dict] = []
    seen_keys = set()
    for edge in edges:
        key = (
            edge.get("kind", "edge"),
            edge.get("source_id", ""),
            edge.get("target_id", ""),
            edge.get("seq", ""),
        )
        if key not in seen_keys:
            seen_keys.add(key)
            deduped.append(edge)
    return deduped
def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None:
    """Delete stale "unresolved" reference edges for one note.

    From the note's point of view this removes:
      - kind=='references'    where source_id == note_id
      - kind=='references_at' where source_id starts with note_id + '#'
    both restricted to status=='unresolved' and a target_label contained
    in *raw_targets*. No-op when *raw_targets* is empty.
    """
    if not raw_targets:
        return

    def _unresolved_filter(kind: str, source_cond) -> rest.Filter:
        # Shared shape of both cleanup filters; only kind and the
        # source_id condition differ.
        return rest.Filter(must=[
            rest.FieldCondition(key="kind", match=rest.MatchValue(value=kind)),
            source_cond,
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
            rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)),
        ])

    # Note-level references.
    note_filter = _unresolved_filter(
        "references",
        rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
    )
    client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=note_filter), wait=True)

    # Chunk-level references_at, optional cleanup.
    # NOTE(review): original relied on MatchText as a prefix match on
    # "<note_id>#"; Qdrant's MatchText is full-text matching — confirm
    # this actually behaves as a prefix filter with the configured index.
    chunk_filter = _unresolved_filter(
        "references_at",
        rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")),
    )
    client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=chunk_filter), wait=True)
def main():
    """CLI entry point: re-resolve wikilinks and repair unresolved edges.

    Dry-run by default; with ``--apply`` it deletes stale "unresolved"
    'references' edges and upserts the freshly resolved references and
    backlinks. Always prints a JSON summary to stdout.

    Fixes over the previous version: removed the unused ``before = 0``
    local, merged the two duplicated ``if args.apply:`` branches, and
    made the delete-target list deterministic (sorted instead of raw
    set ordering).
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault")
    ap.add_argument("--apply", action="store_true", help="Schreibt Änderungen (ohne Flag: Dry-Run)")
    ap.add_argument("--prefix", help="Override COLLECTION_PREFIX")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim)
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
    _, _, edges_col = collection_names(cfg.prefix)

    # 1) Collect note stubs and build the resolving index (by id/slug/file_slug).
    notes = iter_note_stubs(args.vault)
    idx = build_note_index(notes)

    # 2) Per note: re-derive wikilink edges from the full text.
    upserts_total = 0
    deletes_total = 0
    details = []
    for n in notes:
        edges = unique_edges(filter_only_refs_and_backlinks(derive_wikilink_edges(n, [], idx)))

        # Raw labels of all 'references' edges: these identify the old
        # "unresolved" points superseded by the newly resolved edges.
        raw_targets = {
            e.get("raw") or e.get("target_label") or ""
            for e in edges
            if e.get("kind") == "references"
        }
        raw_targets.discard("")

        if args.apply:
            # 3a) Remove stale unresolved refs first. Qdrant's delete API
            # returns no count, so we only count notes with a cleanup.
            if raw_targets:
                delete_unresolved_for_note(client, edges_col, n["note_id"], sorted(raw_targets))
                deletes_total += 1
            # 3b) Upsert the "correct" references + backlinks.
            col, pts = points_for_edges(cfg.prefix, edges)
            upsert_batch(client, col, pts)
            upserts_total += len(pts)

        details.append({
            "note_id": n["note_id"],
            "new_refs": sum(1 for e in edges if e["kind"] == "references"),
            "new_backlinks": sum(1 for e in edges if e["kind"] == "backlink"),
        })

    print(json.dumps({
        "apply": bool(args.apply),
        "notes_scanned": len(notes),
        "edge_upserts": upserts_total,
        "notes_with_unresolved_cleanup": deletes_total,
        "prefix": cfg.prefix,
        "summary_sample": details[:5],
    }, ensure_ascii=False))


if __name__ == "__main__":
    main()