scripts/resolve_unresolved_references.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
3deb99f053
commit
5478be60c1
208
scripts/resolve_unresolved_references.py
Normal file
208
scripts/resolve_unresolved_references.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
#!/usr/bin/env python3
|
||||
# scripts/resolve_unresolved_references.py
|
||||
#
|
||||
# Zweck
|
||||
# -----
|
||||
# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references")
|
||||
# indem es:
|
||||
# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut,
|
||||
# 2) pro Notiz die Wikilinks im Volltext neu auswertet,
|
||||
# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet,
|
||||
# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht.
|
||||
#
|
||||
# Aufrufparameter
|
||||
# ---------------
|
||||
# --vault /pfad/zum/vault (Erforderlich)
|
||||
# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung)
|
||||
# --prefix <name> (Optional) Override COLLECTION_PREFIX
|
||||
#
|
||||
# Hinweise
|
||||
# --------
|
||||
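# - This script only upserts `references` + `backlink` edges; `references_at` edges are never created here, but stale "unresolved" `references_at` entries are deleted during cleanup.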
|
||||
# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug).
|
||||
# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer.
|
||||
# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.).
|
||||
#
|
||||
# Version
|
||||
# -------
|
||||
# v1.0.0 (2025-09-05)
|
||||
# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved.
|
||||
#
|
||||
# Änderungshinweise
|
||||
# -----------------
|
||||
# - Keine Vorgängerversion (neu).
|
||||
#
|
||||
|
||||
from __future__ import annotations
|
||||
import argparse, glob, json, os, sys
|
||||
from typing import List, Tuple, Dict
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
from app.core.parser import read_markdown
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
|
||||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||||
|
||||
# ---- helpers ----
|
||||
|
||||
def _coerce_parsed(p):
|
||||
"""Erlaubt ParsedNote-Objekt oder (fm, body)-Tuple."""
|
||||
if hasattr(p, "frontmatter") and hasattr(p, "body"):
|
||||
fm = dict(p.frontmatter or {})
|
||||
body = p.body or ""
|
||||
path = getattr(p, "path", None)
|
||||
return fm, body, path
|
||||
if isinstance(p, (list, tuple)) and len(p) >= 2:
|
||||
fm = dict(p[0] or {})
|
||||
body = p[1] or ""
|
||||
return fm, body, None
|
||||
raise TypeError("Unsupported return type from read_markdown")
|
||||
|
||||
def _slugify_filename(path: str) -> str:
|
||||
base = os.path.basename(path).rsplit(".", 1)[0]
|
||||
return base
|
||||
|
||||
def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]:
    """Collect a lightweight stub for every identifiable Markdown note in ``vault``.

    Args:
        vault: Root directory that is searched recursively for ``*.md`` files.
        excludes: Path fragments (forward-slash form); a file is skipped when
            its normalised absolute path contains any of them.

    Returns:
        One dict per note with the keys ``note_id``, ``title``, ``path`` and
        ``fulltext``, ordered by absolute file path. Files whose frontmatter
        has neither ``id`` nor ``note_id`` are skipped.
    """
    pattern = os.path.join(vault, "**", "*.md")
    stubs: List[Dict] = []
    # glob yields files in filesystem-dependent order; sorting keeps the stub
    # list (and everything derived from it, e.g. the summary sample printed by
    # the caller) reproducible between runs and machines.
    for abs_path in sorted(glob.glob(pattern, recursive=True)):
        normalised = abs_path.replace("\\", "/")
        if any(fragment in normalised for fragment in excludes):
            continue

        fm, body, parsed_path = _coerce_parsed(read_markdown(abs_path))
        note_id = fm.get("id") or fm.get("note_id")
        if not note_id:
            # Without an id the note cannot be a link source or target.
            continue

        # Prefer the path reported by the parser, else fall back to the
        # vault-relative path. str() guards against a path-like object, whose
        # two-argument .replace() call below would otherwise raise.
        # NOTE(review): if the parser reports an absolute path, "path" will be
        # absolute as well - confirm against read_markdown.
        rel = str(parsed_path) if parsed_path else os.path.relpath(abs_path, vault)
        stubs.append({
            "note_id": note_id,
            "title": fm.get("title") or _slugify_filename(rel),
            "path": rel.replace("\\", "/"),
            "fulltext": body,
        })
    return stubs
|
||||
|
||||
def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]:
    """Return the edges whose ``kind`` is ``"references"`` or ``"backlink"``.

    The selected edge dicts are passed through untouched (same objects, same
    relative order); every other kind is dropped.
    """
    # Original author's note: full-text references carry no 'seq', which
    # yields the stable edge_id suffix '#'; nothing is modified here.
    wanted_kinds = ("references", "backlink")
    return [edge for edge in edges if edge.get("kind") in wanted_kinds]
|
||||
|
||||
def unique_edges(edges: List[dict]) -> List[dict]:
    """Drop duplicate edges while preserving first-seen order.

    Two edges count as duplicates when their ``kind``, ``source_id``,
    ``target_id`` and ``seq`` values are all equal. Missing keys default to
    ``"edge"`` for ``kind`` and to ``""`` for the other three. The first
    occurrence of each combination is kept.
    """
    emitted = set()
    deduped = []
    for edge in edges:
        identity = (
            edge.get("kind", "edge"),
            edge.get("source_id", ""),
            edge.get("target_id", ""),
            edge.get("seq", ""),
        )
        if identity not in emitted:
            emitted.add(identity)
            deduped.append(edge)
    return deduped
|
||||
|
||||
def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None:
    """Delete stale "unresolved" edges that originate from one note.

    Two groups of points are removed from ``edges_col``:

    * note level: ``kind == "references"`` AND ``source_id == note_id`` AND
      ``status == "unresolved"`` AND ``target_label`` in ``raw_targets``;
    * chunk level: ``kind == "references_at"`` AND ``source_id`` starts with
      ``note_id + "#"`` AND ``status == "unresolved"`` AND ``target_label``
      in ``raw_targets``.

    Args:
        client: Qdrant client; its ``delete`` and ``scroll`` methods are used.
        edges_col: Name of the edges collection.
        note_id: ID of the source note.
        raw_targets: Raw link labels whose unresolved edges should be removed.
            Empty labels and duplicates are ignored; nothing happens when no
            label remains.
    """
    labels = sorted({label for label in raw_targets if label})
    if not labels:
        return

    def _unresolved(kind, *extra):
        # Conditions shared by both deletions, plus kind-specific extras.
        return rest.Filter(
            must=[
                rest.FieldCondition(key="kind", match=rest.MatchValue(value=kind)),
                *extra,
                rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
                rest.FieldCondition(key="target_label", match=rest.MatchAny(any=labels)),
            ]
        )

    # references (note level): source_id can be matched exactly on the server.
    note_filter = _unresolved(
        "references",
        rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
    )
    client.delete(
        collection_name=edges_col,
        points_selector=rest.FilterSelector(filter=note_filter),
        wait=True,
    )

    # references_at (chunk level): the source must START WITH "<note_id>#".
    # MatchText is a full-text/substring match, not a prefix match, so it
    # could also hit chunks of other notes whose id merely ends with this
    # note_id. Instead, scroll the candidates and apply the prefix test
    # client-side, then delete exactly those point ids.
    prefix = f"{note_id}#"
    chunk_filter = _unresolved("references_at")
    stale_ids = []
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=edges_col,
            scroll_filter=chunk_filter,
            limit=256,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            source = (point.payload or {}).get("source_id")
            if isinstance(source, str) and source.startswith(prefix):
                stale_ids.append(point.id)
        if offset is None:
            break

    # Delete only after scrolling has finished so pagination is not disturbed.
    if stale_ids:
        client.delete(
            collection_name=edges_col,
            points_selector=rest.PointIdsList(points=stale_ids),
            wait=True,
        )
|
||||
|
||||
def main():
    """Re-resolve wikilinks for every note in a vault and repair the edge store.

    Command line:
        --vault   path to the vault (required)
        --apply   write changes; without it the run is a dry run that only
                  prints the summary
        --prefix  overrides the collection prefix taken from the environment

    Per note the wikilink edges are derived again, reduced to unique
    ``references``/``backlink`` edges and, with ``--apply``:

    1. stale "unresolved" edges are deleted for those link labels that now
       resolve, and
    2. the freshly derived edges are upserted.

    A JSON summary is printed to stdout at the end. In a dry run the
    ``edge_upserts`` and ``notes_with_unresolved_cleanup`` counters stay 0
    because nothing is written.
    """
    load_dotenv()
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault")
    ap.add_argument("--apply", action="store_true", help="Schreibt Änderungen (ohne Flag: Dry-Run)")
    ap.add_argument("--prefix", help="Override COLLECTION_PREFIX")
    args = ap.parse_args()

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg = QdrantConfig(url=cfg.url, api_key=cfg.api_key, prefix=args.prefix, dim=cfg.dim)
    client = get_client(cfg)
    # NOTE(review): this also runs in a dry run - confirm that creating
    # missing collections without --apply is intended.
    ensure_collections(client, cfg.prefix, cfg.dim)
    _, _, edges_col = collection_names(cfg.prefix)

    # 1) Collect note stubs and build the resolving index.
    notes = iter_note_stubs(args.vault)
    idx = build_note_index(notes)  # (by_id, by_slug, by_file_slug)

    # 2) Per note: re-evaluate the links found in the full text.
    upserts_total = 0
    deletes_total = 0
    details = []
    for n in notes:
        edges = derive_wikilink_edges(n, [], idx)
        edges = unique_edges(filter_only_refs_and_backlinks(edges))

        # Cleanup candidates: raw labels of references that resolve NOW.
        # Edges still marked "unresolved" are skipped on purpose: deleting
        # their old counterparts would drop unresolved `references_at` edges
        # that this script never re-creates.
        resolved_labels = set()
        for e in edges:
            if e.get("kind") != "references" or e.get("status") == "unresolved":
                continue
            label = e.get("raw") or e.get("target_label") or ""
            if label:
                resolved_labels.add(label)

        if args.apply:
            # 3a) Remove the old unresolved edges for labels that now resolve.
            if resolved_labels:
                delete_unresolved_for_note(client, edges_col, n["note_id"], sorted(resolved_labels))
                # Qdrant reports no delete count here, so this counts notes
                # for which a cleanup was issued, not deleted points.
                deletes_total += 1
            # 3b) Upsert the derived references + backlink edges; skipped for
            # notes without edges so that no empty batch is sent.
            if edges:
                col, pts = points_for_edges(cfg.prefix, edges)
                upsert_batch(client, col, pts)
                upserts_total += len(pts)

        details.append({
            "note_id": n["note_id"],
            "new_refs": sum(1 for e in edges if e["kind"] == "references"),
            "new_backlinks": sum(1 for e in edges if e["kind"] == "backlink"),
        })

    print(json.dumps({
        "apply": bool(args.apply),
        "notes_scanned": len(notes),
        "edge_upserts": upserts_total,
        "notes_with_unresolved_cleanup": deletes_total,
        "prefix": cfg.prefix,
        "summary_sample": details[:5]
    }, ensure_ascii=False))
|
||||
|
||||
# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user