scripts/resolve_unresolved_references.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
a202c53594
commit
df33293621
|
|
@ -1,208 +1,243 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# scripts/resolve_unresolved_references.py
|
# scripts/resolve_unresolved_references.py
|
||||||
#
|
"""
|
||||||
# Zweck
|
resolve_unresolved_references.py — Unaufgelöste Wikilinks in Qdrant nachträglich auflösen
|
||||||
# -----
|
|
||||||
# Repariert nachträglich "unresolved" Wikilinks (edges.kind == "references")
|
|
||||||
# indem es:
|
|
||||||
# 1) alle Notizen im Vault einliest und einen Resolving-Index (by id/slug/file_slug) aufbaut,
|
|
||||||
# 2) pro Notiz die Wikilinks im Volltext neu auswertet,
|
|
||||||
# 3) für auflösbare Ziele stabile `references` + `backlink`-Kanten upsertet,
|
|
||||||
# 4) dazugehörige "unresolved" `references` (und optionale "unresolved" `references_at`) löscht.
|
|
||||||
#
|
|
||||||
# Aufrufparameter
|
|
||||||
# ---------------
|
|
||||||
# --vault /pfad/zum/vault (Erforderlich)
|
|
||||||
# --apply (Optional) Ohne Flag: Dry-Run (nur Zusammenfassung)
|
|
||||||
# --prefix <name> (Optional) Override COLLECTION_PREFIX
|
|
||||||
#
|
|
||||||
# Hinweise
|
|
||||||
# --------
|
|
||||||
# - Dieses Script fasst NUR `references` + `backlink` an (keine `references_at`).
|
|
||||||
# - Es nutzt dieselben Resolver-Regeln wie der Importer (id, slug(title), file_slug).
|
|
||||||
# - Edge-IDs sind stabil (kind:source->target#seq) und kompatibel mit dem Importer.
|
|
||||||
# - Für das Löschen "unresolved" nutzt es Qdrant-Filter (kein "minimum_should"-Feld o.ä.).
|
|
||||||
#
|
|
||||||
# Version
|
|
||||||
# -------
|
|
||||||
# v1.0.0 (2025-09-05)
|
|
||||||
# - Erste Version: Resolve/Upsert für references/backlink, targeted cleanup für unresolved.
|
|
||||||
#
|
|
||||||
# Änderungshinweise
|
|
||||||
# -----------------
|
|
||||||
# - Keine Vorgängerversion (neu).
|
|
||||||
#
|
|
||||||
|
|
||||||
|
Version: 1.0.0 (2025-09-05)
|
||||||
|
|
||||||
|
Zweck
|
||||||
|
------
|
||||||
|
- Findet Edges in {prefix}_edges mit payload.status=="unresolved" und versucht, den Zielknoten
|
||||||
|
anhand bereits vorhandener Notes in {prefix}_notes aufzulösen.
|
||||||
|
- Aktualisiert die Edges (setzt target_id, entfernt status, setzt resolution), und erzeugt
|
||||||
|
– NUR für Note-Level 'references' – die symmetrische 'backlink'-Kante.
|
||||||
|
|
||||||
|
Warum?
|
||||||
|
------
|
||||||
|
- Beim ersten Import können Links auf (noch) nicht existierende Notizen zeigen.
|
||||||
|
- Sobald die Zielnotiz später existiert, kann dieses Skript die Kanten reparieren.
|
||||||
|
|
||||||
|
Aufruf
|
||||||
|
------
|
||||||
|
# Dry-Run (Standard):
|
||||||
|
python3 -m scripts.resolve_unresolved_references --prefix mindnet
|
||||||
|
|
||||||
|
# Anwenden:
|
||||||
|
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply
|
||||||
|
|
||||||
|
# Optional: nur X Edges anfassen
|
||||||
|
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500
|
||||||
|
|
||||||
|
Parameter
|
||||||
|
---------
|
||||||
|
--prefix : Collection-Prefix (Default: aus Env COLLECTION_PREFIX oder "mindnet")
|
||||||
|
--apply : Änderungen tatsächlich schreiben (ohne --apply = Dry-Run)
|
||||||
|
--limit : Max. Anzahl unaufgelöster Edges, die in diesem Lauf bearbeitet werden (Default: keine Begrenzung)
|
||||||
|
--batch : Upsert-Batchgröße (Default: 512)
|
||||||
|
|
||||||
|
Voraussetzungen / Hinweise
|
||||||
|
--------------------------
|
||||||
|
- Bitte im aktivierten venv laufen lassen (deine Umgebung: `.venv`).
|
||||||
|
- Qdrant-URL/Key/Prefix/Vektor-Dim werden wie üblich aus ENV gelesen (siehe app/core/qdrant.py). # noqa
|
||||||
|
- Nutzt die vorhandenen Utilities:
|
||||||
|
- app/core/qdrant.py (Client/Collections)
|
||||||
|
- app/core/qdrant_points.py (points_for_edges/upsert_batch)
|
||||||
|
- app/core/derive_edges.py (build_note_index/resolve_target)
|
||||||
|
|
||||||
|
Änderungshistorie
|
||||||
|
-----------------
|
||||||
|
1.0.0 Erstveröffentlichung.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, glob, json, os, sys
|
|
||||||
from typing import List, Tuple, Dict
|
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
import argparse
|
||||||
|
import json
|
||||||
|
from typing import Any, Dict, List, Tuple, Iterable
|
||||||
|
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
from qdrant_client.http import models as rest
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
from app.core.parser import read_markdown
|
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names # :contentReference[oaicite:3]{index=3}
|
||||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names
|
from app.core.qdrant_points import points_for_edges, upsert_batch # :contentReference[oaicite:4]{index=4}
|
||||||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
from app.core.derive_edges import build_note_index, resolve_target # :contentReference[oaicite:5]{index=5}
|
||||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
|
||||||
|
|
||||||
# ---- helpers ----
|
|
||||||
|
|
||||||
def _coerce_parsed(p):
|
def _scroll(client: QdrantClient, **kwargs):
|
||||||
"""Erlaubt ParsedNote-Objekt oder (fm, body)-Tuple."""
|
|
||||||
if hasattr(p, "frontmatter") and hasattr(p, "body"):
|
|
||||||
fm = dict(p.frontmatter or {})
|
|
||||||
body = p.body or ""
|
|
||||||
path = getattr(p, "path", None)
|
|
||||||
return fm, body, path
|
|
||||||
if isinstance(p, (list, tuple)) and len(p) >= 2:
|
|
||||||
fm = dict(p[0] or {})
|
|
||||||
body = p[1] or ""
|
|
||||||
return fm, body, None
|
|
||||||
raise TypeError("Unsupported return type from read_markdown")
|
|
||||||
|
|
||||||
def _slugify_filename(path: str) -> str:
|
|
||||||
base = os.path.basename(path).rsplit(".", 1)[0]
|
|
||||||
return base
|
|
||||||
|
|
||||||
def iter_note_stubs(vault: str, excludes=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[Dict]:
    """Collect lightweight note records from every markdown file in *vault*.

    Skips any path containing one of the *excludes* fragments and any note
    whose frontmatter carries no id. Each stub holds note_id, title,
    vault-relative path and the full body text.
    """
    stubs: List[Dict] = []
    pattern = os.path.join(vault, "**", "*.md")
    for abs_path in glob.glob(pattern, recursive=True):
        normalized = abs_path.replace("\\", "/")
        if any(fragment in normalized for fragment in excludes):
            continue
        fm, body, rel_from_parser = _coerce_parsed(read_markdown(abs_path))
        note_id = fm.get("id") or fm.get("note_id")
        if not note_id:
            # Notes without an id cannot be indexed/resolved — skip them.
            continue
        rel = rel_from_parser if rel_from_parser else os.path.relpath(abs_path, vault)
        stubs.append(
            {
                "note_id": note_id,
                "title": fm.get("title") or _slugify_filename(rel),
                "path": rel.replace("\\", "/"),
                "fulltext": body,
            }
        )
    return stubs
|
|
||||||
|
|
||||||
def filter_only_refs_and_backlinks(edges: List[dict]) -> List[dict]:
    """Keep only note-level link edges (kind 'references' or 'backlink').

    Full-text refs carry no 'seq' (stable edge_id suffix '#'); everything
    else is dropped unchanged.
    """
    return [e for e in edges if e.get("kind") in ("references", "backlink")]
|
|
||||||
|
|
||||||
def unique_edges(edges: List[dict]) -> List[dict]:
    """De-duplicate edges on (kind, source_id, target_id, seq), keeping order.

    The first occurrence of each key wins; later duplicates are dropped.
    """
    deduped: List[dict] = []
    seen_keys = set()
    for edge in edges:
        key = (
            edge.get("kind", "edge"),
            edge.get("source_id", ""),
            edge.get("target_id", ""),
            edge.get("seq", ""),
        )
        if key not in seen_keys:
            seen_keys.add(key)
            deduped.append(edge)
    return deduped
|
|
||||||
|
|
||||||
def delete_unresolved_for_note(client, edges_col: str, note_id: str, raw_targets: List[str]) -> None:
|
|
||||||
"""
|
"""
|
||||||
Löscht "unresolved" references (und optional references_at) aus NOTE-Sicht:
|
Wrapper um qdrant_client.scroll() für unterschiedliche Client-Versionen:
|
||||||
- kind=='references' AND source_id==note_id AND status=='unresolved' AND target_label in raw_targets
|
neuere: (points, next_offset)
|
||||||
- kind=='references_at' AND source_id startswith note_id+'#' AND status=='unresolved' AND target_label in raw_targets
|
ältere: (points, next_page_offset, _)
|
||||||
"""
|
"""
|
||||||
if not raw_targets:
|
res = client.scroll(**kwargs)
|
||||||
return
|
if isinstance(res, tuple):
|
||||||
# references (note-level)
|
if len(res) == 2:
|
||||||
f1 = rest.Filter(
|
points, next_off = res
|
||||||
|
else:
|
||||||
|
# ältere Signatur: (points, next_off, _)
|
||||||
|
points, next_off, _ = res[0], res[1], res[2]
|
||||||
|
else:
|
||||||
|
# sehr alte Clients -> konservativ behandeln
|
||||||
|
points, next_off = res, None
|
||||||
|
return points, next_off
|
||||||
|
|
||||||
|
|
||||||
|
def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]:
    """Page through *notes_col* and return every payload carrying a note_id.

    Vectors are not fetched; pagination runs in chunks of 1024 until the
    client reports no further offset.
    """
    collected: List[Dict[str, Any]] = []
    offset = None
    while True:
        points, offset = _scroll(
            client,
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=offset,
        )
        for point in points or []:
            payload = getattr(point, "payload", {}) or {}
            # Expected payload fields per schema: note_id, title, path, ...
            if payload.get("note_id"):
                collected.append(payload)
        if not offset:
            break
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]:
    """Yield every edge record with payload.status == 'unresolved' whose
    target_label is a string.

    Filtering on status happens server-side via a Qdrant filter; the
    target_label type check happens client-side.
    """
    unresolved_filter = rest.Filter(
        must=[
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
        ]
    )
    offset = None
    while True:
        points, offset = _scroll(
            client,
            collection_name=edges_col,
            scroll_filter=unresolved_filter,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=offset,
        )
        for point in points or []:
            payload = getattr(point, "payload", {}) or {}
            if isinstance(payload.get("target_label"), str):
                yield point
        if not offset:
            break
|
||||||
|
|
||||||
|
|
||||||
|
def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Baue eine 'backlink'-Edge-Payload source <- target (note-level).
|
||||||
|
"""
|
||||||
|
e = {
|
||||||
|
"kind": "backlink",
|
||||||
|
"source_id": target_note_id,
|
||||||
|
"target_id": source_note_id,
|
||||||
|
}
|
||||||
|
# Metafelder aus dem Original übernehmen (ohne status)
|
||||||
|
copy_keys = ["raw", "alias", "heading", "resolution"]
|
||||||
|
for k in copy_keys:
|
||||||
|
if k in extra:
|
||||||
|
e[k] = extra[k]
|
||||||
|
return e
|
||||||
|
|
||||||
# references_at (chunk-level) – optionales Aufräumen
|
|
||||||
f2 = rest.Filter(
|
|
||||||
must=[
|
|
||||||
rest.FieldCondition(key="kind", match=rest.MatchValue(value="references_at")),
|
|
||||||
rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")), # prefix match
|
|
||||||
rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
|
|
||||||
rest.FieldCondition(key="target_label", match=rest.MatchAny(any=raw_targets)),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=f2), wait=True)
|
|
||||||
|
|
||||||
def main():
    """CLI entry point: resolve 'unresolved' reference edges in Qdrant.

    Dry-run by default; with --apply the repaired edges (and the symmetric
    note-level backlinks) are upserted in batches.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (Default: Env/COLLECTION_PREFIX oder 'mindnet')")
    ap.add_argument("--apply", action="store_true", help="Änderungen schreiben (ohne Flag = Dry-Run)")
    ap.add_argument("--limit", type=int, default=0, help="Max. Anzahl unaufgelöster Edges bearbeiten (0 = alle)")
    ap.add_argument("--batch", type=int, default=512, help="Upsert-Batchgröße")
    args = ap.parse_args()

    # Qdrant setup (URL/key/prefix/dim come from the environment).
    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
    notes_col, _, edges_col = collection_names(cfg.prefix)

    # Load notes and build the resolver index (by_id, by_slug, by_file_slug).
    notes = _load_all_notes(client, notes_col)
    idx = build_note_index(notes)

    # Scan unresolved edges.
    to_fix: List[dict] = []
    backlinks: List[dict] = []
    processed = 0
    resolved = 0

    for rec in _iter_unresolved_edges(client, edges_col):
        if args.limit and processed >= args.limit:
            break
        processed += 1

        pl = dict(rec.payload or {})
        kind = pl.get("kind") or "references"
        src = pl.get("source_id")
        tgt_label = pl.get("target_label") or pl.get("target_id")  # Fallback

        # Target resolution against the note index.
        resolved_id, how = resolve_target(str(tgt_label), idx)
        if not resolved_id:
            continue  # still unresolved

        # Edge update: set target, record how it was resolved, drop status.
        new_pl = dict(pl)
        new_pl["target_id"] = resolved_id
        new_pl["resolution"] = how
        if "status" in new_pl:
            del new_pl["status"]
        # Keep the point ID stable -> points_for_edges derives a UUID from
        # edge_id (or its fallback key).
        if "edge_id" not in new_pl:
            # Stable key from (kind, src, tgt, optional seq).
            seq = new_pl.get("seq") or new_pl.get("order") or ""
            new_pl["edge_id"] = f"{kind}:{src}->{resolved_id}#{seq}"

        to_fix.append(new_pl)
        resolved += 1

        # Only note-level 'references' (not references_at) get a backlink.
        if kind == "references":
            extra = {k: new_pl.get(k) for k in ("raw", "alias", "heading")}
            extra["resolution"] = how
            backlinks.append(_make_backlink(source_note_id=src, target_note_id=resolved_id, extra=extra))

    # Report the result.
    summary = {
        "prefix": cfg.prefix,
        "scanned_unresolved": processed,
        "resolved": resolved,
        "backlinks_to_create": len(backlinks),
        "apply": bool(args.apply),
    }
    print(json.dumps(summary, ensure_ascii=False))

    if not args.apply:
        return

    # Upserts (in batches).
    def _batched(items: List[dict], n: int) -> Iterable[List[dict]]:
        for i in range(0, len(items), n):
            yield items[i : i + n]

    # 1) Updates for the repaired edges.
    for chunk in _batched(to_fix, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)  # edge UUID + dummy vector
        upsert_batch(client, col, pts)

    # 2) Backlinks (references only).
    for chunk in _batched(backlinks, args.batch):
        col, pts = points_for_edges(cfg.prefix, chunk)
        upsert_batch(client, col, pts)


if __name__ == "__main__":
    main()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user