224 lines
7.3 KiB
Python
224 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
# scripts/resolve_unresolved_references.py
|
|
"""
|
|
resolve_unresolved_references.py — Resolve unresolved wikilinks in Qdrant after the fact

Version: 1.1.0 (Fixed for v2.6 Architecture)

Purpose
-------
- Finds edges in {prefix}_edges with payload.status == "unresolved".
- Builds an in-memory index of all notes (title/alias -> ID).
- Updates the edges (sets target_id, removes status).
- Creates symmetric 'backlink' edges for 'references'.

Usage
-----
python3 -m scripts.resolve_unresolved_references --apply
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import json
|
|
import uuid
|
|
from typing import List, Dict, Any, Iterable
|
|
|
|
from qdrant_client import models
|
|
from app.core.qdrant import QdrantConfig, get_client
|
|
from app.core.qdrant_points import points_for_edges
|
|
|
|
# Logging setup: one module-level logger; the root logger is configured at
# import time so every record carries a timestamp and its level name.
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
|
|
|
|
def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
    """Build the edge payload for the reverse direction of a resolved reference.

    The returned edge runs from ``target_note_id`` back to ``source_note_id``,
    i.e. the two IDs are swapped relative to the forward reference.

    Args:
        source_note_id: ID of the note that holds the original reference.
        target_note_id: ID of the note the reference points to.
        extra: Extra context. Only the ``"alias"`` key is read; it is embedded
            in the ``text`` field, falling back to ``"note"`` when the key is
            missing or falsy.

    Returns:
        A payload dict of kind ``"backlink"`` with scope ``"note"``.
    """
    origin_label = extra.get("alias")
    if not origin_label:
        origin_label = "note"

    # Keyword form keeps the same key order as a literal would.
    return dict(
        source_id=target_note_id,
        target_id=source_note_id,
        kind="backlink",
        scope="note",
        text=f"Backlink from {origin_label}",
        rule_id="derived:backlink",
        confidence=0.9,
    )
|
|
|
|
def build_lookup_index(client, collection_name: str) -> Dict[str, str]:
    """Scroll through all notes and map their normalized names to note IDs.

    Every record in ``collection_name`` is read page by page. For each record
    that carries a ``note_id`` the following keys are registered:

        lower(strip(title)) -> note_id
        lower(strip(alias)) -> note_id   (for every entry in ``aliases``)

    Records without a payload or without ``note_id`` are ignored. ``aliases``
    may be a list, a single string, or missing/null. Aliases that are ``None``
    and names that normalize to an empty string are skipped. When two notes
    share a normalized name, the one scrolled last wins.

    Args:
        client: Object exposing a Qdrant-style ``scroll(...)`` method that
            returns ``(records, next_offset)``.
        collection_name: Name of the notes collection to scan.

    Returns:
        Dict mapping each normalized title/alias to its note ID.
    """
    logger.info("Building lookup index from existing notes...")
    lookup: Dict[str, str] = {}
    seen_note_ids = set()  # for an accurate "unique notes" figure in the log

    next_offset = None
    while True:
        records, next_offset = client.scroll(
            collection_name=collection_name,
            limit=1000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )

        for record in records:
            payload = record.payload or {}
            note_id = payload.get("note_id")
            if not note_id:
                continue
            seen_note_ids.add(note_id)

            # 1. Title first, so that the note's own aliases are written after it.
            names = []
            title = payload.get("title")
            if title:
                names.append(title)

            # 2. Aliases (WP-11). ``or []`` also covers an explicit null value,
            #    which ``.get("aliases", [])`` would pass through as None.
            aliases = payload.get("aliases") or []
            if isinstance(aliases, str):
                aliases = [aliases]
            names.extend(alias for alias in aliases if alias is not None)

            for name in names:
                key = str(name).lower().strip()
                if key:  # never index the empty string
                    lookup[key] = note_id

        # Qdrant signals the last page with a null offset.
        if next_offset is None:
            break

    logger.info(
        "Index built. Mapped %d terms to %d unique notes.",
        len(lookup),
        len(seen_note_ids),
    )
    return lookup
|
|
|
|
def _scroll_unresolved_edges(client, edges_col: str, limit: int) -> List[Any]:
    """Return edge points whose payload has ``status == "unresolved"`` (at most ``limit`` if > 0)."""
    unresolved_filter = models.Filter(
        must=[
            models.FieldCondition(key="status", match=models.MatchValue(value="unresolved"))
        ]
    )

    edges: List[Any] = []
    next_page = None
    while True:
        page, next_page = client.scroll(
            collection_name=edges_col,
            scroll_filter=unresolved_filter,
            limit=500,
            with_payload=True,
            # Vectors are loaded so the later upsert can write them back
            # instead of replacing them with an empty vector.
            with_vectors=True,
            offset=next_page,
        )
        edges.extend(page)
        if next_page is None or (limit > 0 and len(edges) >= limit):
            break

    return edges[:limit] if limit > 0 else edges


def _plan_resolutions(unresolved_edges: Iterable[Any], lookup_index: Dict[str, str]):
    """Match unresolved edges against the index; return ``(edge_updates, backlink_payloads)``."""
    to_fix: List[Dict[str, Any]] = []
    backlinks: List[Dict[str, Any]] = []

    for pt in unresolved_edges:
        pl = pt.payload or {}

        # For unresolved links the searched name is taken from 'target_id'
        # (the parser stores the slug/title there until it is resolved).
        candidate = pl.get("target_id")
        if candidate is None:
            # str(None) would be looked up as the literal name "none".
            continue

        target_nid = lookup_index.get(str(candidate).lower().strip())
        if not target_nid:
            continue

        # Hit: update the existing point in place (same point ID).
        new_pl = dict(pl)
        new_pl["target_id"] = target_nid
        new_pl.pop("status", None)  # no status -> edge counts as resolved
        new_pl["resolution"] = "healed_by_script"
        to_fix.append(
            {"id": pt.id, "payload": new_pl, "vector": getattr(pt, "vector", None)}
        )

        # Backlinks are only derived for plain references, and only when the
        # forward edge actually names its source.
        source_nid = pl.get("source_id")
        if pl.get("kind") == "references" and source_nid:
            backlinks.append(
                _make_backlink(
                    source_note_id=source_nid,
                    target_note_id=target_nid,
                    extra={"alias": candidate},
                )
            )

    return to_fix, backlinks


def _upsert_in_batches(client, collection_name: str, points: List[Any], batch_size: int) -> None:
    """Upsert ``points`` into ``collection_name`` in slices of ``batch_size``."""
    for start in range(0, len(points), batch_size):
        client.upsert(
            collection_name=collection_name,
            points=points[start:start + batch_size],
        )


def main():
    """CLI entry point: resolve unresolved edges and create backlinks.

    Steps:
        1. Build a name -> note ID index from ``{prefix}_notes``.
        2. Scroll ``{prefix}_edges`` for points with ``status == "unresolved"``.
        3. Resolve each edge's ``target_id`` through the index.
        4. With ``--apply``: upsert the healed edges (same point IDs) and the
           derived backlink edges. Without it, only the counts are logged.

    Exits via ``parser.error`` when ``--batch`` is not a positive integer.
    Returns ``None`` in all other cases, including when the index cannot be
    built (the error is logged with its traceback).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", default=None, help="Collection prefix")
    parser.add_argument("--apply", action="store_true", help="Write changes to DB")
    parser.add_argument("--limit", type=int, default=0, help="Max edges to process (0=all)")
    parser.add_argument("--batch", type=int, default=100, help="Upsert batch size")
    args = parser.parse_args()

    # A zero step crashes range(); a negative one silently yields no batches.
    if args.batch < 1:
        parser.error("--batch must be a positive integer")

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix

    client = get_client(cfg)
    edges_col = f"{cfg.prefix}_edges"
    notes_col = f"{cfg.prefix}_notes"

    # 1. Build the index
    try:
        lookup_index = build_lookup_index(client, notes_col)
    except Exception as e:  # top-level boundary: log with traceback and stop
        logger.exception("Failed to build index: %s", e)
        return

    # 2. Find unresolved edges
    logger.info("Scanning for unresolved edges in %s...", edges_col)
    unresolved_edges = _scroll_unresolved_edges(client, edges_col, args.limit)
    logger.info("Found %d unresolved edges.", len(unresolved_edges))

    # 3. Resolve
    to_fix, backlinks = _plan_resolutions(unresolved_edges, lookup_index)
    logger.info("Resolvable: %d/%d", len(to_fix), len(unresolved_edges))

    if not args.apply:
        logger.info("DRY RUN. Use --apply to execute.")
        return

    # 4. Write
    if to_fix:
        logger.info("Updating %d edges...", len(to_fix))
        # upsert replaces the whole point, so the vector read during the
        # scroll is written back; an empty vector is only the fallback.
        points_to_upsert = [
            models.PointStruct(
                id=fix["id"],
                payload=fix["payload"],
                vector=fix["vector"] if fix["vector"] is not None else {},
            )
            for fix in to_fix
        ]
        _upsert_in_batches(client, edges_col, points_to_upsert, args.batch)

    if backlinks:
        logger.info("Creating %d backlinks...", len(backlinks))
        # The shared helper builds the points (and their IDs) for the edges collection.
        col, bl_points = points_for_edges(backlinks, cfg.prefix)
        _upsert_in_batches(client, col, bl_points, args.batch)

    logger.info("Done.")
|
|
|
|
if __name__ == "__main__":
    # Run the resolver only when executed as a script (or via ``python -m``),
    # not when the module is imported.
    main()