diff --git a/scripts/backfill_edges.py b/scripts/backfill_edges.py
index ca181da..2482b25 100644
--- a/scripts/backfill_edges.py
+++ b/scripts/backfill_edges.py
@@ -1,29 +1,54 @@
+# scripts/backfill_edges.py
 #!/usr/bin/env python3
 from __future__ import annotations
-import argparse, json, glob, os, sys
-from app.core.parser import read_markdown, build_note_payload
-from app.core.validate_note import validate_note_payload
+import argparse, glob, json, os
+from typing import List, Tuple
+
+from app.core.parser import read_markdown  # liefert (frontmatter_dict, body_str)
 from app.core.qdrant import QdrantConfig, get_client, ensure_collections
 from app.core.qdrant_points import points_for_edges, upsert_batch
 from app.core.derive_edges import build_note_index, derive_wikilink_edges
 
-def iter_notes(vault: str, excludes: list[str]) -> list[tuple[dict, list[dict]]]:
+def make_note_stub(path: str, fm: dict, body: str) -> dict:
+    """
+    Minimaler Note-Payload nur für die Link-Auflösung und Kanten:
+    - note_id (aus Frontmatter), title, path
+    - fulltext = body (für Links im Gesamtdokument)
+    """
+    note_id = fm.get("id") or fm.get("note_id")
+    title = fm.get("title") or os.path.basename(path).rsplit(".", 1)[0]
+    return {
+        "note_id": note_id,
+        "title": title,
+        "path": path.replace("\\", "/"),
+        "fulltext": body,
+    }
+
+def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
+    """
+    Liefert eine Liste von (note_stub, chunks_for_link_scan).
+    Für den Backfill reicht 1 Chunk (= gesamter Body), damit wir [[...]] finden.
+    """
     files = [p for p in glob.glob(os.path.join(vault, "**/*.md"), recursive=True)]
-    notes = []
+    out: List[Tuple[dict, List[dict]]] = []
     for path in files:
         if any(ex in path for ex in excludes):
             continue
         try:
             fm, body = read_markdown(path)
-            payload = build_note_payload(path, fm, body)
-            validate_note_payload(payload)  # wirft Exception wenn invalid
-            # Dummy-Chunks lesen: Falls du schon Chunks erzeugst, nutze die aus dem Importer;
-            # hier nehmen wir einen einfachen 1-Chunk-Fallback (ganzer Body), nur um Links zu finden:
-            chunk = {"chunk_id": f"{payload['note_id']}#1", "note_id": payload["note_id"], "text": body}
-            notes.append((payload, [chunk]))
+            stub = make_note_stub(path=os.path.relpath(path, vault), fm=fm, body=body)
+            if not stub.get("note_id"):
+                # ohne stabile ID können wir keine Edges sinnvoll referenzieren
+                continue
+            chunk = {
+                "chunk_id": f"{stub['note_id']}#1",
+                "note_id": stub["note_id"],
+                "text": body,
+            }
+            out.append((stub, [chunk]))
         except Exception as e:
             print(f"skip {path}: {e}")
-    return notes
+    return out
 
 def main():
     ap = argparse.ArgumentParser()
@@ -35,18 +60,20 @@ def main():
     client = get_client(cfg)
     ensure_collections(client, cfg.prefix, cfg.dim)
 
-    # 1) alle Notes parsen
+    # 1) Notizen sammeln (stubs) + 1-Chunk pro Note für den Scan
     note_tuples = iter_notes(args.vault, args.exclude)
     note_payloads = [n for n, _ in note_tuples]
+
+    # 2) Index für Zielauflösung
     idx = build_note_index(note_payloads)
 
-    # 2) Edges ableiten
+    # 3) Edges ableiten
     all_edges = []
-    for note_payload, chunks in note_tuples:
-        edges = derive_wikilink_edges(note_payload, chunks, idx)
+    for note_stub, chunks in note_tuples:
+        edges = derive_wikilink_edges(note_stub, chunks, idx)
         all_edges.extend(edges)
 
-    # 3) Upsert
+    # 4) Upsert
     edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
     upsert_batch(client, edges_col, edge_pts)
 