From 0be419d165f4dc6508803178375536ea9b4fb5b9 Mon Sep 17 00:00:00 2001 From: Lars Date: Thu, 4 Sep 2025 09:27:52 +0200 Subject: [PATCH] =?UTF-8?q?scripts/backfill=5Fedges.py=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/backfill_edges.py | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/backfill_edges.py diff --git a/scripts/backfill_edges.py b/scripts/backfill_edges.py new file mode 100644 index 0000000..ca181da --- /dev/null +++ b/scripts/backfill_edges.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +from __future__ import annotations +import argparse, json, glob, os, sys +from app.core.parser import read_markdown, build_note_payload +from app.core.validate_note import validate_note_payload +from app.core.qdrant import QdrantConfig, get_client, ensure_collections +from app.core.qdrant_points import points_for_edges, upsert_batch +from app.core.derive_edges import build_note_index, derive_wikilink_edges + +def iter_notes(vault: str, excludes: list[str]) -> list[tuple[dict, list[dict]]]: + files = [p for p in glob.glob(os.path.join(vault, "**/*.md"), recursive=True)] + notes = [] + for path in files: + if any(ex in path for ex in excludes): + continue + try: + fm, body = read_markdown(path) + payload = build_note_payload(path, fm, body) + validate_note_payload(payload) # wirft Exception wenn invalid + # Dummy-Chunks lesen: Falls du schon Chunks erzeugst, nutze die aus dem Importer; + # hier nehmen wir einen einfachen 1-Chunk-Fallback (ganzer Body), nur um Links zu finden: + chunk = {"chunk_id": f"{payload['note_id']}#1", "note_id": payload["note_id"], "text": body} + notes.append((payload, [chunk])) + except Exception as e: + print(f"skip {path}: {e}") + return notes + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True) + ap.add_argument("--exclude", nargs="*", default=["/.obsidian/", "/_backup_frontmatter/"]) + args = ap.parse_args() + + cfg = QdrantConfig.from_env() + client = get_client(cfg) + ensure_collections(client, cfg.prefix, cfg.dim) + + # 1) alle Notes parsen + note_tuples = iter_notes(args.vault, args.exclude) + note_payloads = [n for n, _ in note_tuples] + idx = build_note_index(note_payloads) + + # 2) Edges ableiten + all_edges = [] + for note_payload, chunks in note_tuples: + edges = derive_wikilink_edges(note_payload, chunks, idx) + all_edges.extend(edges) + + # 3) Upsert + edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges) + upsert_batch(client, edges_col, edge_pts) + + print(json.dumps({"edges_upserted": len(edge_pts)}, ensure_ascii=False)) + +if __name__ == "__main__": + main()