From 41df416dfb7e19d1e962a09d8d89310d16a2bcdd Mon Sep 17 00:00:00 2001
From: Lars
Date: Thu, 4 Sep 2025 10:03:46 +0200
Subject: [PATCH] scripts/backfill_edges.py aktualisiert

---
 scripts/backfill_edges.py | 67 +++++++++++++++++++++++++++++++--------
 1 file changed, 53 insertions(+), 14 deletions(-)

diff --git a/scripts/backfill_edges.py b/scripts/backfill_edges.py
index 2482b25..a5cd16a 100644
--- a/scripts/backfill_edges.py
+++ b/scripts/backfill_edges.py
@@ -1,19 +1,48 @@
-# scripts/backfill_edges.py
 #!/usr/bin/env python3
 from __future__ import annotations
 import argparse, glob, json, os
 from typing import List, Tuple
 
-from app.core.parser import read_markdown  # liefert (frontmatter_dict, body_str)
+from app.core.parser import read_markdown  # depending on the implementation, returns an object OR an (fm, body) tuple
 from app.core.qdrant import QdrantConfig, get_client, ensure_collections
 from app.core.qdrant_points import points_for_edges, upsert_batch
 from app.core.derive_edges import build_note_index, derive_wikilink_edges
 
+
+def _coerce_parsed(note_or_tuple):
+    """
+    Normalize either return shape of read_markdown:
+    - ParsedNote-like object: has .frontmatter, .body and optionally .path or .relpath
+    - Tuple or list: (frontmatter_dict, body_str)
+    Returns (frontmatter, body, path); path stays None if the input carries none. Raises TypeError for other inputs.
+    """
+    fm, body, path = None, None, None
+
+    # Object with attributes?
+    if hasattr(note_or_tuple, "frontmatter") and hasattr(note_or_tuple, "body"):
+        fm = getattr(note_or_tuple, "frontmatter") or {}
+        body = getattr(note_or_tuple, "body") or ""
+        # some implementations expose .path (full) or .relpath; .path wins when both exist
+        if hasattr(note_or_tuple, "path"):
+            path = getattr(note_or_tuple, "path")
+        elif hasattr(note_or_tuple, "relpath"):
+            path = getattr(note_or_tuple, "relpath")
+
+    # Tuple or list?
+    elif isinstance(note_or_tuple, (tuple, list)) and len(note_or_tuple) >= 2:
+        fm = note_or_tuple[0] or {}
+        body = note_or_tuple[1] or ""
+        # the path is unknown in this variant; the caller sets it
+    else:
+        raise TypeError("Unsupported return type from read_markdown")
+
+    return fm, body, path
+
+
 def make_note_stub(path: str, fm: dict, body: str) -> dict:
     """
-    Minimaler Note-Payload nur für die Link-Auflösung und Kanten:
-    - note_id (aus Frontmatter), title, path
-    - fulltext = body (für Links im Gesamtdokument)
+    Minimal note payload for link resolution:
+    - note_id (from frontmatter), title, path (relative), fulltext=body
     """
     note_id = fm.get("id") or fm.get("note_id")
     title = fm.get("title") or os.path.basename(path).rsplit(".", 1)[0]
@@ -24,22 +53,30 @@ def make_note_stub(path: str, fm: dict, body: str) -> dict:
         "fulltext": body,
     }
 
+
 def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
     """
-    Liefert eine Liste von (note_stub, chunks_for_link_scan).
-    Für den Backfill reicht 1 Chunk (= gesamter Body), damit wir [[...]] finden.
+    Return a list of (note_stub, chunks_for_link_scan) pairs.
+    For the backfill, one chunk (= the entire body) is enough to find [[...]].
     """
     files = [p for p in glob.glob(os.path.join(vault, "**/*.md"), recursive=True)]
     out: List[Tuple[dict, List[dict]]] = []
-    for path in files:
-        if any(ex in path for ex in excludes):
+    for abs_path in files:
+        if any(ex in abs_path for ex in excludes):
             continue
         try:
-            fm, body = read_markdown(path)
-            stub = make_note_stub(path=os.path.relpath(path, vault), fm=fm, body=body)
+            parsed = read_markdown(abs_path)
+            fm, body, p = _coerce_parsed(parsed)
+
+            # a parser-supplied path is used unchanged; otherwise build one relative to the vault
+            rel = p if p else os.path.relpath(abs_path, vault)
+
+            stub = make_note_stub(path=rel, fm=fm, body=body)
             if not stub.get("note_id"):
-                # ohne stabile ID können wir keine Edges sinnvoll referenzieren
+                # without a stable ID we cannot reference edges cleanly
+                print(f"skip {rel}: missing note_id in frontmatter")
                 continue
+
             chunk = {
                 "chunk_id": f"{stub['note_id']}#1",
                 "note_id": stub["note_id"],
@@ -47,9 +84,10 @@ def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]
             }
             out.append((stub, [chunk]))
         except Exception as e:
-            print(f"skip {path}: {e}")
+            print(f"skip {abs_path}: {e}")
     return out
 
+
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--vault", required=True)
@@ -77,7 +115,8 @@ def main():
 
     edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
     upsert_batch(client, edges_col, edge_pts)
-    print(json.dumps({"edges_upserted": len(edge_pts)}, ensure_ascii=False))
+    print(json.dumps({"notes_scanned": len(note_tuples), "edges_upserted": len(edge_pts)}, ensure_ascii=False))
+
 
 if __name__ == "__main__":
     main()