scripts/backfill_edges.py aktualisiert

2025-09-04 09:43:57 +02:00 · 2025-09-04 09:43:57 +02:00 · a9d4c2ec0e
commit a9d4c2ec0e
parent 0be419d165
1 changed file with 44 additions and 17 deletions
--- a/scripts/backfill_edges.py
+++ b/scripts/backfill_edges.py
@@ -1,29 +1,54 @@
 # scripts/backfill_edges.py
 #!/usr/bin/env python3
 from __future__ import annotations
-import argparse, json, glob, os, sys
+import argparse, glob, json, os
-from app.core.parser import read_markdown, build_note_payload
+from typing import List, Tuple
-from app.core.validate_note import validate_note_payload
+
 from app.core.parser import read_markdown  # liefert (frontmatter_dict, body_str)
 from app.core.qdrant import QdrantConfig, get_client, ensure_collections
 from app.core.qdrant_points import points_for_edges, upsert_batch
 from app.core.derive_edges import build_note_index, derive_wikilink_edges
-def iter_notes(vault: str, excludes: list[str]) -> list[tuple[dict, list[dict]]]:
+def make_note_stub(path: str, fm: dict, body: str) -> dict:
    """
    Minimaler Note-Payload nur für die Link-Auflösung und Kanten:
    - note_id (aus Frontmatter), title, path
    - fulltext = body (für Links im Gesamtdokument)
    """
    note_id = fm.get("id") or fm.get("note_id")
    title = fm.get("title") or os.path.basename(path).rsplit(".", 1)[0]
    return {
        "note_id": note_id,
        "title": title,
        "path": path.replace("\\", "/"),
        "fulltext": body,
    }
 def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
    """
    Liefert eine Liste von (note_stub, chunks_for_link_scan).
    Für den Backfill reicht 1 Chunk (= gesamter Body), damit wir [[...]] finden.
    """
    files = [p for p in glob.glob(os.path.join(vault, "**/*.md"), recursive=True)]
-    notes = []
+    out: List[Tuple[dict, List[dict]]] = []
    for path in files:
        if any(ex in path for ex in excludes):
            continue
        try:
            fm, body = read_markdown(path)
-            payload = build_note_payload(path, fm, body)
+            stub = make_note_stub(path=os.path.relpath(path, vault), fm=fm, body=body)
-            validate_note_payload(payload)  # wirft Exception wenn invalid
+            if not stub.get("note_id"):
-            # Dummy-Chunks lesen: Falls du schon Chunks erzeugst, nutze die aus dem Importer;
+                # ohne stabile ID können wir keine Edges sinnvoll referenzieren
-            # hier nehmen wir einen einfachen 1-Chunk-Fallback (ganzer Body), nur um Links zu finden:
+                continue
-            chunk = {"chunk_id": f"{payload['note_id']}#1", "note_id": payload["note_id"], "text": body}
+            chunk = {
-            notes.append((payload, [chunk]))
+                "chunk_id": f"{stub['note_id']}#1",
                "note_id": stub["note_id"],
                "text": body,
            }
            out.append((stub, [chunk]))
        except Exception as e:
            print(f"skip {path}: {e}")
-    return notes
+    return out
 def main():
    ap = argparse.ArgumentParser()
@@ -35,18 +60,20 @@ def main():
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
-    # 1) alle Notes parsen
+    # 1) Notizen sammeln (stubs) + 1-Chunk pro Note für den Scan
    note_tuples = iter_notes(args.vault, args.exclude)
    note_payloads = [n for n, _ in note_tuples]
    # 2) Index für Zielauflösung
    idx = build_note_index(note_payloads)
-    # 2) Edges ableiten
+    # 3) Edges ableiten
    all_edges = []
-    for note_payload, chunks in note_tuples:
+    for note_stub, chunks in note_tuples:
-        edges = derive_wikilink_edges(note_payload, chunks, idx)
+        edges = derive_wikilink_edges(note_stub, chunks, idx)
        all_edges.extend(edges)
-    # 3) Upsert
+    # 4) Upsert
    edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
    upsert_batch(client, edges_col, edge_pts)