mindnet/scripts/backfill_edges.py
Lars 41df416dfb
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
scripts/backfill_edges.py aktualisiert
2025-09-04 10:03:46 +02:00

123 lines
4.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse, glob, json, os
from typing import List, Tuple
from app.core.parser import read_markdown # gibt je nach Implementierung ein Objekt ODER ein (fm, body)-Tuple
from app.core.qdrant import QdrantConfig, get_client, ensure_collections
from app.core.qdrant_points import points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, derive_wikilink_edges
def _coerce_parsed(note_or_tuple):
    """
    Normalize the result of ``read_markdown`` to ``(frontmatter, body, path)``.

    Two result shapes are supported:
    - an object exposing ``.frontmatter`` and ``.body`` (optionally also
      ``.path`` or ``.relpath``);
    - a tuple/list whose first two items are ``(frontmatter, body)``.

    Args:
        note_or_tuple: The value returned by ``read_markdown``.

    Returns:
        A ``(frontmatter, body, path)`` triple. Falsy frontmatter/body values
        are replaced by ``{}`` and ``""``. ``path`` is a ``str`` when the
        object carries a non-empty ``.path`` or ``.relpath`` (``os.PathLike``
        and bytes values are converted), otherwise ``None`` so the caller can
        fill it in.

    Raises:
        TypeError: If the value has neither supported shape, or if its path
            attribute is not a str, bytes or ``os.PathLike`` object.
    """
    # Object with attributes?
    if hasattr(note_or_tuple, "frontmatter") and hasattr(note_or_tuple, "body"):
        fm = note_or_tuple.frontmatter or {}
        body = note_or_tuple.body or ""
        # Prefer .path, but fall through to .relpath when .path is missing
        # or empty instead of letting an empty .path hide a usable .relpath.
        raw_path = (
            getattr(note_or_tuple, "path", None)
            or getattr(note_or_tuple, "relpath", None)
        )
        # The documented contract is a str path; convert PathLike/bytes here.
        path = os.fsdecode(raw_path) if raw_path else None
        return fm, body, path

    # Tuple variant: the path is unknown here and is set by the caller.
    if isinstance(note_or_tuple, (tuple, list)) and len(note_or_tuple) >= 2:
        return note_or_tuple[0] or {}, note_or_tuple[1] or "", None

    raise TypeError("Unsupported return type from read_markdown")
def make_note_stub(path: str, fm: dict, body: str) -> dict:
    """
    Build the minimal note payload needed for link resolution.

    Args:
        path: Path of the note; backslashes are converted to forward slashes.
        fm: Frontmatter mapping. The ID is read from ``id`` or, failing that,
            from ``note_id``.
        body: Note body, stored unchanged under ``fulltext``.

    Returns:
        A dict with the keys ``note_id`` (``None`` when the frontmatter has no
        ID), ``title`` (frontmatter title, or the file name without its last
        extension), ``path`` (separator-normalized) and ``fulltext``.
    """
    # Normalize separators first so the title fallback only sees the file
    # name, even when a backslash-separated path is handled on POSIX.
    normalized = path.replace("\\", "/")
    note_id = fm.get("id") or fm.get("note_id")
    title = fm.get("title") or os.path.basename(normalized).rsplit(".", 1)[0]
    return {
        "note_id": note_id,
        "title": title,
        "path": normalized,
        "fulltext": body,
    }
def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
    """
    Collect ``(note_stub, chunks)`` pairs for the Markdown files below ``vault``.

    For the backfill a single chunk holding the whole body is enough to find
    ``[[...]]`` links.

    Args:
        vault: Root directory that is searched recursively for ``*.md`` files.
        excludes: Substrings; a file is skipped when any of them occurs in its
            path (compared against the path as returned by ``glob`` and
            against a forward-slash version of it).

    Returns:
        One ``(stub, [chunk])`` tuple per usable note, in sorted path order.
        Files without a note ID in the frontmatter, and files whose processing
        raises, are reported on stdout and left out.
    """
    pattern = os.path.join(vault, "**/*.md")
    out: List[Tuple[dict, List[dict]]] = []
    # glob returns files in arbitrary order; sort for reproducible runs.
    for abs_path in sorted(glob.glob(pattern, recursive=True)):
        # Exclude patterns are usually written with "/", so also test a
        # separator-normalized copy of the path (relevant on Windows).
        posix_path = abs_path.replace("\\", "/")
        if any(ex in abs_path or ex in posix_path for ex in excludes):
            continue
        try:
            parsed = read_markdown(abs_path)
            fm, body, p = _coerce_parsed(parsed)
            # Use the parser-supplied path only when it is relative; a missing
            # or absolute path is replaced by the path relative to the vault.
            if p and not os.path.isabs(p):
                rel = p
            else:
                rel = os.path.relpath(abs_path, vault)
            stub = make_note_stub(path=rel, fm=fm, body=body)
            if not stub.get("note_id"):
                # Without a stable ID, edges cannot be referenced cleanly.
                print(f"skip {rel}: missing note_id in frontmatter")
                continue
            chunk = {
                "chunk_id": f"{stub['note_id']}#1",
                "note_id": stub["note_id"],
                "text": body,
            }
            out.append((stub, [chunk]))
        except Exception as e:
            # Deliberate best effort: one unreadable note must not abort the scan.
            print(f"skip {abs_path}: {e}")
    return out
def main():
    """
    Command-line entry point: derive wikilink edges for a vault and upsert them.

    Reads ``--vault`` (required) and ``--exclude`` (zero or more path
    substrings), builds the Qdrant client from the environment configuration,
    scans the notes, derives edges per note against a shared note index,
    upserts the resulting points and prints a JSON summary to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True)
    parser.add_argument(
        "--exclude",
        nargs="*",
        default=["/.obsidian/", "/_backup_frontmatter/"],
    )
    opts = parser.parse_args()

    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)

    # 1) Collect note stubs plus one whole-body chunk per note for the scan.
    scanned = iter_notes(opts.vault, opts.exclude)

    # 2) Index over all stubs, used to resolve link targets.
    idx = build_note_index([stub for stub, _chunks in scanned])

    # 3) Derive the edges note by note and flatten them into one list.
    all_edges = [
        edge
        for stub, chunks in scanned
        for edge in derive_wikilink_edges(stub, chunks, idx)
    ]

    # 4) Upsert the edge points and report what was done.
    collection, points = points_for_edges(cfg.prefix, all_edges)
    upsert_batch(client, collection, points)
    summary = {"notes_scanned": len(scanned), "edges_upserted": len(points)}
    print(json.dumps(summary, ensure_ascii=False))
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()