scripts/backfill_edges.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
d4a2d9c0e1
commit
41df416dfb
|
|
@ -1,19 +1,48 @@
|
|||
# scripts/backfill_edges.py
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
import argparse, glob, json, os
|
||||
from typing import List, Tuple
|
||||
|
||||
from app.core.parser import read_markdown # liefert (frontmatter_dict, body_str)
|
||||
from app.core.parser import read_markdown # gibt je nach Implementierung ein Objekt ODER ein (fm, body)-Tuple
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections
|
||||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||||
|
||||
|
||||
def _coerce_parsed(note_or_tuple):
|
||||
"""
|
||||
Unterstützt beide Varianten von read_markdown:
|
||||
- ParsedNote-ähnlich: hat .frontmatter, .body, .path?
|
||||
- Tuple: (frontmatter_dict, body_str)
|
||||
Gibt (frontmatter: dict, body: str, path: str) zurück.
|
||||
"""
|
||||
fm, body, path = None, None, None
|
||||
|
||||
# Objekt mit Attributen?
|
||||
if hasattr(note_or_tuple, "frontmatter") and hasattr(note_or_tuple, "body"):
|
||||
fm = getattr(note_or_tuple, "frontmatter") or {}
|
||||
body = getattr(note_or_tuple, "body") or ""
|
||||
# manche Implementationen haben .path (voll) oder .relpath
|
||||
if hasattr(note_or_tuple, "path"):
|
||||
path = getattr(note_or_tuple, "path")
|
||||
elif hasattr(note_or_tuple, "relpath"):
|
||||
path = getattr(note_or_tuple, "relpath")
|
||||
|
||||
# Tuple?
|
||||
elif isinstance(note_or_tuple, (tuple, list)) and len(note_or_tuple) >= 2:
|
||||
fm = note_or_tuple[0] or {}
|
||||
body = note_or_tuple[1] or ""
|
||||
# Pfad ist in dieser Variante unbekannt, wird extern gesetzt
|
||||
else:
|
||||
raise TypeError("Unsupported return type from read_markdown")
|
||||
|
||||
return fm, body, path
|
||||
|
||||
|
||||
def make_note_stub(path: str, fm: dict, body: str) -> dict:
|
||||
"""
|
||||
Minimaler Note-Payload nur für die Link-Auflösung und Kanten:
|
||||
- note_id (aus Frontmatter), title, path
|
||||
- fulltext = body (für Links im Gesamtdokument)
|
||||
Minimaler Note-Payload für Linkauflösung:
|
||||
- note_id (aus Frontmatter), title, path (relativ), fulltext=body
|
||||
"""
|
||||
note_id = fm.get("id") or fm.get("note_id")
|
||||
title = fm.get("title") or os.path.basename(path).rsplit(".", 1)[0]
|
||||
|
|
@ -24,22 +53,30 @@ def make_note_stub(path: str, fm: dict, body: str) -> dict:
|
|||
"fulltext": body,
|
||||
}
|
||||
|
||||
|
||||
def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
|
||||
"""
|
||||
Liefert eine Liste von (note_stub, chunks_for_link_scan).
|
||||
Für den Backfill reicht 1 Chunk (= gesamter Body), damit wir [[...]] finden.
|
||||
Liefert Liste von (note_stub, chunks_for_link_scan).
|
||||
Für Backfill reicht 1 Chunk (= gesamter Body), um [[...]] zu finden.
|
||||
"""
|
||||
files = [p for p in glob.glob(os.path.join(vault, "**/*.md"), recursive=True)]
|
||||
out: List[Tuple[dict, List[dict]]] = []
|
||||
for path in files:
|
||||
if any(ex in path for ex in excludes):
|
||||
for abs_path in files:
|
||||
if any(ex in abs_path for ex in excludes):
|
||||
continue
|
||||
try:
|
||||
fm, body = read_markdown(path)
|
||||
stub = make_note_stub(path=os.path.relpath(path, vault), fm=fm, body=body)
|
||||
parsed = read_markdown(abs_path)
|
||||
fm, body, p = _coerce_parsed(parsed)
|
||||
|
||||
# falls read_markdown den Pfad nicht liefert -> relativ zum Vault bauen
|
||||
rel = p if p else os.path.relpath(abs_path, vault)
|
||||
|
||||
stub = make_note_stub(path=rel, fm=fm, body=body)
|
||||
if not stub.get("note_id"):
|
||||
# ohne stabile ID können wir keine Edges sinnvoll referenzieren
|
||||
# ohne stabile ID können wir keine Edges sauber referenzieren
|
||||
print(f"skip {rel}: missing note_id in frontmatter")
|
||||
continue
|
||||
|
||||
chunk = {
|
||||
"chunk_id": f"{stub['note_id']}#1",
|
||||
"note_id": stub["note_id"],
|
||||
|
|
@ -47,9 +84,10 @@ def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]
|
|||
}
|
||||
out.append((stub, [chunk]))
|
||||
except Exception as e:
|
||||
print(f"skip {path}: {e}")
|
||||
print(f"skip {abs_path}: {e}")
|
||||
return out
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--vault", required=True)
|
||||
|
|
@ -77,7 +115,8 @@ def main():
|
|||
edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
|
||||
upsert_batch(client, edges_col, edge_pts)
|
||||
|
||||
print(json.dumps({"edges_upserted": len(edge_pts)}, ensure_ascii=False))
|
||||
print(json.dumps({"notes_scanned": len(note_tuples), "edges_upserted": len(edge_pts)}, ensure_ascii=False))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user