scripts/backfill_edges.py hinzugefügt
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
7f53afa797
commit
0be419d165
56
scripts/backfill_edges.py
Normal file
56
scripts/backfill_edges.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
import argparse, json, glob, os, sys
|
||||
from app.core.parser import read_markdown, build_note_payload
|
||||
from app.core.validate_note import validate_note_payload
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections
|
||||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||||
|
||||
def iter_notes(vault: str, excludes: list[str]) -> list[tuple[dict, list[dict]]]:
    """Parse every Markdown note below *vault* into a payload plus one fallback chunk.

    Args:
        vault: Root directory that is searched recursively for ``*.md`` files.
        excludes: Substrings; a file is skipped when its path contains any of
            them. Each substring is checked against the path as returned by
            ``glob`` and against the same path with ``/`` as separator, so the
            ``/``-style patterns also match where ``os.sep`` is not ``/``.

    Returns:
        One ``(payload, [chunk])`` tuple per note that was read, built and
        validated without an exception, in sorted path order.

    A note that raises while being read, built or validated is reported on
    stderr and skipped, so a single bad file does not abort the whole run and
    the diagnostics do not mix with anything the caller writes to stdout.
    """
    pattern = os.path.join(vault, "**/*.md")
    notes: list[tuple[dict, list[dict]]] = []
    # Sorted so the processing order does not depend on the filesystem.
    for path in sorted(glob.glob(pattern, recursive=True)):
        normalized = path.replace(os.sep, "/")
        if any(ex in path or ex in normalized for ex in excludes):
            continue
        try:
            fm, body = read_markdown(path)
            payload = build_note_payload(path, fm, body)
            validate_note_payload(payload)  # raises if the payload is invalid
            # Placeholder chunk: if real chunks are already produced by the
            # importer, use those instead. Here the whole body becomes a single
            # chunk, which is enough to discover the links.
            note_id = payload["note_id"]
            chunk = {"chunk_id": f"{note_id}#1", "note_id": note_id, "text": body}
        except Exception as e:
            # Deliberate best-effort: report the file and keep going.
            print(f"skip {path}: {e}", file=sys.stderr)
            continue
        notes.append((payload, [chunk]))
    return notes
|
||||
|
||||
def main():
    """Backfill wikilink edges for all notes of a vault.

    Command-line options:
        --vault    (required) root directory handed to ``iter_notes``.
        --exclude  zero or more path substrings to skip; defaults to
                   ``/.obsidian/`` and ``/_backup_frontmatter/``.

    Builds the Qdrant configuration from the environment, ensures the
    collections exist, derives the wikilink edges of every parsed note against
    an index built from all note payloads, upserts the resulting points and
    prints ``{"edges_upserted": <number of points>}`` as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True)
    parser.add_argument(
        "--exclude",
        nargs="*",
        default=["/.obsidian/", "/_backup_frontmatter/"],
    )
    opts = parser.parse_args()

    config = QdrantConfig.from_env()
    qdrant = get_client(config)
    ensure_collections(qdrant, config.prefix, config.dim)

    # 1) Parse all notes and index their payloads.
    parsed = iter_notes(opts.vault, opts.exclude)
    index = build_note_index([payload for payload, _ in parsed])

    # 2) Derive the edges of every note.
    edges = [
        edge
        for payload, chunks in parsed
        for edge in derive_wikilink_edges(payload, chunks, index)
    ]

    # 3) Upsert.
    collection, points = points_for_edges(config.prefix, edges)
    upsert_batch(qdrant, collection, points)

    print(json.dumps({"edges_upserted": len(points)}, ensure_ascii=False))
|
||||
|
||||
# Run the backfill only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user