scripts/backfill_edges.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
0be419d165
commit
a9d4c2ec0e
|
|
@ -1,29 +1,54 @@
|
||||||
|
# scripts/backfill_edges.py
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, json, glob, os, sys
|
import argparse, glob, json, os
|
||||||
from app.core.parser import read_markdown, build_note_payload
|
from typing import List, Tuple
|
||||||
from app.core.validate_note import validate_note_payload
|
|
||||||
|
from app.core.parser import read_markdown # liefert (frontmatter_dict, body_str)
|
||||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections
|
from app.core.qdrant import QdrantConfig, get_client, ensure_collections
|
||||||
from app.core.qdrant_points import points_for_edges, upsert_batch
|
from app.core.qdrant_points import points_for_edges, upsert_batch
|
||||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||||||
|
|
||||||
def iter_notes(vault: str, excludes: list[str]) -> list[tuple[dict, list[dict]]]:
|
def make_note_stub(path: str, fm: dict, body: str) -> dict:
    """
    Build a minimal note payload, used only for link resolution and edge
    derivation:

    - note_id (from the frontmatter), title, path
    - fulltext = body (so links anywhere in the document can be found)

    :param path: note path (vault-relative); backslashes are normalized to "/"
    :param fm: parsed frontmatter dict; "id" is preferred over "note_id"
    :param body: markdown body used verbatim as the note's fulltext
    :return: dict with keys note_id, title, path, fulltext (note_id may be
             None when the frontmatter carries no id)
    """
    note_id = fm.get("id") or fm.get("note_id")
    # splitext (instead of rsplit(".", 1)) keeps dotfiles like ".gitignore"
    # from producing an empty title; identical for ordinary "*.md" names.
    title = fm.get("title") or os.path.splitext(os.path.basename(path))[0]
    return {
        "note_id": note_id,
        "title": title,
        "path": path.replace("\\", "/"),
        "fulltext": body,
    }
|
||||||
|
|
||||||
|
def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
    """
    Collect ``(note_stub, chunks_for_link_scan)`` pairs for every markdown
    file under *vault*.

    For the backfill a single chunk (= the whole body) per note is enough,
    because we only need to locate ``[[...]]`` wikilinks.

    Skipped silently: files whose path contains any substring from *excludes*,
    and notes without a stable id. Files that fail to parse are logged via
    ``print`` and skipped (best-effort, never fatal).

    :param vault: root directory searched recursively for ``*.md`` files
    :param excludes: path substrings; a match anywhere in the path excludes it
    :return: list of (note_stub, [single_chunk]) tuples
    """
    out: List[Tuple[dict, List[dict]]] = []
    # glob.glob already returns a list — no comprehension wrapper needed.
    for path in glob.glob(os.path.join(vault, "**/*.md"), recursive=True):
        if any(ex in path for ex in excludes):
            continue
        try:
            fm, body = read_markdown(path)
            stub = make_note_stub(path=os.path.relpath(path, vault), fm=fm, body=body)
            if not stub.get("note_id"):
                # Without a stable id, edges could not be referenced meaningfully.
                continue
            # One chunk covering the entire body suffices for the link scan.
            chunk = {
                "chunk_id": f"{stub['note_id']}#1",
                "note_id": stub["note_id"],
                "text": body,
            }
            out.append((stub, [chunk]))
        except Exception as e:
            # Best-effort backfill: report and keep going on unreadable notes.
            print(f"skip {path}: {e}")
    return out
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
|
|
@ -35,18 +60,20 @@ def main():
|
||||||
client = get_client(cfg)
|
client = get_client(cfg)
|
||||||
ensure_collections(client, cfg.prefix, cfg.dim)
|
ensure_collections(client, cfg.prefix, cfg.dim)
|
||||||
|
|
||||||
# 1) alle Notes parsen
|
# 1) Notizen sammeln (stubs) + 1-Chunk pro Note für den Scan
|
||||||
note_tuples = iter_notes(args.vault, args.exclude)
|
note_tuples = iter_notes(args.vault, args.exclude)
|
||||||
note_payloads = [n for n, _ in note_tuples]
|
note_payloads = [n for n, _ in note_tuples]
|
||||||
|
|
||||||
|
# 2) Index für Zielauflösung
|
||||||
idx = build_note_index(note_payloads)
|
idx = build_note_index(note_payloads)
|
||||||
|
|
||||||
# 2) Edges ableiten
|
# 3) Edges ableiten
|
||||||
all_edges = []
|
all_edges = []
|
||||||
for note_payload, chunks in note_tuples:
|
for note_stub, chunks in note_tuples:
|
||||||
edges = derive_wikilink_edges(note_payload, chunks, idx)
|
edges = derive_wikilink_edges(note_stub, chunks, idx)
|
||||||
all_edges.extend(edges)
|
all_edges.extend(edges)
|
||||||
|
|
||||||
# 3) Upsert
|
# 4) Upsert
|
||||||
edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
|
edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
|
||||||
upsert_batch(client, edges_col, edge_pts)
|
upsert_batch(client, edges_col, edge_pts)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user