Dateien nach "scripts" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
4ea62e6886
commit
725271a7da
107
scripts/edges_dryrun.py
Normal file
107
scripts/edges_dryrun.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# scripts/edges_dryrun.py
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Dry-Run: Erzeuge Edges aus einem Vault **ohne** Qdrant-Upsert.
|
||||||
|
- Liest Markdown mit YAML-Frontmatter
|
||||||
|
- Chunking: einfacher Absatz-Chunker (Index + text)
|
||||||
|
- Kanten: nutzt app.core.edges.build_edges_for_note()
|
||||||
|
- Ausgabe: JSON pro Note mit Edge-Counts und 3 Beispiel-Payloads
|
||||||
|
|
||||||
|
Aufruf:
|
||||||
|
python3 -m scripts.edges_dryrun --vault ./vault
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
--include-note-scope-refs # auch Frontmatter-Links (links[].target_id) als Note-Scope-Referenzen
|
||||||
|
|
||||||
|
Voraussetzungen:
|
||||||
|
- app/core/parser.py (read_markdown, normalize_frontmatter)
|
||||||
|
- app/core/edges.py (dieses Modul forwardet zur v2-Implementierung)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from app.core.parser import read_markdown, normalize_frontmatter
|
||||||
|
from app.core.edges import build_edges_for_note
|
||||||
|
|
||||||
|
def _iter_markdown(vault: str):
|
||||||
|
for p in Path(vault).rglob("*.md"):
|
||||||
|
if p.name.startswith("."): # ignore hidden
|
||||||
|
continue
|
||||||
|
yield p
|
||||||
|
|
||||||
|
def _simple_chunker(body: str, note_id: str, note_type: str) -> List[Dict]:
|
||||||
|
# Absatzbasiert, minimal ausreichend für Edges (window/text, chunk_id, ord, note_id, type)
|
||||||
|
paras = [s.strip() for s in re.split(r"\n{2,}", body or "") if s.strip()]
|
||||||
|
chunks = []
|
||||||
|
for i, t in enumerate(paras):
|
||||||
|
chunks.append({
|
||||||
|
"chunk_id": f"{note_id}#c{i:04d}",
|
||||||
|
"ord": i,
|
||||||
|
"text": t,
|
||||||
|
"note_id": note_id,
|
||||||
|
"type": note_type,
|
||||||
|
})
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def _fm_note_refs(fm: Dict) -> List[str]:
|
||||||
|
out = []
|
||||||
|
links = fm.get("links") or []
|
||||||
|
if isinstance(links, list):
|
||||||
|
for e in links:
|
||||||
|
if isinstance(e, dict):
|
||||||
|
tid = e.get("target_id")
|
||||||
|
if isinstance(tid, str) and tid.strip():
|
||||||
|
out.append(tid.strip())
|
||||||
|
return out
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: walk a vault, build edges per note, print a JSON report.

    For every Markdown note, the note is parsed, chunked into paragraphs,
    and passed to ``build_edges_for_note``; the resulting edges are tallied
    by relation kind.  No Qdrant upsert happens — this is a dry run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True)
    parser.add_argument("--include-note-scope-refs", action="store_true")
    args = parser.parse_args()

    notes_report = []
    for md_path in _iter_markdown(args.vault):
        parsed = read_markdown(str(md_path))
        if not parsed:
            continue

        frontmatter = normalize_frontmatter(parsed.frontmatter or {})
        # Fall back to the file stem when the frontmatter carries no id.
        note_id = frontmatter.get("id") or md_path.stem
        note_type = (frontmatter.get("type") or "concept").lower()

        chunks = _simple_chunker(parsed.body, note_id, note_type)
        note_refs = _fm_note_refs(frontmatter)

        edges = build_edges_for_note(
            note_id=note_id,
            chunk_payloads=chunks,
            note_level_refs=note_refs,
            include_note_scope_refs=args.include_note_scope_refs,
        )

        # Tally edges by their relation (falling back to kind, then "edge").
        edges_by_kind: Dict[str, int] = {}
        for edge in edges:
            label = edge.get("relation") or edge.get("kind") or "edge"
            edges_by_kind[label] = edges_by_kind.get(label, 0) + 1

        notes_report.append({
            "path": str(md_path),
            "note_id": note_id,
            "type": note_type,
            "chunks": len(chunks),
            "edges_total": len(edges),
            "edges_by_kind": edges_by_kind,
            "samples": edges[:3],
        })

    print(json.dumps(notes_report, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user