#!/usr/bin/env python3 # scripts/edges_dryrun.py # -*- coding: utf-8 -*- """ Dry-Run: Erzeuge Edges aus einem Vault **ohne** Qdrant-Upsert. - Liest Markdown mit YAML-Frontmatter - Chunking: einfacher Absatz-Chunker (Index + text) - Kanten: nutzt app.core.edges.build_edges_for_note() - Ausgabe: JSON pro Note mit Edge-Counts und 3 Beispiel-Payloads Aufruf: python3 -m scripts.edges_dryrun --vault ./vault Optional: --include-note-scope-refs # auch Frontmatter-Links (links[].target_id) als Note-Scope-Referenzen Voraussetzungen: - app/core/parser.py (read_markdown, normalize_frontmatter) - app/core/edges.py (dieses Modul forwardet zur v2-Implementierung) """ from __future__ import annotations import argparse import json import os import re from pathlib import Path from typing import Dict, List, Optional from app.core.parser import read_markdown, normalize_frontmatter from app.core.edges import build_edges_for_note def _iter_markdown(vault: str): for p in Path(vault).rglob("*.md"): if p.name.startswith("."): # ignore hidden continue yield p def _simple_chunker(body: str, note_id: str, note_type: str) -> List[Dict]: # Absatzbasiert, minimal ausreichend für Edges (window/text, chunk_id, ord, note_id, type) paras = [s.strip() for s in re.split(r"\n{2,}", body or "") if s.strip()] chunks = [] for i, t in enumerate(paras): chunks.append({ "chunk_id": f"{note_id}#c{i:04d}", "ord": i, "text": t, "note_id": note_id, "type": note_type, }) return chunks def _fm_note_refs(fm: Dict) -> List[str]: out = [] links = fm.get("links") or [] if isinstance(links, list): for e in links: if isinstance(e, dict): tid = e.get("target_id") if isinstance(tid, str) and tid.strip(): out.append(tid.strip()) return out def main(): ap = argparse.ArgumentParser() ap.add_argument("--vault", required=True) ap.add_argument("--include-note-scope-refs", action="store_true") args = ap.parse_args() vault = args.vault include_note_scope = args.include_note_scope_refs report = [] for p in _iter_markdown(vault): parsed = read_markdown(str(p)) if not parsed: continue fm = normalize_frontmatter(parsed.frontmatter or {}) note_id = fm.get("id") or p.stem note_type = (fm.get("type") or "concept").lower() chunks = _simple_chunker(parsed.body, note_id, note_type) note_refs = _fm_note_refs(fm) edges = build_edges_for_note( note_id=note_id, chunk_payloads=chunks, note_level_refs=note_refs, include_note_scope_refs=include_note_scope, ) kinds = {} for e in edges: key = (e.get("relation") or e.get("kind") or "edge") kinds[key] = kinds.get(key, 0) + 1 report.append({ "path": str(p), "note_id": note_id, "type": note_type, "chunks": len(chunks), "edges_total": len(edges), "edges_by_kind": kinds, "samples": edges[:3], }) print(json.dumps(report, ensure_ascii=False, indent=2)) if __name__ == "__main__": main()