#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# scripts/edges_dryrun.py
"""
Dry run: build edges from a vault **without** upserting anything into Qdrant.

- Reads Markdown files with YAML frontmatter
- Chunking: simple paragraph chunker (index + text)
- Edges: delegates to app.core.edges.build_edges_for_note()
- Output: one JSON entry per note with edge counts and up to 3 sample payloads

Usage:
    python3 -m scripts.edges_dryrun --vault ./vault

Optional:
    --include-note-scope-refs   # also pass frontmatter links (links[].target_id)
                                # on as note-scope references

Requirements:
    - app/core/parser.py (read_markdown, normalize_frontmatter)
    - app/core/edges.py  (that module forwards to the v2 implementation)
"""
from __future__ import annotations

import argparse
import json
import os
import re
from collections import Counter
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional

from app.core.parser import read_markdown, normalize_frontmatter
from app.core.edges import build_edges_for_note

# A paragraph break is a blank line. Lines that hold only whitespace (and the
# "\r" left over from CRLF line endings) count as blank, so "\n \n" and
# "\r\n\r\n" separate paragraphs just like "\n\n" does.
_PARAGRAPH_BREAK = re.compile(r"\n\s*\n")


def _iter_markdown(vault: str) -> Iterator[Path]:
    """Yield every visible ``*.md`` file below *vault* in sorted order.

    A file is skipped when its own name or any directory between the vault
    root and the file starts with a dot. Only components *below* the root are
    inspected, so a vault path such as ``..`` or ``./vault`` is not itself
    treated as hidden.
    """
    root = Path(vault)
    # rglob() makes no ordering promise; sorting keeps the report reproducible.
    for path in sorted(root.rglob("*.md")):
        if not path.is_file():  # e.g. a directory that happens to end in ".md"
            continue
        if any(part.startswith(".") for part in path.relative_to(root).parts):
            continue
        yield path


def _simple_chunker(body: str, note_id: str, note_type: str) -> List[Dict[str, Any]]:
    """Split *body* into paragraph chunks shaped like chunk payloads.

    Each non-empty paragraph becomes a dict with the keys ``chunk_id``
    (``"<note_id>#cNNNN"``), ``ord``, ``text``, ``note_id`` and ``type``.
    A ``None`` or empty body produces an empty list.
    """
    stripped = (piece.strip() for piece in _PARAGRAPH_BREAK.split(body or ""))
    return [
        {
            "chunk_id": f"{note_id}#c{index:04d}",
            "ord": index,
            "text": text,
            "note_id": note_id,
            "type": note_type,
        }
        for index, text in enumerate(piece for piece in stripped if piece)
    ]


def _fm_note_refs(fm: Dict[str, Any]) -> List[str]:
    """Collect the stripped ``target_id`` strings from ``fm["links"]``.

    Anything that does not match the expected shape (``links`` not a list,
    an entry that is not a dict, a missing/blank/non-string ``target_id``)
    is ignored rather than reported.
    """
    links = fm.get("links") or []
    if not isinstance(links, list):
        return []

    refs: List[str] = []
    for entry in links:
        if not isinstance(entry, dict):
            continue
        target = entry.get("target_id")
        if isinstance(target, str) and target.strip():
            refs.append(target.strip())
    return refs


def main() -> None:
    """Parse the CLI arguments, build edges per note and print a JSON report.

    The report is a JSON array written to stdout with one object per note:
    ``path``, ``note_id``, ``type``, ``chunks``, ``edges_total``,
    ``edges_by_kind`` and ``samples`` (the first three edges).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    ap.add_argument("--include-note-scope-refs", action="store_true")
    args = ap.parse_args()

    # A mistyped path would otherwise just print "[]" and look like an empty vault.
    if not Path(args.vault).is_dir():
        ap.error(f"--vault is not a directory: {args.vault}")

    report = []
    for path in _iter_markdown(args.vault):
        parsed = read_markdown(str(path))
        if not parsed:
            continue
        fm = normalize_frontmatter(parsed.frontmatter or {})
        # YAML may deliver non-string scalars (e.g. a purely numeric id);
        # coerce so the chunk ids and the edge builder see the same string.
        note_id = str(fm.get("id") or path.stem)
        note_type = str(fm.get("type") or "concept").lower()
        chunks = _simple_chunker(parsed.body, note_id, note_type)

        # Materialise the result so len() and slicing work for any iterable.
        edges = list(
            build_edges_for_note(
                note_id=note_id,
                chunk_payloads=chunks,
                note_level_refs=_fm_note_refs(fm),
                include_note_scope_refs=args.include_note_scope_refs,
            )
            or []
        )
        kinds = Counter(
            (edge.get("relation") or edge.get("kind") or "edge") for edge in edges
        )
        report.append({
            "path": str(path),
            "note_id": note_id,
            "type": note_type,
            "chunks": len(chunks),
            "edges_total": len(edges),
            "edges_by_kind": dict(kinds),
            "samples": edges[:3],
        })

    print(json.dumps(report, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()