From a9b4643352aea013d69662b3473a47b545a44619 Mon Sep 17 00:00:00 2001 From: Lars Date: Wed, 3 Sep 2025 07:16:09 +0200 Subject: [PATCH] =?UTF-8?q?scripts/preview=5Fchunks.py=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/preview_chunks.py | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 scripts/preview_chunks.py diff --git a/scripts/preview_chunks.py b/scripts/preview_chunks.py new file mode 100644 index 0000000..9046d2a --- /dev/null +++ b/scripts/preview_chunks.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +from __future__ import annotations +import argparse, os, glob, json +from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter +from app.core.chunker import assemble_chunks +from app.core.chunk_payload import make_chunk_payloads +from app.core.note_payload import make_note_payload + +def iter_md(root: str) -> list[str]: + return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True) + ap.add_argument("--note-id", help="Optional: nur eine Note (Frontmatter id) verarbeiten") + args = ap.parse_args() + + vault = os.path.abspath(args.vault) + files = iter_md(vault) + for path in files: + parsed = read_markdown(path) + fm = normalize_frontmatter(parsed.frontmatter) + try: + validate_required_frontmatter(fm) + except Exception: + continue + if args.note_id and fm.get("id") != args.note_id: + continue + + # Note payload (für Metadaten) + note_pl = make_note_payload(parsed, vault_root=vault) + # Chunks bauen + chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) + + print(json.dumps({ + "note_id": fm["id"], + "title": fm["title"], + "num_chunks": len(chunk_pls), + "avg_tokens": round(sum(c["token_count"] for c in chunk_pls)/max(1,len(chunk_pls)), 1), + "chunks": [{"id": c["id"], "tokens": c["token_count"], "section": c.get("section_title"), "prev": c.get("neighbors",{}).get("prev"), "next": c.get("neighbors",{}).get("next")} for c in chunk_pls] + }, ensure_ascii=False)) +if __name__ == "__main__": + main()