From ecfc19ee680ec5ade43b8f595d4e33953db428a1 Mon Sep 17 00:00:00 2001 From: Lars Date: Wed, 3 Sep 2025 07:49:19 +0200 Subject: [PATCH] =?UTF-8?q?scripts/audit=5Fchunks.py=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/audit_chunks.py | 67 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 scripts/audit_chunks.py diff --git a/scripts/audit_chunks.py b/scripts/audit_chunks.py new file mode 100644 index 0000000..6311141 --- /dev/null +++ b/scripts/audit_chunks.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +from __future__ import annotations +import argparse, os, json, glob, statistics as stats +from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter +from app.core.chunker import assemble_chunks + +def iter_md(root: str): + for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True): + pn = p.replace("\\","/") + if any(x in pn for x in ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")): + continue + yield p + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True) + args = ap.parse_args() + + root = os.path.abspath(args.vault) + totals = [] + token_all = [] + issues = {"oversize": [], "broken_neighbors": [], "empty": []} + + for path in iter_md(root): + parsed = read_markdown(path) + fm = normalize_frontmatter(parsed.frontmatter) + try: + validate_required_frontmatter(fm) + except Exception: + continue + chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) + totals.append(len(chunks)) + + # Checks + prev = None + for ch in chunks: + token_all.append(ch.token_count) + if ch.token_count <= 0: + issues["empty"].append((fm["id"], ch.id)) + if prev and ch.neighbors_prev != prev.id: + issues["broken_neighbors"].append((fm["id"], ch.id, "prev-mismatch")) + prev = ch + # Oversize Heuristik: > 600 Tokens (global) markieren + for ch in chunks: + if ch.token_count > 600: + issues["oversize"].append((fm["id"], ch.id, ch.token_count)) + + summary = { + "notes": len(totals), + "chunks_total": sum(totals), + "chunks_per_note_avg": round(sum(totals)/max(1,len(totals)),2), + "tokens_avg": round(stats.mean(token_all),1) if token_all else 0, + "tokens_p95": (sorted(token_all)[int(0.95*len(token_all))] if token_all else 0), + "issues_counts": {k: len(v) for k,v in issues.items()} + } + print(json.dumps(summary, ensure_ascii=False, indent=2)) + # Optional: Liste der Issues ausgeben + for k, lst in issues.items(): + if lst: + print(f"\n{k.upper()} ({len(lst)}):") + for item in lst[:20]: + print(" -", item) + if len(lst) > 20: + print(f" … (+{len(lst)-20} weitere)") + +if __name__ == "__main__": + main()