scripts/audit_chunks.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
This commit is contained in:
parent
1381db6db3
commit
ecfc19ee68
67
scripts/audit_chunks.py
Normal file
67
scripts/audit_chunks.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
import argparse, os, json, glob, statistics as stats
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.chunker import assemble_chunks
|
||||
|
||||
def iter_md(root: str):
    """Yield paths of Markdown files under *root*, recursively.

    Skips anything inside excluded folders: the Obsidian config dir,
    frontmatter backups, and the import staging area.
    """
    skip_markers = ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")
    pattern = os.path.join(root, "**", "*.md")
    for candidate in glob.glob(pattern, recursive=True):
        # Normalize separators so the markers match on Windows paths too.
        normalized = candidate.replace("\\", "/")
        if not any(marker in normalized for marker in skip_markers):
            yield candidate
|
||||
|
||||
def _collect_chunk_issues(note_id, chunks, token_all, issues):
    """Record token counts and structural defects for one note's chunks.

    Appends each chunk's token count to *token_all* and fills *issues*
    with (note_id, chunk_id, ...) tuples for empty chunks, broken
    prev-neighbor links, and oversize chunks.
    """
    prev = None
    for ch in chunks:
        token_all.append(ch.token_count)
        if ch.token_count <= 0:
            issues["empty"].append((note_id, ch.id))
        # Every chunk after the first must point back at its predecessor.
        if prev and ch.neighbors_prev != prev.id:
            issues["broken_neighbors"].append((note_id, ch.id, "prev-mismatch"))
        prev = ch
    # Oversize heuristic: flag chunks above 600 tokens (global threshold).
    for ch in chunks:
        if ch.token_count > 600:
            issues["oversize"].append((note_id, ch.id, ch.token_count))


def _build_summary(totals, token_all, issues):
    """Aggregate per-note chunk counts and token stats into a summary dict."""
    return {
        "notes": len(totals),
        "chunks_total": sum(totals),
        "chunks_per_note_avg": round(sum(totals) / max(1, len(totals)), 2),
        "tokens_avg": round(stats.mean(token_all), 1) if token_all else 0,
        # Crude p95: direct index at 95% of the sorted list, no interpolation.
        "tokens_p95": (sorted(token_all)[int(0.95 * len(token_all))] if token_all else 0),
        "issues_counts": {k: len(v) for k, v in issues.items()},
    }


def _print_issue_details(issues):
    """Print up to 20 example entries per non-empty issue category."""
    for kind, entries in issues.items():
        if not entries:
            continue
        print(f"\n{kind.upper()} ({len(entries)}):")
        for item in entries[:20]:
            print(" -", item)
        if len(entries) > 20:
            print(f" … (+{len(entries)-20} weitere)")


def main():
    """Audit chunk assembly across a vault and print a JSON summary.

    Walks every Markdown note under --vault, skips notes whose
    frontmatter fails validation, chunks the rest, and reports counts,
    token statistics, and structural issues (empty / broken-neighbor /
    oversize chunks).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    args = ap.parse_args()

    root = os.path.abspath(args.vault)
    totals = []      # chunks per note
    token_all = []   # token counts across all chunks
    issues = {"oversize": [], "broken_neighbors": [], "empty": []}

    for path in iter_md(root):
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception:
            # Notes with invalid frontmatter are out of scope for this audit.
            continue
        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
        totals.append(len(chunks))
        _collect_chunk_issues(fm["id"], chunks, token_all, issues)

    summary = _build_summary(totals, token_all, issues)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    # Optional: list the individual issues after the summary.
    _print_issue_details(issues)


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user