scripts/audit_chunks.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
This commit is contained in:
parent
1381db6db3
commit
ecfc19ee68
67
scripts/audit_chunks.py
Normal file
67
scripts/audit_chunks.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
import argparse, os, json, glob, statistics as stats
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.chunker import assemble_chunks
|
||||
|
||||
def iter_md(root: str):
    """Yield paths of Markdown files under *root*, recursively.

    Skips anything inside excluded folders: the Obsidian config dir,
    frontmatter backups, and the import staging area.
    """
    skip_markers = ("/.obsidian/", "/_backup_frontmatter/", "/_imported/")
    pattern = os.path.join(root, "**", "*.md")
    for candidate in glob.glob(pattern, recursive=True):
        # Normalize separators so the markers match on Windows paths too.
        normalized = candidate.replace("\\", "/")
        if not any(marker in normalized for marker in skip_markers):
            yield candidate
|
||||
|
||||
def _collect_chunk_issues(note_id, chunks, token_all, issues):
    """Record token counts and structural defects for one note's chunks.

    Appends each chunk's token count to *token_all* and fills *issues*
    with (note_id, chunk_id, ...) tuples for empty chunks, broken
    prev-neighbor links, and oversize chunks.
    """
    prev = None
    for ch in chunks:
        token_all.append(ch.token_count)
        if ch.token_count <= 0:
            issues["empty"].append((note_id, ch.id))
        # Every chunk after the first must point back at its predecessor.
        if prev and ch.neighbors_prev != prev.id:
            issues["broken_neighbors"].append((note_id, ch.id, "prev-mismatch"))
        prev = ch
    # Oversize heuristic: flag chunks above 600 tokens (global threshold).
    for ch in chunks:
        if ch.token_count > 600:
            issues["oversize"].append((note_id, ch.id, ch.token_count))


def _build_summary(totals, token_all, issues):
    """Aggregate per-note chunk counts and token stats into a summary dict."""
    return {
        "notes": len(totals),
        "chunks_total": sum(totals),
        "chunks_per_note_avg": round(sum(totals) / max(1, len(totals)), 2),
        "tokens_avg": round(stats.mean(token_all), 1) if token_all else 0,
        # Crude p95: direct index at 95% of the sorted list, no interpolation.
        "tokens_p95": (sorted(token_all)[int(0.95 * len(token_all))] if token_all else 0),
        "issues_counts": {k: len(v) for k, v in issues.items()},
    }


def _print_issue_details(issues):
    """Print up to 20 example entries per non-empty issue category."""
    for kind, entries in issues.items():
        if not entries:
            continue
        print(f"\n{kind.upper()} ({len(entries)}):")
        for item in entries[:20]:
            print(" -", item)
        if len(entries) > 20:
            print(f" … (+{len(entries)-20} weitere)")


def main():
    """Audit chunk assembly across a vault and print a JSON summary.

    Walks every Markdown note under --vault, skips notes whose
    frontmatter fails validation, chunks the rest, and reports counts,
    token statistics, and structural issues (empty / broken-neighbor /
    oversize chunks).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    args = ap.parse_args()

    root = os.path.abspath(args.vault)
    totals = []      # chunks per note
    token_all = []   # token counts across all chunks
    issues = {"oversize": [], "broken_neighbors": [], "empty": []}

    for path in iter_md(root):
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception:
            # Notes with invalid frontmatter are out of scope for this audit.
            continue
        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
        totals.append(len(chunks))
        _collect_chunk_issues(fm["id"], chunks, token_all, issues)

    summary = _build_summary(totals, token_all, issues)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    # Optional: list the individual issues after the summary.
    _print_issue_details(issues)


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user