# mindnet/scripts/preview_chunks.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/preview_chunks.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Zweck:
-------
Zeigt eine Vorschau der Chunk-Struktur für Notizen im Vault.
Nützlich zur Analyse der Chunking-Strategie und Nachbarschafts-Links.

Funktionsweise:
---------------
1. Scannt alle Markdown-Dateien im Vault
2. Für jede Datei:
   - Erzeugt Note-Payload
   - Erstellt Chunks via assemble_chunks
   - Erzeugt Chunk-Payloads
3. Gibt JSON pro Note aus mit Chunk-Details

Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON-Objekte (ein Objekt pro Note, eine Zeile pro Objekt)
- Jedes Objekt enthält:
  * note_id, title
  * num_chunks: Anzahl der Chunks
  * avg_tokens: Durchschnittliche Token pro Chunk
  * chunks: Array mit Chunk-Details (id, tokens, section, prev, next)

Verwendung:
-----------
- Analyse der Chunk-Verteilung
- Validierung der Nachbarschafts-Links
- Debugging von Chunking-Problemen

Hinweise:
---------
- Nutzt synchrones assemble_chunks (kann async sein)
- Zeigt nur Struktur, keine Inhalte

Aufruf:
-------
python3 -m scripts.preview_chunks --vault ./vault
python3 -m scripts.preview_chunks --vault ./vault --note-id my-note-id

Parameter:
----------
--vault PATH    Pfad zum Vault-Verzeichnis (erforderlich)
--note-id ID    Nur eine bestimmte Note verarbeiten (optional)

Änderungen:
-----------
v2.1.0 (2025-12-15): Dokumentation aktualisiert
v1.0.0: Initial Release
"""
from __future__ import annotations

import argparse
import asyncio
import glob
import inspect
import json
import os
import sys

from app.core.chunking import assemble_chunks
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
from app.core.ingestion.ingestion_note_payload import make_note_payload
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter


def iter_md(root: str) -> list[str]:
    """Return the paths of all ``*.md`` files below ``root``, sorted.

    The search is recursive (``**`` pattern). ``glob`` yields matches in an
    arbitrary, filesystem-dependent order, so the result is sorted to make
    the preview output reproducible across runs and machines.

    Args:
        root: Directory to search.

    Returns:
        Lexicographically sorted list of matching paths, each prefixed with
        ``root``. Empty if nothing matches.
    """
    pattern = os.path.join(root, "**", "*.md")
    return sorted(glob.glob(pattern, recursive=True))

def _resolve(result):
    """Return ``result``, running it to completion first if it is a coroutine.

    The module docstring notes that ``assemble_chunks`` may be async. Calling
    an ``async def`` function only creates a coroutine object; it has to be
    driven by an event loop before its value can be used.
    """
    if inspect.iscoroutine(result):
        return asyncio.run(result)
    return result


def _chunk_summary(chunk_pl):
    """Reduce one chunk payload to the structural fields shown in the preview."""
    # ``or {}`` also covers a payload whose "neighbors" key is present but None.
    neighbors = chunk_pl.get("neighbors") or {}
    return {
        "id": chunk_pl["id"],
        "tokens": chunk_pl["token_count"],
        "section": chunk_pl.get("section_title"),
        "prev": neighbors.get("prev"),
        "next": neighbors.get("next"),
    }


def main():
    """Print one JSON line per note describing its chunk structure.

    Command-line arguments:
        --vault PATH   Vault directory to scan for Markdown files (required).
        --note-id ID   Only process the note whose frontmatter ``id`` equals ID.

    For every Markdown file the frontmatter is normalized and validated.
    Notes that fail validation are skipped; the reason is written to stderr
    so that stdout stays pure JSON lines. For the remaining notes the chunks
    are assembled and a summary (note id, title, chunk count, average token
    count, per-chunk id/tokens/section/prev/next) is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    ap.add_argument("--note-id", help="Optional: nur eine Note (Frontmatter id) verarbeiten")
    args = ap.parse_args()

    vault = os.path.abspath(args.vault)
    for path in iter_md(vault):
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception as exc:
            # The validator's exception type is not visible here, so the broad
            # catch is kept; the skip is reported instead of being silent.
            print(f"skip {path}: {exc}", file=sys.stderr)
            continue
        if args.note_id and fm.get("id") != args.note_id:
            continue

        # Note payload (provides metadata such as the note path).
        note_pl = make_note_payload(parsed, vault_root=vault)
        # Build the chunks; works with a sync or an async assemble_chunks.
        chunks = _resolve(assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")))
        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)

        total_tokens = sum(c["token_count"] for c in chunk_pls)
        # max(1, ...) avoids a division by zero for notes without chunks.
        avg_tokens = round(total_tokens / max(1, len(chunk_pls)), 1)

        print(json.dumps({
            "note_id": fm["id"],
            "title": fm["title"],
            "num_chunks": len(chunk_pls),
            "avg_tokens": avg_tokens,
            "chunks": [_chunk_summary(c) for c in chunk_pls],
        }, ensure_ascii=False))
# Run the preview only when this module is executed as a script
# (e.g. ``python3 -m scripts.preview_chunks``), not when it is imported.
if __name__ == "__main__":
    main()