# mindnet/scripts/dump_note_chunks.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/dump_note_chunks.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Zweck:
-------
Gibt die Chunks einer bestimmten Note in lesbarer Form aus.
Nützlich zur Analyse der Chunk-Struktur und -Inhalte.

Funktionsweise:
---------------
1. Sucht Note nach note-id im Vault
2. Erzeugt Chunks via assemble_chunks
3. Gibt Chunks formatiert aus:
   - Chunk-ID, Token-Anzahl, Section-Pfad
   - Vollständiger Chunk-Text

Ergebnis-Interpretation:
------------------------
- Ausgabe: Formatierter Text
  * Header: "# Titel (note-id) — X chunks"
  * Pro Chunk: "--- chunk_id | tokens | section_path ---"
  * Gefolgt vom Chunk-Text
- Exit-Code 0: Erfolgreich
- Fehlermeldung, wenn Note nicht gefunden

Verwendung:
-----------
- Analyse der Chunk-Struktur einer spezifischen Note
- Debugging von Chunking-Problemen
- Validierung der Chunk-Inhalte

Hinweise:
---------
- Nutzt synchrones assemble_chunks (kann async sein)
- Gibt nur erste gefundene Note aus (bei Duplikaten)

Aufruf:
-------
python3 -m scripts.dump_note_chunks --vault ./vault --note-id my-note-id

Parameter:
----------
--vault PATH    Pfad zum Vault-Verzeichnis (erforderlich)
--note-id ID    Note-ID zum Dumpen (erforderlich)

Änderungen:
-----------
v2.1.0 (2025-12-15): Dokumentation aktualisiert
v1.0.0: Initial Release
"""
from __future__ import annotations
import argparse
import asyncio
import glob
import inspect
import os
import sys

from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.chunking import assemble_chunks

def iter_md(root: str):
    """Return the paths of all ``*.md`` files below *root*, sorted.

    The search is recursive. *root* is treated as a literal directory path:
    glob metacharacters in it (``*``, ``?``, ``[``) are escaped so that a
    vault located at e.g. ``notes[2024]`` is still scanned instead of being
    silently interpreted as a pattern that matches nothing.

    Args:
        root: Directory to scan.

    Returns:
        A list of matching file paths in lexicographic order, so repeated
        runs visit the files in the same sequence regardless of the order
        the filesystem happens to report them in.
    """
    pattern = os.path.join(glob.escape(root), "**", "*.md")
    return sorted(glob.glob(pattern, recursive=True))

def main():
    """Print the chunks of one note from a vault in readable form.

    Reads ``--vault`` and ``--note-id`` from the command line, walks the
    Markdown files returned by ``iter_md`` and, for the first note whose
    normalized frontmatter ``id`` equals the requested id, prints a header
    line followed by one section per chunk (chunk id, token count, section
    path, then the stripped chunk text). Only the first matching note is
    dumped.

    Files whose frontmatter is rejected by ``validate_required_frontmatter``
    are skipped.

    Raises:
        SystemExit: With status 1 (after writing a message to stderr) when
            no note with the requested id was found.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    ap.add_argument("--note-id", required=True)
    args = ap.parse_args()

    root = os.path.abspath(args.vault)
    for path in iter_md(root):
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception:
            # Best-effort scan: notes with unusable frontmatter can never be
            # the requested one, so they are skipped rather than aborting.
            continue
        if fm.get("id") != args.note_id:
            continue

        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
        # The module docstring notes that assemble_chunks may be async; if a
        # coroutine comes back, drive it to completion so the dump works
        # with either variant.
        if inspect.iscoroutine(chunks):
            chunks = asyncio.run(chunks)

        print(f"# {fm['title']} ({fm['id']}) — {len(chunks)} chunks\n")
        for ch in chunks:
            print(f"--- {ch.id} | {ch.token_count} tok | {ch.section_path} ---")
            print(ch.text.strip())
            print()
        return

    # Report failure on stderr with a non-zero status so the message does not
    # end up in redirected dump output and callers can detect the miss.
    print("Note nicht gefunden oder Frontmatter unvollständig.", file=sys.stderr)
    sys.exit(1)

# Script entry point: run the CLI only when executed directly (or via
# ``python -m``), not when the module is imported.
if __name__ == "__main__":
    raise SystemExit(main())