# mindnet/scripts/payload_dryrun.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/payload_dryrun.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP-14/WP-15b)

Zweck:
-------
Zeigt die Payload-Struktur, die vor dem Datenbank-Upsert erzeugt würde.
Nützlich zur Validierung der Payload-Erzeugung und Konfiguration (types.yaml).

Funktionsweise:
---------------
1. Scannt alle Markdown-Dateien im Vault
2. Für jede Datei:
   - Parst Markdown mit Frontmatter
   - Erzeugt Note-Payload (wie in make_note_payload)
   - Erstellt Chunks via assemble_chunks
   - Erzeugt Chunk-Payloads (wie in make_chunk_payloads)
   - Optional: Erzeugt Edges (mit --with-edges)
3. Gibt JSON pro Note aus mit Payload-Zusammenfassung

Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON-Objekte (ein Objekt pro Note, eine Zeile pro Objekt)
- Jedes Objekt enthält:
  * note_id, title, type, path
  * note_payload: retriever_weight, chunk_profile
  * chunks_summary: count, first (erste 3 Chunks mit Metadaten)
  * edges_summary (nur mit --with-edges): total, by_kind

Verwendung:
-----------
- Validierung der types.yaml Konfiguration
- Debugging von Payload-Erzeugungsproblemen
- Analyse der Chunk-Struktur vor dem Import
- Prüfung von retriever_weight und chunk_profile Zuweisungen

Hinweise:
---------
- types.yaml ist maßgeblich für Payload-Erzeugung
- Frontmatter-Überschreibungen werden berücksichtigt
- Keine Datenbank-Operationen, rein analytisch
- Chunking nutzt vollständiges assemble_chunks (nicht vereinfacht)

Aufruf:
-------
python3 -m scripts.payload_dryrun --vault ./vault
python3 -m scripts.payload_dryrun --vault ./vault --note-id my-note-id
python3 -m scripts.payload_dryrun --vault ./vault --with-edges

Parameter:
----------
--vault PATH    Pfad zum Vault-Verzeichnis (erforderlich)
--note-id ID    Nur eine bestimmte Note verarbeiten (optional)
--with-edges    Edge-Erzeugung einschließen (optional)

Änderungen:
-----------
v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung
  - Entfernt: Fallback für app.core.edges
  - Verwendet direkt: app.core.derive_edges
  - Parameter korrigiert: chunk_payloads → chunks, note_level_refs → note_level_references
v1.0.0: Erster Release
"""
from __future__ import annotations
import argparse, os, json, asyncio
from typing import Any, Dict, List, Optional

from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.chunking import assemble_chunks
from app.core.ingestion.ingestion_note_payload import make_note_payload
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
from app.core.derive_edges import build_edges_for_note

async def process_file(path: str, root: str, args: argparse.Namespace) -> Optional[Dict[str, Any]]:
    """Build the dry-run payload summary for a single Markdown file.

    The file is parsed, its frontmatter normalised and validated, and the note
    payload, chunks and chunk payloads are produced via ``make_note_payload``,
    ``assemble_chunks`` and ``make_chunk_payloads``. The result is condensed
    into one JSON-serialisable dict.

    Args:
        path: Path of the Markdown file to process.
        root: Vault root directory; forwarded to ``make_note_payload`` as
            ``vault_root``.
        args: Parsed CLI arguments. ``args.note_id`` (optional filter) and
            ``args.with_edges`` (include an edge summary) are read.

    Returns:
        A dict with ``note_id``, ``title``, ``type``, ``note_payload``,
        ``chunks_summary``, ``path`` and, when ``args.with_edges`` is set,
        ``edges_summary``. ``None`` is returned when the file yields no parse
        result, when its frontmatter is invalid (a JSON error object is
        printed in that case), or when it does not match ``args.note_id``.
    """
    parsed = read_markdown(path)
    if not parsed:
        return None

    fm = normalize_frontmatter(parsed.frontmatter)
    try:
        validate_required_frontmatter(fm)
    except Exception as e:
        # Emit the problem as a JSON line so the output stream stays
        # machine-readable. ensure_ascii=False matches the regular result
        # lines, so non-ASCII paths/messages are not escaped on this path only.
        print(json.dumps({"path": path, "error": f"invalid frontmatter: {e}"}, ensure_ascii=False))
        return None

    # Note: the filter runs after validation, so invalid notes are reported
    # even when a specific --note-id was requested.
    if args.note_id and fm.get("id") != args.note_id:
        return None

    # Original author's note: the note payload is built exactly the way the
    # importer builds it (types.yaml is authoritative).
    note_pl = make_note_payload(
        parsed,
        vault_root=root,
        hash_source="parsed",
        hash_normalize="canonical",
        file_path=path,
    )

    # Resolve these once so every consumer below sees the same values. The
    # file path is the fallback when the payload carries no usable "path";
    # previously one call site indexed note_pl["path"] directly and could
    # raise KeyError while the other already used this fallback.
    note_id = note_pl.get("note_id") or fm.get("id")
    note_path = note_pl.get("path") or path

    body_text = getattr(parsed, "body", "") or ""
    chunks = await assemble_chunks(fm["id"], body_text, fm.get("type", "concept"))

    chunk_note = {
        "frontmatter": fm,
        "id": fm.get("id"),
        "type": fm.get("type"),
        "title": fm.get("title"),
        "path": note_path,
        "note_id": note_pl.get("note_id"),
        "tags": fm.get("tags"),
    }
    # Original author's note: make_chunk_payloads also derives its values from
    # types.yaml (frontmatter is ignored there).
    chunk_pls = make_chunk_payloads(
        chunk_note,
        note_path,
        chunks,
        note_text=body_text,
        types_cfg=None,  # per original note: loaded from file, no external override
        file_path=path,
    )

    # Only these chunk fields are shown, and only for the first three chunks.
    summary_keys = (
        "chunk_id",
        "index",
        "ord",
        "retriever_weight",
        "chunk_profile",
        "neighbors_prev",
        "neighbors_next",
    )
    out: Dict[str, Any] = {
        "note_id": note_id,
        "title": fm.get("title"),
        "type": fm.get("type"),
        "note_payload": {
            "retriever_weight": note_pl.get("retriever_weight"),
            "chunk_profile": note_pl.get("chunk_profile"),
        },
        "chunks_summary": {
            "count": len(chunk_pls),
            "first": [{k: pl.get(k) for k in summary_keys} for pl in chunk_pls[:3]],
        },
        # Reported as-is (no fallback) so the summary mirrors the payload.
        "path": note_pl.get("path"),
    }

    if args.with_edges:
        edges = build_edges_for_note(
            note_id=note_id,
            chunks=chunk_pls,
            note_level_references=note_pl.get("references") or [],
            include_note_scope_refs=False,
        )
        # Count edges per kind; "relation" wins over "kind", and edges with
        # neither are grouped under the generic label "edge".
        kinds: Dict[str, int] = {}
        for e in edges:
            k = e.get("relation") or e.get("kind") or "edge"
            kinds[k] = kinds.get(k, 0) + 1
        out["edges_summary"] = {"total": len(edges), "by_kind": kinds}

    return out

async def main_async() -> None:
    """Parse the CLI arguments and print one JSON line per processed note.

    All ``*.md`` files (case-insensitive suffix) below ``--vault`` are
    collected, sorted by full path and passed to ``process_file`` one after
    another. Every non-empty result is printed as a single JSON line with
    ``ensure_ascii=False``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) for invalid arguments
            or when ``--vault`` does not point to an existing directory.
    """
    ap = argparse.ArgumentParser(
        description="Show the payloads that would be generated per note, without a database upsert."
    )
    ap.add_argument("--vault", required=True, help="path to the vault directory")
    ap.add_argument("--note-id", help="only process the note with this id")
    ap.add_argument(
        "--with-edges",
        action="store_true",
        help="also build edges and include an edge summary",
    )
    args = ap.parse_args()

    root = os.path.abspath(args.vault)
    if not os.path.isdir(root):
        # os.walk ignores errors by default, so a missing or mistyped vault
        # path would otherwise produce empty output and exit status 0.
        ap.error(f"vault directory not found: {root}")

    # Sort by full path so the output order is deterministic.
    files: List[str] = sorted(
        os.path.join(dp, fn)
        for dp, _, fns in os.walk(root)
        for fn in fns
        if fn.lower().endswith(".md")
    )

    for path in files:
        result = await process_file(path, root, args)
        if result:
            print(json.dumps(result, ensure_ascii=False))

def main():
    """Synchronous entry point: run ``main_async`` to completion via asyncio."""
    cli_coroutine = main_async()
    asyncio.run(cli_coroutine)

# Run the CLI only when this module is executed as a script (including
# ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()