All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
181 lines
5.9 KiB
Python
181 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
FILE: scripts/payload_dryrun.py
|
|
VERSION: 2.1.0 (2025-12-15)
|
|
STATUS: Active
|
|
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
|
|
|
|
Zweck:
|
|
-------
|
|
Zeigt die Payload-Struktur, die vor dem Datenbank-Upsert erzeugt würde.
|
|
Nützlich zur Validierung der Payload-Erzeugung und Konfiguration (types.yaml).
|
|
|
|
Funktionsweise:
|
|
---------------
|
|
1. Scannt alle Markdown-Dateien im Vault
|
|
2. Für jede Datei:
|
|
- Parst Markdown mit Frontmatter
|
|
- Erzeugt Note-Payload (wie in make_note_payload)
|
|
- Erstellt Chunks via assemble_chunks
|
|
- Erzeugt Chunk-Payloads (wie in make_chunk_payloads)
|
|
- Optional: Erzeugt Edges (mit --with-edges)
|
|
3. Gibt JSON pro Note aus mit Payload-Zusammenfassung
|
|
|
|
Ergebnis-Interpretation:
|
|
------------------------
|
|
- Ausgabe: JSON-Objekte (ein Objekt pro Note, eine Zeile pro Objekt)
|
|
- Jedes Objekt enthält:
|
|
* note_id, title, type, path
|
|
* note_payload: retriever_weight, chunk_profile
|
|
* chunks_summary: count, first (erste 3 Chunks mit Metadaten)
|
|
* edges_summary (nur mit --with-edges): total, by_kind
|
|
|
|
Verwendung:
|
|
-----------
|
|
- Validierung der types.yaml Konfiguration
|
|
- Debugging von Payload-Erzeugungsproblemen
|
|
- Analyse der Chunk-Struktur vor dem Import
|
|
- Prüfung von retriever_weight und chunk_profile Zuweisungen
|
|
|
|
Hinweise:
|
|
---------
|
|
- types.yaml ist maßgeblich für Payload-Erzeugung
|
|
- Frontmatter-Überschreibungen werden berücksichtigt
|
|
- Keine Datenbank-Operationen, rein analytisch
|
|
- Chunking nutzt vollständiges assemble_chunks (nicht vereinfacht)
|
|
|
|
Aufruf:
|
|
-------
|
|
python3 -m scripts.payload_dryrun --vault ./vault
|
|
python3 -m scripts.payload_dryrun --vault ./vault --note-id my-note-id
|
|
python3 -m scripts.payload_dryrun --vault ./vault --with-edges
|
|
|
|
Parameter:
|
|
----------
|
|
--vault PATH Pfad zum Vault-Verzeichnis (erforderlich)
|
|
--note-id ID Nur eine bestimmte Note verarbeiten (optional)
|
|
--with-edges Edge-Erzeugung einschließen (optional)
|
|
|
|
Änderungen:
|
|
-----------
|
|
v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung
|
|
- Entfernt: Fallback für app.core.edges
|
|
- Verwendet direkt: app.core.derive_edges
|
|
- Parameter korrigiert: chunk_payloads → chunks, note_level_refs → note_level_references
|
|
v1.0.0: Erster Release
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse, os, json, asyncio
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
|
from app.core.chunking import assemble_chunks
|
|
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
|
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
|
|
from app.core.derive_edges import build_edges_for_note
|
|
|
|
async def process_file(path: str, root: str, args) -> Optional[Dict[str, Any]]:
    """Build the dry-run payload summary for a single Markdown note.

    The file is parsed, its frontmatter is normalized and validated, and the
    note payload, the chunks and the chunk payloads are produced. Optionally
    the edges are derived and counted per kind. Nothing is persisted; the
    only side effect is one JSON error line on stdout when frontmatter
    validation fails.

    Args:
        path: Path of the Markdown file to process.
        root: Vault root directory, passed to ``make_note_payload`` as
            ``vault_root``.
        args: Parsed CLI namespace; ``note_id`` and ``with_edges`` are read.

    Returns:
        A dict with ``note_id``, ``title``, ``type``, ``note_payload``,
        ``chunks_summary`` and ``path`` (plus ``edges_summary`` when
        ``args.with_edges`` is set), or ``None`` when the file could not be
        parsed, its frontmatter is invalid, or it does not match
        ``args.note_id``.
    """
    parsed = read_markdown(path)
    if not parsed:
        return None

    fm = normalize_frontmatter(parsed.frontmatter)
    try:
        validate_required_frontmatter(fm)
    except Exception as e:
        # Emit the failure as a JSON line and skip the note. Non-ASCII text
        # is kept readable, matching the result lines this script prints.
        print(
            json.dumps(
                {"path": path, "error": f"invalid frontmatter: {e}"},
                ensure_ascii=False,
            )
        )
        return None

    # With --note-id only the matching note is processed.
    if args.note_id and fm.get("id") != args.note_id:
        return None

    # Note payload built exactly the way the importer builds it
    # (types.yaml is authoritative).
    note_pl = make_note_payload(
        parsed,
        vault_root=root,
        hash_source="parsed",
        hash_normalize="canonical",
        file_path=path,
    )
    note_id = note_pl.get("note_id") or fm.get("id")
    # Resolve the path once so the chunk note and the chunk payload call
    # agree; previously the latter indexed note_pl["path"] directly and
    # raised KeyError when the payload carried no path.
    note_path = note_pl.get("path") or path

    body_text = getattr(parsed, "body", "") or ""
    chunks = await assemble_chunks(fm["id"], body_text, fm.get("type", "concept"))

    chunk_note = {
        "frontmatter": fm,
        "id": fm.get("id"),
        "type": fm.get("type"),
        "title": fm.get("title"),
        "path": note_path,
        "note_id": note_pl.get("note_id"),
        "tags": fm.get("tags"),
    }
    # Per the original note here: make_chunk_payloads also takes its values
    # from types.yaml (frontmatter is ignored).
    chunk_pls = make_chunk_payloads(
        chunk_note,
        note_path,
        chunks,
        note_text=body_text,
        types_cfg=None,  # loaded from file; no override from outside
        file_path=path,
    )

    # Only these keys of the first three chunk payloads are reported.
    summary_keys = (
        "chunk_id",
        "index",
        "ord",
        "retriever_weight",
        "chunk_profile",
        "neighbors_prev",
        "neighbors_next",
    )
    out = {
        "note_id": note_id,
        "title": fm.get("title"),
        "type": fm.get("type"),
        "note_payload": {
            "retriever_weight": note_pl.get("retriever_weight"),
            "chunk_profile": note_pl.get("chunk_profile"),
        },
        "chunks_summary": {
            "count": len(chunk_pls),
            "first": [
                {k: pl.get(k) for k in summary_keys} for pl in chunk_pls[:3]
            ],
        },
        "path": note_pl.get("path"),
    }

    if args.with_edges:
        edges = build_edges_for_note(
            note_id=note_id,
            chunks=chunk_pls,
            note_level_references=note_pl.get("references") or [],
            include_note_scope_refs=False,
        )
        # Count edges per kind; "relation" wins over "kind", and edges
        # carrying neither are counted under "edge".
        kinds: Dict[str, int] = {}
        for e in edges:
            k = e.get("relation") or e.get("kind") or "edge"
            kinds[k] = kinds.get(k, 0) + 1
        out["edges_summary"] = {"total": len(edges), "by_kind": kinds}

    return out
|
|
|
|
async def main_async():
    """Parse the CLI arguments and print one JSON line per processed note.

    Reads ``--vault`` (required), ``--note-id`` and ``--with-edges`` from the
    command line, collects every ``*.md`` file below the vault directory
    (extension matched case-insensitively), and processes the files in
    sorted path order. Each non-empty result of ``process_file`` is written
    to stdout as a single JSON line.

    Raises:
        SystemExit: With exit code 2 (via ``argparse``) when arguments are
            invalid or ``--vault`` is not an existing directory.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    ap.add_argument("--note-id")
    ap.add_argument("--with-edges", action="store_true")
    args = ap.parse_args()

    root = os.path.abspath(args.vault)
    # os.walk silently yields nothing for a missing path or a plain file,
    # which would look like an empty vault; fail loudly instead.
    if not os.path.isdir(root):
        ap.error(f"vault directory not found: {root}")

    files = sorted(
        os.path.join(dp, fn)
        for dp, _, fns in os.walk(root)
        for fn in fns
        if fn.lower().endswith(".md")
    )

    for path in files:
        result = await process_file(path, root, args)
        if result:
            print(json.dumps(result, ensure_ascii=False))
|
|
|
|
def main() -> None:
    """Synchronous entry point: run ``main_async`` to completion."""
    coro = main_async()
    asyncio.run(coro)


# Start the CLI only when this module is executed as the main program.
if __name__ == "__main__":
    main()
|