#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
scripts/payload_dryrun.py

Dry run of the note/chunk payload construction for a vault of Markdown notes.

For every ``*.md`` file below ``--vault`` the script parses the note, builds
the note payload and the chunk payloads, and prints ONE JSON object per note
to stdout. The object shows the resolved ``retriever_weight`` and
``chunk_profile``, the values mirrored into the note payload, and a summary of
the first three chunk payloads. Nothing is written anywhere else.

Usage:
    python scripts/payload_dryrun.py --vault PATH [--note-id ID] [--with-edges]

Environment:
    MINDNET_TYPES_FILE  Path of the type registry YAML
                        (default: ./config/types.yaml).
"""
from __future__ import annotations

import argparse
import json
import os
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
try:
    from app.core.derive_edges import build_edges_for_note
except Exception:
    # Deliberately broad (kept from the original): any failure while importing
    # ``derive_edges`` falls back to the ``edges`` module.
    from app.core.edges import build_edges_for_note  # type: ignore

# Chunk payload keys echoed in the per-note "chunks_summary".
_CHUNK_SUMMARY_KEYS = (
    "chunk_id",
    "index",
    "ord",
    "retriever_weight",
    "chunk_profile",
    "neighbors_prev",
    "neighbors_next",
)

# Number of leading chunks shown in the summary.
_CHUNK_SUMMARY_LIMIT = 3


def _env(n: str, d: Optional[str] = None) -> str:
    """Return environment variable *n*, or *d* (or ``""``) when it is unset."""
    v = os.getenv(n)
    return v if v is not None else (d or "")


def load_types() -> dict:
    """Load the type registry YAML named by ``MINDNET_TYPES_FILE``.

    Returns:
        The parsed mapping. An empty dict is returned when the file does not
        exist, cannot be read or parsed, is empty, or when its top-level YAML
        value is not a mapping (callers rely on ``dict`` methods).
    """
    p = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        # A missing registry is the normal "no configuration" case.
        return {}
    except (OSError, ValueError, yaml.YAMLError) as exc:
        # Stay best-effort, but do not hide a broken config file. The warning
        # goes to stderr so the JSON lines on stdout stay machine-readable.
        print(f"warning: could not load types file {p!r}: {exc}", file=sys.stderr)
        return {}
    return data if isinstance(data, dict) else {}


def _deep_get(root: Any, path: str) -> Any:
    """Follow the dot-separated *path* through nested dicts.

    Returns ``None`` as soon as a level is not a dict or lacks the key.
    """
    cur = root
    for key in path.split("."):
        if not isinstance(cur, dict) or key not in cur:
            return None
        cur = cur[key]
    return cur


def _types_section(reg: Any) -> Dict[str, Any]:
    """Return ``reg["types"]`` when it is a dict, else *reg* itself.

    A *reg* that is not a dict yields an empty dict.
    """
    if not isinstance(reg, dict):
        return {}
    types = reg.get("types")
    return types if isinstance(types, dict) else reg


def _as_float(value: Any) -> Optional[float]:
    """Convert *value* to ``float``; return ``None`` if missing or not convertible."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def eff_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> Optional[str]:
    """Resolve the effective chunk profile of a note.

    A string ``chunk_profile`` in the frontmatter *fm* wins. Otherwise the
    string ``chunk_profile`` of the *note_type* entry in the types section of
    *reg* is used. Returns ``None`` when neither is present.
    """
    if isinstance(fm.get("chunk_profile"), str):
        return fm["chunk_profile"]
    tp = _types_section(reg).get(note_type, {})
    if isinstance(tp, dict) and isinstance(tp.get("chunk_profile"), str):
        return tp["chunk_profile"]
    return None


def eff_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) -> float:
    """Resolve the effective retriever weight of a note.

    Order of precedence: a float-convertible ``retriever_weight`` in the
    frontmatter *fm*, then the first float-convertible value found under the
    candidate paths below (looked up in the types section of *reg*), then
    ``1.0``.
    """
    weight = _as_float(fm.get("retriever_weight"))
    if weight is not None:
        return weight

    types = _types_section(reg)
    candidates = (
        f"{note_type}.retriever_weight",
        f"{note_type}.retriever.weight",
        f"{note_type}.retrieval.weight",
        "defaults.retriever_weight",
        "defaults.retriever.weight",
        "global.retriever_weight",
        "global.retriever.weight",
    )
    for path in candidates:
        weight = _as_float(_deep_get(types, path))
        if weight is not None:
            return weight
    return 1.0


def _find_markdown_files(root: str) -> List[str]:
    """Return the sorted paths of all ``.md`` files (case-insensitive) below *root*."""
    files: List[str] = []
    for dp, _, fns in os.walk(root):
        files.extend(os.path.join(dp, fn) for fn in fns if fn.lower().endswith(".md"))
    files.sort()
    return files


def main(argv: Optional[List[str]] = None) -> None:
    """Run the dry run and print one JSON line per processed note.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser(
        description="Print the note/chunk payloads that would be built for a vault."
    )
    ap.add_argument("--vault", required=True, help="directory that is searched for .md files")
    ap.add_argument("--note-id", help="only process the note whose frontmatter id equals this value")
    ap.add_argument("--with-edges", action="store_true", help="also build edges and print counts per kind")
    args = ap.parse_args(argv)

    reg = load_types()
    types_cfg = _types_section(reg)
    root = os.path.abspath(args.vault)

    for path in _find_markdown_files(root):
        parsed = read_markdown(path)
        if not parsed:
            continue
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception as e:
            # Report the note as a JSON error line and keep going with the rest.
            print(json.dumps({"path": path, "error": f"invalid frontmatter: {e}"}, ensure_ascii=False))
            continue

        if args.note_id and fm.get("id") != args.note_id:
            continue

        # Note payload
        note_pl = make_note_payload(
            parsed,
            vault_root=root,
            hash_mode="body",
            hash_normalize="canonical",
            hash_source="parsed",
            file_path=path,
        )

        note_type = fm.get("type") or "concept"
        cp = eff_chunk_profile(note_type, fm, reg)
        rw = eff_retriever_weight(note_type, fm, reg)

        # The importer does the same: explicitly mirror the resolved values
        # into the note payload.
        if cp is not None:
            note_pl["chunk_profile"] = cp
        note_pl["retriever_weight"] = rw

        body_text = getattr(parsed, "body", "") or ""
        # Use the same type fallback as the profile/weight resolution above so
        # an empty ``type`` value does not reach the chunker.
        chunks = assemble_chunks(fm["id"], body_text, note_type)

        # Fall back to the file path when the note payload carries no "path".
        note_path = note_pl.get("path") or path
        note_id = note_pl.get("note_id") or fm.get("id")

        chunk_note = {
            "frontmatter": fm,
            "id": fm.get("id"),
            "type": fm.get("type"),
            "title": fm.get("title"),
            "path": note_path,
            "note_id": note_pl.get("note_id"),
            "tags": fm.get("tags"),
        }
        chunk_pls = make_chunk_payloads(
            chunk_note,
            note_path,
            chunks,
            note_text=body_text,
            types_cfg=types_cfg,
            file_path=path,
        )

        out = {
            "note_id": note_id,
            "title": fm.get("title"),
            "type": fm.get("type"),
            "resolved": {"retriever_weight": rw, "chunk_profile": cp},
            "note_payload": {k: note_pl.get(k) for k in ("retriever_weight", "chunk_profile")},
            "chunks_summary": {
                "count": len(chunk_pls),
                "first": [
                    {k: chunk_pl.get(k) for k in _CHUNK_SUMMARY_KEYS}
                    for chunk_pl in chunk_pls[:_CHUNK_SUMMARY_LIMIT]
                ],
            },
            "path": note_pl.get("path"),
        }

        if args.with_edges:
            edges = build_edges_for_note(
                note_id=note_id,
                chunk_payloads=chunk_pls,
                note_level_refs=note_pl.get("references") or [],
                include_note_scope_refs=False,
            )
            # Count edges per kind; "relation" wins over "kind", default "edge".
            kinds = Counter(
                (e.get("relation") or e.get("kind") or "edge") for e in edges
            )
            out["edges_summary"] = {"total": len(edges), "by_kind": dict(kinds)}

        print(json.dumps(out, ensure_ascii=False))


if __name__ == "__main__":
    main()