""" chunk_payload.py — Mindnet core payload builder (v0.5, 2025-11-08) Purpose ------- Builds a list of **JSON-serializable** payload dicts for chunks of a note to be stored in `_chunks`. Ensures `retriever_weight` is set on every chunk. Public API ---------- make_chunk_payloads(parsed_note, chunks, *args, retriever_weight=None, vault_root=None, type_defaults=None, **kwargs) -> list[dict] """ from __future__ import annotations from pathlib import Path from typing import Any, Dict, List, Optional, Union, Mapping import datetime, math Json = Union[None, bool, int, float, str, list, dict] # ------------------------- helpers ------------------------- def _is_mapping(x: Any) -> bool: return isinstance(x, Mapping) def _get(obj: Any, *names: str, default: Any=None) -> Any: for n in names: if hasattr(obj, n): try: return getattr(obj, n) except Exception: pass if _is_mapping(obj) and n in obj: try: return obj[n] except Exception: pass return default def _to_float(x: Any, default: float=1.0) -> float: if x is None: return float(default) if isinstance(x, (int, float)) and math.isfinite(x): return float(x) try: s = str(x).strip().replace(',', '.') return float(s) except Exception: return float(default) def _ensure_list(x: Any) -> list: if x is None: return [] if isinstance(x, list): return x if isinstance(x, (set, tuple)): return list(x) return [x] def _sanitize(obj: Any) -> Json: if obj is None or isinstance(obj, (bool, int, float, str)): return obj if callable(obj): return None if isinstance(obj, (list, tuple, set)): return [_sanitize(v) for v in obj] if isinstance(obj, dict): out = {} for k, v in obj.items(): if callable(v): continue out[str(k)] = _sanitize(v) return out if isinstance(obj, Path): return str(obj) if isinstance(obj, datetime.datetime): return obj.isoformat() if hasattr(obj, "__str__"): try: return str(obj) except Exception: return None return None def _compute_retriever_weight(explicit: Any, frontmatter: dict, type_defaults: Optional[dict], note_type: Optional[str]) -> float: if explicit is not None: return _to_float(explicit, 1.0) for key in ("retriever_weight", "retriever.weight", "retrieverWeight"): if key in frontmatter: return _to_float(frontmatter.get(key), 1.0) if type_defaults and note_type: tdef = type_defaults.get(note_type) or {} for key in ("retriever_weight", "retriever.weight", "retrieverWeight"): if key in tdef: return _to_float(tdef.get(key), 1.0) return 1.0 # ------------------------- public API ------------------------- def make_chunk_payloads(parsed_note: Any, chunks: List[Any], *args, retriever_weight: Optional[float]=None, vault_root: Optional[str]=None, type_defaults: Optional[dict]=None, **kwargs) -> List[Dict[str, Json]]: """ Build JSON-safe payloads for all chunks in a note. Parameters ---------- parsed_note : object or dict chunks : list of objects or dicts Expected per-chunk fields/keys (best-effort): text, index, start/end offsets, tokens/n_tokens, section/heading. retriever_weight : float|None vault_root : str|None type_defaults : dict|None Returns ------- list[dict] suitable for Qdrant payloads """ fm = _get(parsed_note, "frontmatter", "fm", default={}) if not isinstance(fm, dict): fm = {} note_id = _get(parsed_note, "note_id", "id", default=fm.get("id")) title = _get(parsed_note, "title", default=fm.get("title")) ntype = _get(parsed_note, "type", default=fm.get("type")) raw_path = _get(parsed_note, "path", "rel_path", "relpath", default=fm.get("path")) chunk_profile = _get(parsed_note, "chunk_profile", "profile", default=fm.get("chunk_profile")) tags = _ensure_list(_get(parsed_note, "tags", default=fm.get("tags"))) rel_path = raw_path if raw_path and vault_root: try: rel_path = str(Path(raw_path)).replace(str(Path(vault_root)), "").lstrip("/\\") except Exception: rel_path = str(raw_path) rw = _compute_retriever_weight(retriever_weight, fm, type_defaults, ntype) out: List[Dict[str, Json]] = [] for i, ch in enumerate(chunks): # tolerate missing/variant fields text = _get(ch, "text", "content", "body", "value", default="") idx = _get(ch, "index", "idx", default=i) start = _get(ch, "start", "start_char", "offset_start", "char_start", default=None) end = _get(ch, "end", "end_char", "offset_end", "char_end", default=None) tokens = _get(ch, "n_tokens", "tokens", "token_count", default=None) section = _get(ch, "section", "section_title", "heading", default=None) section_level = _get(ch, "section_level", "heading_level", default=None) payload = { "note_id": note_id, "title": title, "type": ntype, "path": rel_path or raw_path, "chunk_index": int(idx) if isinstance(idx, (int, float)) else i, "text": text, "start": start, "end": end, "tokens": tokens, "section": section, "section_level": section_level, "chunk_profile": chunk_profile, "tags": tags, "retriever_weight": float(rw), } out.append(_sanitize(payload)) return out