# app/core/chunk_payload.py
# Version: 1.2.0 (2025-11-08)
# Purpose:
#   Build robust Qdrant payloads for CHUNK points.
#
# Highlights:
# - Works with dict-like chunks and simple objects; supports (text, idx) tuples.
# - Accepts legacy/extra kwargs (e.g., vault_root) without failing.
# - Copies canonical note fields onto each chunk (note_id/title/type/tags/path).
# - Sets 'text' and 'chunk_index' per chunk.
# - Reliably propagates `retriever_weight` onto every chunk if provided in
#   frontmatter or explicitly.
#
# Usage:
#   payloads = make_chunk_payloads(note, chunks, retriever_weight=None, base_payload=None, vault_root="/path/to/vault")
#
# Changelog:
#   1.2.0 (2025-11-08) Accept legacy kwargs, robust getters, propagate retriever_weight.
#   1.1.0 (2025-11-08) Initial robust rewrite with attribute/dict support.

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

# Keys/attributes probed, in priority order, when looking for a chunk's text.
_TEXT_KEYS: Tuple[str, ...] = ("text", "window", "body", "content")


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from a dict or an attribute-style object.

    For dicts the mapping entry is consulted first, so a key that happens to
    share its name with a dict method (``items``, ``keys``, ...) yields the
    stored value rather than the bound method. A missing or ``None`` value
    results in ``default``.
    """
    if obj is None:
        return default
    if isinstance(obj, dict):
        val = obj.get(key)
        if val is not None:
            return val
        if hasattr(dict, key):
            # Built-in dict methods are never note data.
            return default
    # Attribute fallback (also covers dict subclasses carrying attributes).
    # Best effort: a property that raises is treated as "not available".
    try:
        val = getattr(obj, key, None)
    except Exception:
        val = None
    return default if val is None else val


def _get_frontmatter(note: Any) -> Dict[str, Any]:
    """Return the note's frontmatter dict, or ``{}`` when none is found.

    Looks at ``note.frontmatter`` / ``note["frontmatter"]`` first and then at
    a ``frontmatter`` dict nested under ``meta``.
    """
    fm = _get(note, "frontmatter", None)
    if isinstance(fm, dict):
        return fm
    meta = _get(note, "meta", None)
    if isinstance(meta, dict) and isinstance(meta.get("frontmatter"), dict):
        return meta["frontmatter"]
    return {}


def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> Any:
    """Return ``fm[key]``; missing keys, ``None`` values and non-dicts give ``default``."""
    if not isinstance(fm, dict):
        return default
    val = fm.get(key)
    return default if val is None else val


def _coerce_tags(val: Any) -> List[str]:
    """Normalize a tags value to a list of strings.

    * ``None`` -> ``[]``
    * comma-separated string -> stripped, non-empty parts
    * list / tuple -> each item converted with ``str``
    * set / frozenset -> items converted with ``str`` and sorted, so the
      result is deterministic
    * anything else -> ``[]``
    """
    if val is None:
        return []
    if isinstance(val, str):
        return [part for part in (t.strip() for t in val.split(",")) if part]
    if isinstance(val, (list, tuple)):
        return [str(x) for x in val]
    if isinstance(val, (set, frozenset)):
        return sorted(str(x) for x in val)
    return []


def _coerce_weight(value: Any) -> Optional[float]:
    """Convert ``value`` to ``float``; return ``None`` when that is not possible.

    Booleans are rejected on purpose: ``bool`` is a subclass of ``int``, so a
    stray ``true`` would otherwise silently turn into a weight of ``1.0``.
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _resolve_retriever_weight(fm: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
    """Determine the retriever weight to attach to every chunk.

    Precedence:
      1. ``explicit`` when it is not ``None`` (an unusable explicit value
         yields ``None``; frontmatter is not consulted in that case),
      2. ``fm["retriever_weight"]``,
      3. ``fm["retriever"]["weight"]``.

    Values are coerced with ``float``, so numeric strings such as ``"0.5"``
    are accepted from frontmatter exactly like from the explicit argument.
    """
    if explicit is not None:
        return _coerce_weight(explicit)
    if not isinstance(fm, dict):
        return None
    weight = _coerce_weight(fm.get("retriever_weight"))
    if weight is not None:
        return weight
    nested = fm.get("retriever")
    if isinstance(nested, dict):
        return _coerce_weight(nested.get("weight"))
    return None


def _resolve_path(note: Any, fm: Dict[str, Any], vault_root: Optional[str]) -> Optional[str]:
    """Return the note path as a string, relative to ``vault_root`` if possible.

    The path is taken from frontmatter ``path`` or, failing that, from the
    note's ``path`` / ``source`` / ``filepath``. Returns ``None`` when no path
    is available. If the path does not lie under ``vault_root`` it is
    returned as-is.
    """
    path = _get_from_frontmatter(fm, "path", None)
    if path is None:
        path = _get(note, "path", None) or _get(note, "source", None) or _get(note, "filepath", None)
    if path is None:
        return None
    if vault_root:
        try:
            root = Path(vault_root)
            candidate = Path(path)
        except TypeError:
            # Not path-like; fall back to the plain string form.
            return str(path)
        try:
            return str(candidate.relative_to(root))
        except ValueError:
            # Path is outside the vault root.
            return str(candidate)
    return str(path)


def _resolve_note_fields(note: Any, vault_root: Optional[str]) -> Dict[str, Any]:
    """Collect the canonical note fields that are copied onto each chunk.

    Frontmatter values take precedence over attributes/keys of the note
    itself. The returned dict has the keys ``note_id``, ``title``, ``type``,
    ``tags``, ``path`` and ``frontmatter``.
    """
    fm = _get_frontmatter(note)
    note_id = _get_from_frontmatter(fm, "id", None) or _get(note, "note_id", None) or _get(note, "id", None)
    title = _get_from_frontmatter(fm, "title", None) or _get(note, "title", None)
    ntype = _get_from_frontmatter(fm, "type", None) or _get(note, "type", None)
    tags = _coerce_tags(_get_from_frontmatter(fm, "tags", None) or _get(note, "tags", None))
    path = _resolve_path(note, fm, vault_root)
    return {
        "note_id": note_id,
        "title": title,
        "type": ntype,
        "tags": tags,
        "path": path,
        "frontmatter": fm,
    }


def _coerce_index(raw: Any, fallback: int) -> int:
    """Return ``int(raw)``, or ``fallback`` when ``raw`` is ``None`` or not convertible."""
    if raw is None:
        return fallback
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return fallback


def _extract_chunk_text_and_index(chunk: Any, fallback_index: int) -> Tuple[str, int]:
    """Extract ``(text, index)`` from one chunk.

    Supported shapes:
      * ``(text, idx)`` tuple,
      * plain string (index = ``fallback_index``),
      * dict with a string under ``text`` / ``window`` / ``body`` / ``content``
        and an optional ``index``,
      * object exposing the same names as attributes.

    Raises
    ------
    ValueError
        If no text can be found on the chunk.
    """
    # (text, idx) tuple
    if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str):
        txt, idx = chunk
        return txt, _coerce_index(idx, fallback_index)

    # string
    if isinstance(chunk, str):
        return chunk, fallback_index

    # dict: prefer the first non-empty string; an empty string is still a
    # valid (empty) chunk, matching how str/tuple/object chunks are treated.
    if isinstance(chunk, dict):
        saw_empty = False
        for key in _TEXT_KEYS:
            value = chunk.get(key)
            if isinstance(value, str):
                if value:
                    return value, _coerce_index(chunk.get("index"), fallback_index)
                saw_empty = True
        if saw_empty:
            return "", _coerce_index(chunk.get("index"), fallback_index)

    # object with attributes
    for attr in _TEXT_KEYS:
        try:
            value = getattr(chunk, attr, None)
        except Exception:
            value = None
        if isinstance(value, str):
            try:
                raw_index = getattr(chunk, "index", None)
            except Exception:
                raw_index = None
            return value, _coerce_index(raw_index, fallback_index)

    raise ValueError("Unsupported chunk format: cannot extract text/index")


def make_chunk_payloads(
    note: Any,
    chunks: Optional[Iterable[Any]],
    *,
    retriever_weight: Optional[float] = None,
    base_payload: Optional[Dict[str, Any]] = None,
    vault_root: Optional[str] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """Build Qdrant payloads for chunks from a parsed note and iterable of chunks.

    Parameters
    ----------
    note : Any
        Parsed note (dict or object with attributes).
    chunks : Iterable[Any]
        Chunks; supports str, dicts with 'text'/'window'/'body'/'content',
        objects with same, or (text, idx) tuples. ``None`` yields an empty
        result; a single bare string is treated as one chunk.
    retriever_weight : Optional[float]
        Optional override; if None, value is read from frontmatter.
    base_payload : Optional[Dict[str, Any]]
        Extra fields to copy onto each chunk (``None`` values are skipped;
        note fields override same-named entries).
    vault_root : Optional[str]
        Optional base path to compute relative 'path' if possible.
    **kwargs :
        Ignored extra options to remain compatible with callers.

    Returns
    -------
    List[Dict[str, Any]]
        One payload per chunk containing the common note fields plus
        ``chunk_index`` and ``text``.

    Raises
    ------
    ValueError
        If a chunk has an unsupported format.
    """
    if chunks is None:
        return []
    if isinstance(chunks, str):
        # Iterating a bare string would emit one payload per character.
        chunks = [chunks]

    note_fields = _resolve_note_fields(note, vault_root)
    weight = _resolve_retriever_weight(note_fields["frontmatter"], retriever_weight)
    tags = note_fields.get("tags")

    common: Dict[str, Any] = {}
    if isinstance(base_payload, dict):
        common.update({k: v for k, v in base_payload.items() if v is not None})
    for key in ("note_id", "title", "type"):
        if note_fields.get(key) is not None:
            common[key] = note_fields[key]
    if tags:
        common["tags"] = tags
    if note_fields.get("path") is not None:
        common["path"] = note_fields["path"]
    if weight is not None:
        common["retriever_weight"] = weight

    out: List[Dict[str, Any]] = []
    for position, chunk in enumerate(chunks):
        text, idx = _extract_chunk_text_and_index(chunk, position)
        payload = dict(common)
        if tags:
            # Give every payload its own list so mutating one payload's tags
            # cannot leak into the others.
            payload["tags"] = list(tags)
        payload["chunk_index"] = idx
        payload["text"] = text
        out.append(payload)
    return out