from __future__ import annotations

from typing import Any, Dict, Iterable, List, Optional, Union


# ---- Helpers ----

def _coerce_float(val: Any) -> Optional[float]:
    """Convert *val* to ``float`` on a best-effort basis.

    Accepts ints, floats and numeric strings. Strings are stripped and a
    decimal comma is accepted (``"0,8"`` -> ``0.8``).

    Returns:
        The converted value, or ``None`` when *val* is ``None``, blank, of an
        unsupported type, or not parseable as a number.
    """
    if isinstance(val, str):
        val = val.strip().replace(",", ".")
        if not val:
            return None
    elif not isinstance(val, (int, float)):
        return None
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Best effort: an unparseable weight is treated as "no weight".
        return None


def _extract_weight(frontmatter: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
    """Determine the retriever weight for a note.

    Precedence: the *explicit* argument, then ``frontmatter["retriever_weight"]``,
    then the nested form ``frontmatter["retriever"]["weight"]``. The first
    source that is present wins, even if its value cannot be coerced.

    Returns:
        The weight as ``float``, or ``None`` when no usable weight is found.
    """
    if explicit is not None:
        return _coerce_float(explicit)
    # Guard against non-dict frontmatter the same way the sibling helpers do.
    if not isinstance(frontmatter, dict):
        return None
    if "retriever_weight" in frontmatter:
        return _coerce_float(frontmatter.get("retriever_weight"))
    # Also accept nested style: retriever: { weight: 0.8 }
    retriever = frontmatter.get("retriever")
    if isinstance(retriever, dict) and "weight" in retriever:
        return _coerce_float(retriever.get("weight"))
    return None


def _ensure_list(x: Any) -> List[Any]:
    """Return *x* as a list of items.

    ``None`` becomes ``[]`` and a list is returned unchanged. A single ``str``,
    ``bytes`` or ``dict`` is treated as one item. Any other iterable (tuple,
    generator, set, ...) is materialised into a list; non-iterables are
    wrapped in a one-element list.
    """
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, (str, bytes, dict)):
        # These are iterable, but semantically a single chunk.
        return [x]
    try:
        iterator = iter(x)
    except TypeError:
        return [x]
    return list(iterator)


def _resolve_note_id(frontmatter: Dict[str, Any], kw_note_id: Optional[str]) -> Optional[str]:
    """Return the note id: a truthy *kw_note_id* first, else frontmatter ``id`` / ``note_id``."""
    if kw_note_id:
        return kw_note_id
    if not isinstance(frontmatter, dict):
        return None
    return frontmatter.get("id") or frontmatter.get("note_id")


def _base_fields(frontmatter: Dict[str, Any], note_id: Optional[str], path: str) -> Dict[str, Any]:
    """Build the note-level fields shared by every chunk payload.

    Returns:
        A dict with the keys ``note_id``, ``title``, ``type``, ``tags`` and
        ``path``. An empty *path* is stored as ``None``.
    """
    title = None
    typ = None
    tags = None
    if isinstance(frontmatter, dict):
        title = frontmatter.get("title")
        typ = frontmatter.get("type") or frontmatter.get("note_type")
        # Tags can be list[str] or a comma separated string.
        tags = frontmatter.get("tags")
        if isinstance(tags, str):
            tags = [t.strip() for t in tags.split(",") if t.strip()]
    return {
        "note_id": note_id,
        "title": title,
        "type": typ,
        "tags": tags,
        "path": path or None,
    }


def _normalize_chunk(item: Any, idx: int, base: Dict[str, Any]) -> Dict[str, Any]:
    """Turn one chunk *item* into a payload dict (without the retriever weight)."""
    # Plain string -> treat as text (no window context).
    if isinstance(item, str):
        return {**base, "chunk_index": idx, "text": item, "window": item}

    # Unknown item type -> minimal payload.
    if not isinstance(item, dict):
        return {**base, "chunk_index": idx}

    if "text" in item or "window" in item or "note_id" in item:
        # Heuristic: the dict already is a payload.
        payload = dict(item)  # shallow copy, the caller's dict is not mutated
    elif isinstance(item.get("payload"), dict):
        # The payload is nested under the 'payload' key.
        payload = dict(item["payload"])
    else:
        # Text stored under an alternative key.
        text_val = item.get("text") or item.get("body") or item.get("content") or ""
        return {
            **base,
            "chunk_index": item.get("chunk_index", item.get("index", idx)),
            "text": text_val,
            "window": item.get("window") or text_val,
        }

    # Fill in note-level fields only where the payload does not define them.
    for key, value in base.items():
        payload.setdefault(key, value)
    payload.setdefault("chunk_index", payload.get("index", idx))
    return payload


# ---- Public API ----

def make_chunk_payloads(
    frontmatter: Dict[str, Any],
    *args,
    note_id: Optional[str] = None,
    chunks: Optional[Iterable[Any]] = None,
    path: str = "",
    chunk_profile: Optional[str] = None,
    retriever_weight: Optional[float] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """Build chunk payload dictionaries for Qdrant.

    This function is intentionally permissive to stay compatible with older
    callers. Each chunk item is normalized as follows:

    - a dict containing ``text``, ``window`` or ``note_id`` is copied and
      augmented with the missing note-level fields;
    - a dict with a nested ``payload`` dict has that payload copied and
      augmented;
    - a string becomes a minimal payload with ``text`` and ``window`` set to it;
    - any other dict is read via ``text`` / ``body`` / ``content`` and
      ``chunk_index`` / ``index``;
    - anything else yields a minimal payload with only the base fields and
      ``chunk_index``.

    ``retriever_weight`` is injected into every payload when available (from
    the explicit argument or the frontmatter), overriding a value already
    present in the item.

    Args:
        frontmatter: Note frontmatter; non-dict values are tolerated.
        *args: Accepted for backward compatibility and ignored.
        note_id: Explicit note id; falls back to frontmatter ``id`` / ``note_id``.
        chunks: Chunk items. May be any iterable; a single string or dict is
            treated as one chunk. When ``None``, the first truthy value among
            the legacy keyword arguments ``payloads``, ``pls``, ``items`` and
            ``chunk_items`` is used.
        path: Note path; stored as ``None`` when empty.
        chunk_profile: Accepted for compatibility; not used here.
        retriever_weight: Explicit weight, takes precedence over frontmatter.
        **kwargs: Legacy aliases for *chunks*; other keys are ignored.

    Returns:
        One payload dict per chunk item, in input order.
    """
    # Backward-compat for callers that pass the chunks under a legacy name.
    if chunks is None:
        chunks = (
            kwargs.get("payloads")
            or kwargs.get("pls")
            or kwargs.get("items")
            or kwargs.get("chunk_items")
        )

    weight = _extract_weight(frontmatter, retriever_weight)
    base = _base_fields(frontmatter, _resolve_note_id(frontmatter, note_id), path)

    out: List[Dict[str, Any]] = []
    for idx, item in enumerate(_ensure_list(chunks)):
        payload = _normalize_chunk(item, idx, base)
        if weight is not None:
            payload["retriever_weight"] = weight
        out.append(payload)
    return out