"""
chunk_payload.py — Mindnet core payload builder (v0.5, 2025-11-08)

Purpose
-------
Builds a list of **JSON-serializable** payload dicts for chunks of a note to be
stored in `_chunks`. Ensures `retriever_weight` is set on every chunk.

Public API
----------
make_chunk_payloads(parsed_note, chunks, *args, retriever_weight=None,
                    vault_root=None, type_defaults=None, **kwargs) -> list[dict]
"""

import datetime
import math
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional, Union

Json = Union[None, bool, int, float, str, list, dict]

# Flat spellings under which a retriever weight may be stored.
_WEIGHT_KEYS = ("retriever_weight", "retriever.weight", "retrieverWeight")


# ------------------------- helpers -------------------------

def _is_mapping(x: Any) -> bool:
    """Return True when *x* implements the read-only mapping protocol."""
    return isinstance(x, Mapping)


def _get(obj: Any, *names: str, default: Any = None) -> Any:
    """Return the first non-None value stored under any of *names*.

    For mappings the key is consulted first, so keys can never be shadowed by
    same-named methods of the container; otherwise the attribute is read.
    A hit whose value is None counts as missing, which lets later aliases and
    *default* (typically the frontmatter value) take over.
    """
    mapping = _is_mapping(obj)
    for name in names:
        try:
            if mapping and name in obj:
                found = obj[name]
            else:
                found = getattr(obj, name, None)
        except Exception:
            # Deliberately tolerant: a misbehaving property or __getitem__ on
            # a foreign object must not abort payload construction.
            found = None
        if found is not None:
            return found
    return default


def _to_float(x: Any, default: float = 1.0) -> float:
    """Coerce *x* to a finite float, returning *default* when that fails.

    Strings may use a decimal comma ("0,5"). NaN, +/-inf, values too large for
    a float and unparseable input all yield *default*.
    """
    fallback = float(default)
    if x is None:
        return fallback
    try:
        if isinstance(x, (int, float)):
            number = float(x)
        else:
            number = float(str(x).strip().replace(",", "."))
    except Exception:
        # Best-effort coercion: bad text, int overflow or a failing __str__.
        return fallback
    return number if math.isfinite(number) else fallback


def _to_index(value: Any, fallback: int) -> int:
    """Return *value* as an int when it is a finite number, else *fallback*."""
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            pass
    return fallback


def _ensure_list(x: Any) -> list:
    """Wrap *x* in a list: None -> [], set/frozenset/tuple -> list, scalar -> [x]."""
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, (set, frozenset, tuple)):
        return list(x)
    return [x]


def _sanitize(obj: Any) -> Json:
    """Recursively convert *obj* into JSON-serializable primitives.

    - None, bool, int and str pass through; non-finite floats become None.
    - Callables become None (and are dropped entirely as mapping values).
    - Lists, tuples and sets become lists; mappings become dicts with str keys.
    - Paths become strings, date/time objects their ISO-8601 form.
    - Anything else is stringified; None if even that fails.
    """
    if obj is None or isinstance(obj, (bool, int, str)):
        return obj
    if isinstance(obj, float):
        # NaN / Infinity have no JSON representation.
        return obj if math.isfinite(obj) else None
    if callable(obj):
        return None
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [_sanitize(item) for item in obj]
    if isinstance(obj, Mapping):
        return {
            str(key): _sanitize(value)
            for key, value in obj.items()
            if not callable(value)
        }
    if isinstance(obj, Path):
        return str(obj)
    if isinstance(obj, (datetime.date, datetime.time)):
        return obj.isoformat()
    try:
        return str(obj)
    except Exception:
        return None


def _lookup_weight(source: Any) -> Any:
    """Return the raw retriever weight stored in *source*, or None.

    Accepts the flat keys in ``_WEIGHT_KEYS`` as well as the nested form
    ``{"retriever": {"weight": ...}}``. Keys whose value is None are skipped.
    """
    if not _is_mapping(source):
        return None
    for key in _WEIGHT_KEYS:
        value = source.get(key)
        if value is not None:
            return value
    nested = source.get("retriever")
    if _is_mapping(nested):
        return nested.get("weight")
    return None


def _compute_retriever_weight(explicit: Any,
                              frontmatter: dict,
                              type_defaults: Optional[dict],
                              note_type: Optional[str]) -> float:
    """Resolve the effective retriever weight as a finite float.

    Precedence: *explicit* > frontmatter > type_defaults[note_type] > 1.0.
    """
    if explicit is not None:
        return _to_float(explicit, 1.0)
    candidate = _lookup_weight(frontmatter)
    if candidate is None and _is_mapping(type_defaults) and note_type:
        try:
            per_type = type_defaults.get(note_type)
        except TypeError:
            # Unhashable note type (e.g. a list in the frontmatter).
            per_type = None
        candidate = _lookup_weight(per_type)
    return _to_float(candidate, 1.0)


def _relative_path(raw_path: Any, vault_root: Any) -> Any:
    """Return *raw_path* relative to *vault_root* when it lies inside it.

    The comparison is purely lexical (no filesystem access). Paths outside the
    vault, or inputs that are not path-like, are returned unchanged.
    """
    if not raw_path or not vault_root:
        return raw_path
    try:
        relative = str(Path(raw_path).relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        return raw_path
    # relative_to() yields "." when both paths are equal; keep the raw path.
    return raw_path if relative in ("", ".") else relative


# ------------------------- public API -------------------------

def make_chunk_payloads(parsed_note: Any,
                        chunks: List[Any],
                        *args,
                        retriever_weight: Optional[float] = None,
                        vault_root: Optional[str] = None,
                        type_defaults: Optional[dict] = None,
                        **kwargs) -> List[Dict[str, Json]]:
    """
    Build JSON-safe payloads for all chunks in a note.

    Parameters
    ----------
    parsed_note : object or dict
        Read best-effort for: frontmatter/fm, note_id/id, title, type,
        path/rel_path/relpath, chunk_profile/profile, tags. Missing values
        fall back to the same-named frontmatter entry.
    chunks : iterable of objects or dicts (None is treated as empty)
        Read best-effort per chunk: text/content/body/value, index/idx,
        start/start_char/offset_start/char_start,
        end/end_char/offset_end/char_end, n_tokens/tokens/token_count,
        section/section_title/heading, section_level/heading_level.
    retriever_weight : float|None
        Overrides frontmatter and type defaults when given.
    vault_root : str|None
        When given, `path` is made relative to it if it lies inside it.
    type_defaults : dict|None
        Per-type defaults, e.g. {"concept": {"retriever_weight": 0.9}}.
    *args, **kwargs
        Accepted and ignored.

    Returns
    -------
    list[dict] with the keys note_id, title, type, path, chunk_index, text,
    start, end, tokens, section, section_level, chunk_profile, tags and
    retriever_weight, one dict per chunk.
    """
    if chunks is None:
        return []

    fm = _get(parsed_note, "frontmatter", "fm", default={})
    fm = dict(fm) if _is_mapping(fm) else {}

    note_id = _get(parsed_note, "note_id", "id", default=fm.get("id"))
    title = _get(parsed_note, "title", default=fm.get("title"))
    ntype = _get(parsed_note, "type", default=fm.get("type"))
    raw_path = _get(parsed_note, "path", "rel_path", "relpath", default=fm.get("path"))
    chunk_profile = _get(parsed_note, "chunk_profile", "profile",
                         default=fm.get("chunk_profile"))
    tags = _ensure_list(_get(parsed_note, "tags", default=fm.get("tags")))

    # Note-level values are resolved once and repeated on every chunk.
    path = _relative_path(raw_path, vault_root) or raw_path
    weight = _compute_retriever_weight(retriever_weight, fm, type_defaults, ntype)

    out: List[Dict[str, Json]] = []
    for i, ch in enumerate(chunks):
        # tolerate missing/variant fields
        idx = _get(ch, "index", "idx", default=i)
        payload = {
            "note_id": note_id,
            "title": title,
            "type": ntype,
            "path": path,
            "chunk_index": _to_index(idx, i),
            "text": _get(ch, "text", "content", "body", "value", default=""),
            "start": _get(ch, "start", "start_char", "offset_start", "char_start"),
            "end": _get(ch, "end", "end_char", "offset_end", "char_end"),
            "tokens": _get(ch, "n_tokens", "tokens", "token_count"),
            "section": _get(ch, "section", "section_title", "heading"),
            "section_level": _get(ch, "section_level", "heading_level"),
            "chunk_profile": chunk_profile,
            "tags": tags,
            "retriever_weight": weight,
        }
        # _sanitize copies containers, so payloads never share the tags list.
        out.append(_sanitize(payload))

    return out
"""
note_payload.py — Mindnet core payload builder (v0.5, 2025-11-08)

Purpose
-------
Builds a **JSON-serializable** payload dict for a single note to be stored in
the `_notes` collection. The function is defensive against both
attribute- and dict-like ParsedNote inputs, unknown kwargs, and ensures
`retriever_weight` is always present as a float.

Key guarantees
--------------
- Accepts extra positional/keyword args without error (for importer compatibility).
- Tolerant of attribute vs dict access for ParsedNote.
- Always sets 'retriever_weight' in the payload (finite float).
- Never includes non-serializable objects (functions, PosixPath, datetime, etc.).

Public API
----------
make_note_payload(parsed_note, *args, retriever_weight=None, vault_root=None,
                  type_defaults=None, **kwargs) -> dict
"""

import datetime
import math
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping, Optional, Union

Json = Union[None, bool, int, float, str, list, dict]

# Flat spellings under which a retriever weight may be stored.
_WEIGHT_KEYS = ("retriever_weight", "retriever.weight", "retrieverWeight")

# Frontmatter fields copied verbatim into the payload when present.
_PASSTHROUGH_KEYS = ("status", "priority", "owner", "source")


# ------------------------- helpers -------------------------

def _is_mapping(x: Any) -> bool:
    """Return True when *x* implements the read-only mapping protocol."""
    return isinstance(x, Mapping)


def _get(obj: Any, *names: str, default: Any = None) -> Any:
    """Return the first non-None value stored under any of *names*.

    For mappings the key is consulted first, so keys can never be shadowed by
    same-named methods of the container; otherwise the attribute is read.
    A hit whose value is None counts as missing, which lets later aliases and
    *default* (typically the frontmatter value) take over.
    """
    mapping = _is_mapping(obj)
    for name in names:
        try:
            if mapping and name in obj:
                found = obj[name]
            else:
                found = getattr(obj, name, None)
        except Exception:
            # Deliberately tolerant: a misbehaving property or __getitem__ on
            # a foreign object must not abort payload construction.
            found = None
        if found is not None:
            return found
    return default


def _to_float(x: Any, default: float = 1.0) -> float:
    """Coerce *x* to a finite float, returning *default* when that fails.

    Strings may use a decimal comma ("0,5"). NaN, +/-inf, values too large for
    a float and unparseable input all yield *default*.
    """
    fallback = float(default)
    if x is None:
        return fallback
    try:
        if isinstance(x, (int, float)):
            number = float(x)
        else:
            number = float(str(x).strip().replace(",", "."))
    except Exception:
        # Best-effort coercion: bad text, int overflow or a failing __str__.
        return fallback
    return number if math.isfinite(number) else fallback


def _ensure_list(x: Any) -> list:
    """Wrap *x* in a list: None -> [], set/frozenset/tuple -> list, scalar -> [x]."""
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, (set, frozenset, tuple)):
        return list(x)
    return [x]


def _sanitize(obj: Any) -> Json:
    """Recursively convert *obj* into JSON-serializable primitives.

    - None, bool, int and str pass through; non-finite floats become None.
    - Callables become None (and are dropped entirely as mapping values).
    - Lists, tuples and sets become lists; mappings become dicts with str keys.
    - Paths become strings, date/time objects their ISO-8601 form.
    - Anything else is stringified; None if even that fails.
    """
    if obj is None or isinstance(obj, (bool, int, str)):
        return obj
    if isinstance(obj, float):
        # NaN / Infinity have no JSON representation.
        return obj if math.isfinite(obj) else None
    if callable(obj):
        return None
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [_sanitize(item) for item in obj]
    if isinstance(obj, Mapping):
        return {
            str(key): _sanitize(value)
            for key, value in obj.items()
            if not callable(value)
        }
    if isinstance(obj, Path):
        return str(obj)
    if isinstance(obj, (datetime.date, datetime.time)):
        return obj.isoformat()
    try:
        return str(obj)
    except Exception:
        return None


def _lookup_weight(source: Any) -> Any:
    """Return the raw retriever weight stored in *source*, or None.

    Accepts the flat keys in ``_WEIGHT_KEYS`` as well as the nested form
    ``{"retriever": {"weight": ...}}``. Keys whose value is None are skipped.
    """
    if not _is_mapping(source):
        return None
    for key in _WEIGHT_KEYS:
        value = source.get(key)
        if value is not None:
            return value
    nested = source.get("retriever")
    if _is_mapping(nested):
        return nested.get("weight")
    return None


def _compute_retriever_weight(explicit: Any,
                              frontmatter: dict,
                              type_defaults: Optional[dict],
                              note_type: Optional[str]) -> float:
    """Resolve the effective retriever weight as a finite float.

    Precedence: *explicit* > frontmatter > type_defaults[note_type] > 1.0,
    where type_defaults looks like {"concept": {"retriever_weight": 0.9}, ...}.
    """
    if explicit is not None:
        return _to_float(explicit, 1.0)
    candidate = _lookup_weight(frontmatter)
    if candidate is None and _is_mapping(type_defaults) and note_type:
        try:
            per_type = type_defaults.get(note_type)
        except TypeError:
            # Unhashable note type (e.g. a list in the frontmatter).
            per_type = None
        candidate = _lookup_weight(per_type)
    return _to_float(candidate, 1.0)


def _relative_path(raw_path: Any, vault_root: Any) -> Any:
    """Return *raw_path* relative to *vault_root* when it lies inside it.

    The comparison is purely lexical (no filesystem access). Paths outside the
    vault, or inputs that are not path-like, are returned unchanged.
    """
    if not raw_path or not vault_root:
        return raw_path
    try:
        relative = str(Path(raw_path).relative_to(Path(vault_root)))
    except (TypeError, ValueError):
        return raw_path
    # relative_to() yields "." when both paths are equal; keep the raw path.
    return raw_path if relative in ("", ".") else relative


# ------------------------- public API -------------------------

def make_note_payload(parsed_note: Any,
                      *args,
                      retriever_weight: Optional[float] = None,
                      vault_root: Optional[str] = None,
                      type_defaults: Optional[dict] = None,
                      **kwargs) -> Dict[str, Json]:
    """
    Build a JSON-safe payload for the note.

    Parameters (tolerant; unknown args are ignored)
    ----------
    parsed_note : object or dict
        Read best-effort for: note_id/id, title, type, path/rel_path/relpath,
        frontmatter/fm, tags, aliases, chunk_profile/profile, created,
        updated. Missing values fall back to the same-named frontmatter entry.
    retriever_weight : float|None
        Overrides frontmatter/type-defaults if provided.
    vault_root : str|None
        Optional; when given, `path` is made relative to it if it lies inside.
    type_defaults : dict|None
        Optional map for per-type defaults.

    Returns
    -------
    dict with the keys note_id, title, type, path, tags, aliases,
    chunk_profile, created, updated and retriever_weight, plus any of
    status/priority/owner/source that the frontmatter defines.
    """
    fm = _get(parsed_note, "frontmatter", "fm", default={})
    fm = dict(fm) if _is_mapping(fm) else {}

    raw_path = _get(parsed_note, "path", "rel_path", "relpath", default=fm.get("path"))
    ntype = _get(parsed_note, "type", default=fm.get("type"))

    payload = {
        "note_id": _get(parsed_note, "note_id", "id", default=fm.get("id")),
        "title": _get(parsed_note, "title", default=fm.get("title")),
        "type": ntype,
        # normalize path relative to vault root if both available
        "path": _relative_path(raw_path, vault_root) or raw_path,
        "tags": _ensure_list(_get(parsed_note, "tags", default=fm.get("tags"))),
        "aliases": _ensure_list(_get(parsed_note, "aliases", default=fm.get("aliases"))),
        "chunk_profile": _get(parsed_note, "chunk_profile", "profile",
                              default=fm.get("chunk_profile")),
        "created": _get(parsed_note, "created", default=fm.get("created")),
        "updated": _get(parsed_note, "updated", default=fm.get("updated")),
        "retriever_weight": _compute_retriever_weight(
            retriever_weight, fm, type_defaults, ntype
        ),
    }

    # Add selected FM fields if present (safe subset)
    for key in _PASSTHROUGH_KEYS:
        if key in fm:
            payload[key] = fm.get(key)

    return _sanitize(payload)