# app/core/chunk_payload.py
"""
chunk_payload.py — mindnet core payload builders
Version: 1.3.1 (2025-11-08)

Purpose
-------
Build robust chunk payloads for Qdrant upserts.
This function is intentionally flexible about its signature to remain
compatible with different callers.

Contract
--------
make_chunk_payloads(note, chunks, *args, **kwargs) -> List[Dict[str, Any]]

Each returned item contains at least:
- note_id (str)
- title (str)
- type (str)
- path (str or None)
- tags (List[str])
- chunk_index (int)
- text (str)
- retriever_weight (float or None)  # if available
"""
from __future__ import annotations

import math
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Union

# Keys/attributes probed, in priority order, when extracting a chunk's text.
_TEXT_KEYS = ("text", "chunk", "body", "content")


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Read `key` from a mapping or as an attribute; return `default` if absent."""
    if obj is None:
        return default
    if isinstance(obj, Mapping):
        return obj.get(key, default)
    return getattr(obj, key, default)


def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's `frontmatter` mapping, or an empty dict if unusable."""
    fm = _get(note, "frontmatter", {})
    if isinstance(fm, Mapping):
        return fm
    return {}


def _to_float(value: Any) -> Optional[float]:
    """Convert `value` to a finite float; return None when that is impossible."""
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN/inf cannot be represented in a JSON payload, so treat them as unset.
    return number if math.isfinite(number) else None


def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """
    Resolve the retriever weight; the first usable candidate wins.

    Priority:
      1) explicit kwarg retriever_weight
      2) frontmatter['retriever_weight']
      3) frontmatter['retriever']['weight']

    A candidate that is None, not convertible to float, or not finite is
    skipped, so an empty `retriever_weight:` key no longer hides a valid
    nested `retriever.weight`.
    """
    candidates = [explicit, fm.get("retriever_weight")]
    nested = fm.get("retriever")
    if isinstance(nested, Mapping):
        candidates.append(nested.get("weight"))

    for candidate in candidates:
        weight = _to_float(candidate)
        if weight is not None:
            return weight
    return None


def _to_rel_path(
    abs_path: Optional[Union[str, Path]],
    vault_root: Optional[Union[str, Path]],
) -> Optional[str]:
    """
    Return `abs_path` relative to `vault_root` when possible.

    Falls back to the path unchanged when no root is given or the path does
    not live under the root. Returns None only when `abs_path` is None.
    """
    if abs_path is None:
        return None
    try:
        path = Path(abs_path)
    except TypeError:
        # Not path-like at all; keep whatever string form it has.
        return str(abs_path)
    if vault_root:
        try:
            return str(path.relative_to(Path(vault_root)))
        except (TypeError, ValueError):
            # ValueError: path is outside the vault; TypeError: bad root type.
            pass
    return str(path)


def _coerce_tags(value: Any) -> List[str]:
    """
    Normalize a tags value to a list of strings.

    - None / empty          -> []
    - "a, b" (string)       -> ["a", "b"]  (never split into characters)
    - list/tuple/set/...    -> each non-None element stringified
    - non-iterable scalar   -> [str(value)]
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [part.strip() for part in value.split(",") if part.strip()]
    try:
        items = list(value)
    except TypeError:
        return [str(value)]
    return [str(item) for item in items if item is not None]


def _coerce_chunks(chunks_obj: Any) -> List[Any]:
    """Accept lists of dicts/objects or generators; coerce to list safely."""
    if chunks_obj is None:
        return []
    if isinstance(chunks_obj, list):
        return chunks_obj
    # A lone string/bytes/mapping is ONE chunk; list() would explode it into
    # characters, byte values, or keys respectively.
    if isinstance(chunks_obj, (str, bytes, Mapping)):
        return [chunks_obj]
    try:
        return list(chunks_obj)
    except Exception:
        # Deliberate best-effort: a non-iterable or failing iterable yields
        # no chunks rather than aborting the caller.
        return []


def _get_chunk_text(c: Any) -> str:
    """
    Extract the text of a single chunk.

    Supports plain strings, bytes (decoded as UTF-8), `(text, idx)` tuples,
    and mappings/objects exposing one of `_TEXT_KEYS`. A mapping without any
    text yields "" instead of its repr.
    """
    if c is None:
        return ""
    if isinstance(c, str):
        return c
    if isinstance(c, bytes):
        return c.decode("utf-8", errors="replace")
    if isinstance(c, tuple) and c and isinstance(c[0], str):
        return c[0]
    for key in _TEXT_KEYS:
        value = _get(c, key)
        if isinstance(value, str) and value.strip():
            return value
    if isinstance(c, Mapping):
        return ""
    # Last resort for opaque objects: rely on their own string form.
    return str(c)


def make_chunk_payloads(
    *args: Any,
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Build one payload dict per chunk of a note.

    Flexible signature for backward/forward compatibility.

    Expected positional args:
      args[0] -> note (ParsedNote or Mapping); may also be passed as `note=`
      args[1] -> chunks (Iterable); may also be passed as `chunks=`
      args[2:] -> (optional) config/ignored

    Recognized kwargs:
      - vault_root: base path for relative paths (optional)
      - retriever_weight: explicit override (optional)
    Unknown kwargs are ignored.

    Returns a list of dicts with the note-level fields (note_id, title, type,
    tags, path, retriever_weight) plus `chunk_index` (position in `chunks`)
    and `text`. Missing chunks produce an empty list.

    Raises:
      TypeError: if no note is supplied positionally or by keyword.
    """
    if args:
        note = args[0]
    elif "note" in kwargs:
        note = kwargs["note"]
    else:
        raise TypeError("make_chunk_payloads(note, chunks, *_) requires at least (note, chunks).")

    chunks = args[1] if len(args) > 1 else kwargs.get("chunks")
    chunks_list = _coerce_chunks(chunks)

    vault_root = kwargs.get("vault_root")
    explicit_weight = kwargs.get("retriever_weight")

    fm = _get_frontmatter(note)

    # Note attributes take precedence over frontmatter values.
    note_id = _get(note, "note_id") or _get(note, "id") or fm.get("id")
    title = _get(note, "title") or fm.get("title")
    ntype = _get(note, "type") or fm.get("type")
    tags = _coerce_tags(_get(note, "tags") or fm.get("tags"))
    path_val = _get(note, "path") or _get(note, "abs_path") or fm.get("path")

    base: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": ntype,
        "tags": tags,
        "path": _to_rel_path(path_val, vault_root),
        "retriever_weight": _resolve_retriever_weight(explicit_weight, fm),
    }

    payloads: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_list):
        item = dict(base)
        # Give every payload its own tags list so mutating one payload
        # cannot leak into its siblings.
        item["tags"] = list(tags)
        item["chunk_index"] = idx
        item["text"] = _get_chunk_text(ch)
        payloads.append(item)

    return payloads
# app/core/note_payload.py
"""
note_payload.py — mindnet core payload builders
Version: 1.3.1 (2025-11-08)

Purpose
-------
Build a robust, forward-compatible note payload for Qdrant upserts.
This module is intentionally defensive:
- Accepts both dict-like "parsed note" objects and dataclass/objects with attributes.
- Tolerates extra kwargs from different callers (e.g., `vault_root`, `prefix`, etc.).
- Ensures `retriever_weight` is resolved and present in the payload if available.

Contract
--------
make_note_payload(note, **kwargs) -> Dict[str, Any]

Expected minimal fields in returned payload:
- note_id (str)
- title (str)
- type (str)
- path (str or None)  # relative to vault_root when provided
- tags (List[str])
- retriever_weight (float or None)  # if available
"""
from __future__ import annotations

import math
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, Optional, Union


def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Try to read `key` from mapping or attribute; else default."""
    if obj is None:
        return default
    if isinstance(obj, Mapping):
        return obj.get(key, default)
    # attribute access
    return getattr(obj, key, default)


def _get_frontmatter(note: Any) -> Mapping[str, Any]:
    """Return the note's `frontmatter` mapping, or an empty dict if unusable."""
    fm = _get(note, "frontmatter", {})
    if isinstance(fm, Mapping):
        return fm
    return {}  # be safe


def _to_float(value: Any) -> Optional[float]:
    """Convert `value` to a finite float; return None when that is impossible."""
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    # NaN/inf cannot be represented in a JSON payload, so treat them as unset.
    return number if math.isfinite(number) else None


def _resolve_retriever_weight(explicit: Any, fm: Mapping[str, Any]) -> Optional[float]:
    """
    Resolve the retriever weight; the first usable candidate wins.

    Priority:
      1) explicit kwarg retriever_weight
      2) frontmatter['retriever_weight']
      3) frontmatter['retriever']['weight']

    A candidate that is None, not convertible to float, or not finite is
    skipped, so an empty or malformed `retriever_weight:` key no longer hides
    a valid nested `retriever.weight`.
    """
    candidates = [explicit, fm.get("retriever_weight")]
    nested = fm.get("retriever")
    if isinstance(nested, Mapping):
        candidates.append(nested.get("weight"))

    for candidate in candidates:
        weight = _to_float(candidate)
        if weight is not None:
            return weight
    return None


def _to_rel_path(
    abs_path: Optional[Union[str, Path]],
    vault_root: Optional[Union[str, Path]],
) -> Optional[str]:
    """
    Return `abs_path` relative to `vault_root` when possible.

    Falls back to the path unchanged when no root is given or the path does
    not live under the root. Returns None only when `abs_path` is None.
    """
    if abs_path is None:
        return None
    try:
        path = Path(abs_path)
    except TypeError:
        # Not path-like at all; keep whatever string form it has.
        return str(abs_path)
    if vault_root:
        try:
            return str(path.relative_to(Path(vault_root)))
        except (TypeError, ValueError):
            # ValueError: path is outside the vault; TypeError: bad root type.
            pass
    return str(path)


def _coerce_tags(value: Any) -> List[str]:
    """
    Normalize a tags value to a list of strings.

    - None / empty          -> []
    - "a, b" (string)       -> ["a", "b"]  (never split into characters)
    - list/tuple/set/...    -> each non-None element stringified
    - non-iterable scalar   -> [str(value)]
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [part.strip() for part in value.split(",") if part.strip()]
    try:
        items = list(value)
    except TypeError:
        return [str(value)]
    return [str(item) for item in items if item is not None]


def make_note_payload(
    note: Any,
    *args,  # tolerate older/other callers
    **kwargs: Any,
) -> Dict[str, Any]:
    """
    Build a normalized note payload for Qdrant.
    Unknown kwargs are ignored to keep the function forward-compatible.

    Recognized kwargs:
      - vault_root: base path to make `path` relative (optional)
      - retriever_weight: explicit override (optional)

    Returns a dict with note_id, title, type, tags, path and
    retriever_weight, plus the frontmatter fields `status`, `created` and
    `updated` when they are set. Note attributes take precedence over
    frontmatter values.
    """
    vault_root = kwargs.get("vault_root")
    explicit_weight = kwargs.get("retriever_weight")

    fm = _get_frontmatter(note)

    note_id = _get(note, "note_id") or _get(note, "id") or fm.get("id")
    title = _get(note, "title") or fm.get("title")
    ntype = _get(note, "type") or fm.get("type")
    tags = _coerce_tags(_get(note, "tags") or fm.get("tags"))
    path_val = _get(note, "path") or _get(note, "abs_path") or fm.get("path")

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": ntype,
        "tags": tags,
        "path": _to_rel_path(path_val, vault_root),
        "retriever_weight": _resolve_retriever_weight(explicit_weight, fm),
    }

    # Also surface explicit frontmatter fields if present. setdefault keeps
    # the canonical fields above authoritative should the key list ever grow
    # to include one of them.
    for key in ("status", "created", "updated"):
        value = fm.get(key)
        if value is not None:
            payload.setdefault(key, value)

    return payload