diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index fdeb80a..e5b04d3 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,144 +1,180 @@ """ -chunk_payload.py — Mindnet payload helpers -Version: 0.5.2 (generated 2025-11-08 21:03:48) -Purpose: - - Build CHUNK payloads list while preserving existing chunk fields (text, seq, etc.). - - Inject into *every* chunk: - * retriever_weight (resolved like note payload) - * chunk_profile (resolved like note payload) -Resolution order identical to note_payload.make_note_payload. -Signature tolerant to match existing importers. +chunk_payload.py — Mindnet payload builder (Chunks) +Version: 1.3.0 (2025-11-09) + +Purpose +------- +Build Qdrant-compatible JSON payloads for *chunks* of a parsed note. +Tolerant to different call signatures and accepts both dict-like and object-like inputs. + +Key features +------------ +- Reads type defaults from `config/config.yaml` or `config/types.yaml` (same schema). +- Resolves fields with precedence: + Frontmatter > type-defaults > ENV > fallback. +- Sets per chunk: + * `note_id`, `note_title`, `type` + * `retriever_weight` (float) + * `chunk_profile` (short|medium|long) + * `text` (never empty: falls back to whole note body/text) + * `order`, `section`, `start`, `end` (if available) +- Backwards-compatible signature: accepts **kwargs to swallow unknown args. + +Input +----- +`parsed_note` may be: + - dict with keys: id, title, body/text, chunks(list), frontmatter(dict), type + - object with equivalent attributes + +Each chunk may be dict-like or object-like with keys/attrs such as: + id, text, order, section, start, end """ from __future__ import annotations -from typing import Any, Dict, List, Optional, Union -from pathlib import Path + import os +from pathlib import Path +from typing import Any, Dict, List, Optional, Union try: import yaml # type: ignore except Exception: # pragma: no cover - yaml = None # will skip YAML loading if unavailable + yaml = None - -def _coerce_mapping(obj: Any) -> Dict[str, Any]: - if obj is None: - return {{}} +def _get(obj: Any, key: str, default: Any = None) -> Any: if isinstance(obj, dict): - return dict(obj) - out: Dict[str, Any] = {{}} - if hasattr(obj, "__dict__"): - out.update(getattr(obj, "__dict__")) - for k in ("id","note_id","title","type","path","source_path","frontmatter"): - if hasattr(obj, k) and k not in out: - out[k] = getattr(obj, k) - return out + return obj.get(key, default) + return getattr(obj, key, default) +def _frontmatter(obj: Any) -> Dict[str, Any]: + fm = _get(obj, "frontmatter", {}) or {} + return fm if isinstance(fm, dict) else {} -def _coerce_chunk_dict(obj: Any) -> Dict[str, Any]: - if isinstance(obj, dict): - return dict(obj) - d = {{}} - # common attributes for a chunk object - for k in ("chunk_id","id","note_id","seq","start","end","text","title","type","source_path"): - if hasattr(obj, k): - d[k] = getattr(obj, k) - if hasattr(obj, "__dict__"): - for k,v in obj.__dict__.items(): - d.setdefault(k, v) - return d - - -def _get_frontmatter(parsed: Dict[str, Any]) -> Dict[str, Any]: - fm = parsed.get("frontmatter") - return dict(fm) if isinstance(fm, dict) else {{}} - - -def _load_types_from_yaml(types_file: Optional[Union[str, Path]]) -> Dict[str, Any]: - if types_file is None: - for cand in (Path("config/types.yaml"), Path("config/types.yml"), Path("config.yaml"), Path("config.yml")): - if cand.exists(): - types_file = cand - break - if types_file is None or yaml is None: - return {{}} - p = Path(types_file) - if not p.exists(): - return {{}} +def _coerce_float(val: Any, default: float) -> float: try: - data = yaml.safe_load(p.read_text(encoding="utf-8")) - if not isinstance(data, dict): - return {{}} - if "types" in data and isinstance(data["types"], dict): - return dict(data["types"]) - return data + if val is None: + return default + if isinstance(val, (int, float)): + return float(val) + if isinstance(val, str) and val.strip(): + return float(val.strip()) except Exception: - return {{}} + pass + return default - -def _resolve_type_defaults(note_type: Optional[str], types: Optional[Dict[str,Any]]) -> Dict[str, Any]: - if not note_type or not types or not isinstance(types, dict): - return {{}} - block = types.get(note_type) - return dict(block) if isinstance(block, dict) else {{}} - - -def _to_float(val: Any, fallback: float) -> float: - if val is None: - return fallback - try: - return float(val) - except Exception: +def _normalize_chunk_profile(val: Any, fallback: str = "medium") -> str: + if not isinstance(val, str): return fallback + v = val.strip().lower() + if v in {"short", "medium", "long"}: + return v + return fallback +def _safe_text(s: Any) -> str: + if s is None: + return "" + if isinstance(s, str): + return s + return str(s) -def _first_nonempty(*vals): - for v in vals: - if v is not None: - if isinstance(v, str) and v.strip() == "": - continue - return v - return None +def _load_types_config( + explicit_config: Optional[Dict[str, Any]] = None, + search_root: Union[str, Path, None] = None, +) -> Dict[str, Any]: + if explicit_config and isinstance(explicit_config, dict): + if "types" in explicit_config and isinstance(explicit_config["types"], dict): + return explicit_config + if yaml is None: + return {"types": {}} + candidates: List[Path] = [] + root = Path(search_root) if search_root else Path.cwd() + candidates.append(root / "config" / "config.yaml") + candidates.append(root / "config" / "types.yaml") + candidates.append(Path.cwd() / "config" / "config.yaml") + candidates.append(Path.cwd() / "config" / "types.yaml") + for p in candidates: + try: + if p.exists(): + import yaml as _y + with p.open("r", encoding="utf-8") as f: + loaded = _y.safe_load(f) or {} + if isinstance(loaded, dict) and isinstance(loaded.get("types"), dict): + return {"types": loaded["types"]} + except Exception: + continue + return {"types": {}} +def _type_defaults(note_type: str, cfg: Dict[str, Any]) -> Dict[str, Any]: + return (cfg.get("types") or {}).get(note_type, {}) if isinstance(cfg, dict) else {} -def make_chunk_payloads(parsed_note: Any, chunks: List[Any], **kwargs) -> List[Dict[str, Any]]: - parsed = _coerce_mapping(parsed_note) - fm = _get_frontmatter(parsed) +def make_chunk_payloads( + parsed_note: Any, + config: Optional[Dict[str, Any]] = None, + **kwargs: Any, +) -> List[Dict[str, Any]]: + search_root = kwargs.get("search_root") + fm = _frontmatter(parsed_note) + note_type = fm.get("type") or _get(parsed_note, "type") or "concept" + note_type = str(note_type).strip().lower() - # external sources - types_registry = kwargs.get("types") or kwargs.get("types_registry") - types_from_yaml = _load_types_from_yaml(kwargs.get("types_file")) - types_all: Dict[str, Any] = types_registry if isinstance(types_registry, dict) else types_from_yaml + cfg = _load_types_config(config, search_root) + defaults = _type_defaults(note_type, cfg) - note_type: Optional[str] = _first_nonempty(parsed.get("type"), fm.get("type")) - type_defaults = _resolve_type_defaults(note_type, types_all) + # Resolve retriever_weight: FM > type-defaults > ENV > 1.0 + rw = fm.get("retriever_weight") + if rw is None: + rw = defaults.get("retriever_weight") + if rw is None: + env_rw = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") + rw = _coerce_float(env_rw, 1.0) + else: + rw = _coerce_float(rw, 1.0) - env_default = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") - env_default_val = _to_float(env_default, 1.0) if env_default is not None else 1.0 + # Resolve chunk_profile: FM > type-defaults > ENV > medium + cp = fm.get("chunk_profile") + if cp is None: + cp = defaults.get("chunk_profile") + if cp is None: + cp = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE", "medium") + cp = _normalize_chunk_profile(cp, "medium") - effective_retriever_weight = _to_float( - _first_nonempty( - fm.get("retriever_weight"), - type_defaults.get("retriever_weight"), - env_default_val, - 1.0, - ), - 1.0, - ) + note_id = _get(parsed_note, "id") + note_title = _get(parsed_note, "title") + body = _get(parsed_note, "body") or _get(parsed_note, "text") or "" - effective_chunk_profile = _first_nonempty( - fm.get("chunk_profile"), - fm.get("profile"), - type_defaults.get("chunk_profile"), - os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE"), - ) + items = _get(parsed_note, "chunks") or [] + payloads: List[Dict[str, Any]] = [] - out: List[Dict[str, Any]] = [] - for ch in chunks or []: - payload = _coerce_chunk_dict(ch) # preserve all existing chunk fields - payload["retriever_weight"] = effective_retriever_weight - if effective_chunk_profile is not None: - payload["chunk_profile"] = effective_chunk_profile - out.append(payload) - return out + if not items: + items = [{ + "id": f"{note_id}::0" if note_id else None, + "text": body, + "order": 0, + "section": None, + "start": 0, + "end": len(body) if isinstance(body, str) else None, + }] + + for ch in items: + text = _safe_text(_get(ch, "text")) + if not text: + text = _safe_text(body) + + payload = { + "note_id": note_id, + "note_title": note_title, + "type": note_type, + "retriever_weight": float(rw), + "chunk_profile": cp, + "text": text, + "order": _get(ch, "order"), + "section": _get(ch, "section"), + "start": _get(ch, "start"), + "end": _get(ch, "end"), + "chunk_id": _get(ch, "id"), + } + payload = {k: v for k, v in payload.items() if v is not None} + payloads.append(payload) + + return payloads diff --git a/app/core/note_payload.py b/app/core/note_payload.py index 3eab9c5..33a0eb2 100644 --- a/app/core/note_payload.py +++ b/app/core/note_payload.py @@ -1,201 +1,252 @@ """ -note_payload.py — Mindnet payload helpers -Version: 0.5.2 (generated 2025-11-08 21:03:48) -Purpose: - - Build a NOTE payload without dropping existing fields. - - Resolve and inject: - * retriever_weight - * chunk_profile - * edge_defaults -Resolution order: - 1) Frontmatter fields - 2) Type defaults from a provided registry ('types' kwarg) OR YAML file (types_file kwarg). - YAML formats supported: - - root['types'][note_type]{{retriever_weight, chunk_profile, edge_defaults}} - - root[note_type] is the type block directly - 3) ENV MINDNET_DEFAULT_RETRIEVER_WEIGHT - 4) Fallback 1.0 -Notes: - - Function signature tolerant: accepts **kwargs (e.g. vault_root, types_file, types, types_registry). - - Does NOT attempt to create edges; it only exposes 'edge_defaults' in the NOTE payload for later stages. +note_payload.py — Mindnet payload builder (Notes) +Version: 1.3.0 (2025-11-09) + +Purpose +------- +Build Qdrant-compatible JSON payloads for *notes* from a parsed Markdown +representation. The function is tolerant to different call signatures and +accepts both dict-like and object-like "ParsedNote" inputs. + +Key features +------------ +- Reads type defaults from `config/config.yaml` or `config/types.yaml` (same schema). +- Resolves fields with the following precedence: + Frontmatter > type-defaults > ENV > hard-coded fallback. +- Ensures only JSON-serializable types are included (no sets, Path, callables). +- Sets/normalizes: + * `type` : note type (e.g., concept, task, experience, project) + * `retriever_weight` : float, influences retrieval blending downstream + * `chunk_profile` : short | medium | long (string) + * `edge_defaults` : list[str], used by edge builder outside of this module +- Backwards-compatible signature: accepts **kwargs to swallow unknown args + (e.g., vault_root, prefix, ...). + +Expected input (flexible) +------------------------- +`parsed_note` may be: + - dict with keys: id, title, body/text, path, frontmatter (dict), type, ... + - object with attributes: id, title, body/text, path, frontmatter, type, ... + +Schema for config files +----------------------- +version: 1.0 +types: + concept: + chunk_profile: medium + edge_defaults: ["references", "related_to"] + retriever_weight: 0.33 + task: + chunk_profile: short + edge_defaults: ["depends_on", "belongs_to"] + retriever_weight: 0.8 + experience: + chunk_profile: medium + edge_defaults: ["derived_from", "inspired_by"] + retriever_weight: 0.9 + project: + chunk_profile: long + edge_defaults: ["references", "depends_on"] + retriever_weight: 0.95 """ from __future__ import annotations -from typing import Any, Dict, Optional, Mapping, Union + +import json import os from pathlib import Path +from typing import Any, Dict, List, Optional, Union try: import yaml # type: ignore except Exception: # pragma: no cover - yaml = None # will skip YAML loading if unavailable + yaml = None # The caller must ensure PyYAML is installed +# ------------------------------ +# Helpers +# ------------------------------ -# -------- helpers -------- - -def _coerce_mapping(obj: Any) -> Dict[str, Any]: - if obj is None: - return {{}} +def _get(obj: Any, key: str, default: Any = None) -> Any: + """Get key from dict-like or attribute from object-like.""" if isinstance(obj, dict): - return dict(obj) - # try common attributes - out: Dict[str, Any] = {{}} - for k in ("__dict__",): - if hasattr(obj, k): - out.update(getattr(obj, k)) - # named attributes we often see - for k in ("id","note_id","title","type","path","source_path","frontmatter"): - if hasattr(obj, k) and k not in out: - out[k] = getattr(obj, k) - return out + return obj.get(key, default) + return getattr(obj, key, default) +def _frontmatter(obj: Any) -> Dict[str, Any]: + fm = _get(obj, "frontmatter", {}) or {} + return fm if isinstance(fm, dict) else {} -def _get_frontmatter(parsed: Mapping[str, Any]) -> Dict[str, Any]: - fm = parsed.get("frontmatter") - if isinstance(fm, dict): - return dict(fm) - return {{}} # tolerate notes without frontmatter - - -def _load_types_from_yaml(types_file: Optional[Union[str, Path]]) -> Dict[str, Any]: - if types_file is None: - # try common defaults - candidates = [ - Path("config/types.yaml"), - Path("config/types.yml"), - Path("config.yaml"), - Path("config.yml"), - ] - for p in candidates: - if p.exists(): - types_file = p - break - if types_file is None: - return {{}} - p = Path(types_file) - if not p.exists() or yaml is None: - return {{}} +def _coerce_float(val: Any, default: float) -> float: try: - data = yaml.safe_load(p.read_text(encoding="utf-8")) - if not isinstance(data, dict): - return {{}} - # support both shapes: {{types: {{concept: ...}}}} OR {{concept: ...}} - if "types" in data and isinstance(data["types"], dict): - return dict(data["types"]) - return data + if val is None: + return default + if isinstance(val, (int, float)): + return float(val) + if isinstance(val, str) and val.strip(): + return float(val.strip()) except Exception: - return {{}} + pass + return default +def _normalize_chunk_profile(val: Any, fallback: str = "medium") -> str: + if not isinstance(val, str): + return fallback + v = val.strip().lower() + if v in {"short", "medium", "long"}: + return v + return fallback -def _resolve_type_defaults(note_type: Optional[str], types: Optional[Dict[str,Any]]) -> Dict[str, Any]: - defaults = {{}} - if not note_type or not types or not isinstance(types, dict): - return defaults - block = types.get(note_type) - if isinstance(block, dict): - defaults.update(block) - return defaults - - -def _to_float(val: Any, fallback: float) -> float: +def _coerce_str_list(val: Any) -> List[str]: if val is None: - return fallback + return [] + if isinstance(val, list): + out: List[str] = [] + for x in val: + if isinstance(x, str): + out.append(x) + else: + out.append(str(x)) + return out + if isinstance(val, str): + # allow comma-separated + return [x.strip() for x in val.split(",") if x.strip()] + return [] + +def _safe_jsonable(value: Any) -> Any: + """Ensure value is JSON-serializable (no sets, Path, callables, etc.).""" + if isinstance(value, (str, int, float, bool)) or value is None: + return value + if isinstance(value, list): + return [_safe_jsonable(v) for v in value] + if isinstance(value, dict): + return {str(k): _safe_jsonable(v) for k, v in value.items()} + if isinstance(value, Path): + return str(value) + # Avoid sets and other iterables that are not JSON-serializable try: - return float(val) + json.dumps(value) + return value except Exception: - return fallback + return str(value) +# ------------------------------ +# Config loading +# ------------------------------ -def _first_nonempty(*vals): - for v in vals: - if v is not None: - if isinstance(v, str) and v.strip() == "": +def _load_types_config( + explicit_config: Optional[Dict[str, Any]] = None, + search_root: Union[str, Path, None] = None, +) -> Dict[str, Any]: + """ + Load types config from: + 1) explicit_config (if provided) + 2) {search_root}/config/config.yaml + 3) {search_root}/config/types.yaml + 4) ./config/config.yaml + 5) ./config/types.yaml + Returns a dict with shape: {"types": {...}} (empty if none found). + """ + if explicit_config and isinstance(explicit_config, dict): + if "types" in explicit_config and isinstance(explicit_config["types"], dict): + return explicit_config + + candidates: List[Path] = [] + root = Path(search_root) if search_root else Path.cwd() + candidates.append(root / "config" / "config.yaml") + candidates.append(root / "config" / "types.yaml") + # fallback to CWD when search_root was different + candidates.append(Path.cwd() / "config" / "config.yaml") + candidates.append(Path.cwd() / "config" / "types.yaml") + + data = {} + if yaml is None: + return {"types": {}} + + for p in candidates: + try: + if p.exists(): + with p.open("r", encoding="utf-8") as f: + loaded = yaml.safe_load(f) or {} + if isinstance(loaded, dict) and isinstance(loaded.get("types"), dict): + data = {"types": loaded["types"]} + break + except Exception: + continue + if not data: + data = {"types": {}} + return data + +def _type_defaults(note_type: str, cfg: Dict[str, Any]) -> Dict[str, Any]: + return (cfg.get("types") or {}).get(note_type, {}) if isinstance(cfg, dict) else {} + +# ------------------------------ +# Public API +# ------------------------------ + +def make_note_payload( + parsed_note: Any, + *, + config: Optional[Dict[str, Any]] = None, + search_root: Union[str, Path, None] = None, + **kwargs: Any, +) -> Dict[str, Any]: + """ + Build the payload for a NOTE. Tolerates extra kwargs (e.g., vault_root, prefix). + """ + fm = _frontmatter(parsed_note) + note_type = fm.get("type") or _get(parsed_note, "type") or "concept" + note_type = str(note_type).strip().lower() + + # Load config and resolve defaults + cfg = _load_types_config(config, search_root) + defaults = _type_defaults(note_type, cfg) + + # retriever_weight: FM > type-defaults > ENV > 1.0 + rw = fm.get("retriever_weight") + if rw is None: + rw = defaults.get("retriever_weight") + if rw is None: + env_rw = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") + rw = _coerce_float(env_rw, 1.0) + else: + rw = _coerce_float(rw, 1.0) + + # chunk_profile: FM > type-defaults > ENV > medium + cp = fm.get("chunk_profile") + if cp is None: + cp = defaults.get("chunk_profile") + if cp is None: + cp = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE", "medium") + cp = _normalize_chunk_profile(cp, "medium") + + # edge_defaults: FM > type-defaults > empty + edge_defs = fm.get("edge_defaults") + if edge_defs is None: + edge_defs = defaults.get("edge_defaults", []) + edge_defs = _coerce_str_list(edge_defs) + + payload: Dict[str, Any] = { + "id": _get(parsed_note, "id"), + "note_id": _get(parsed_note, "id"), + "title": _get(parsed_note, "title"), + "type": note_type, + "retriever_weight": float(rw), + "chunk_profile": cp, + "edge_defaults": edge_defs, + # Useful passthrough/meta (all made JSON-safe) + "path": _safe_jsonable(_get(parsed_note, "path")), + "source": _safe_jsonable(_get(parsed_note, "source")), + } + + # Include raw frontmatter keys (stringify keys; make safe) + if isinstance(fm, dict): + for k, v in fm.items(): + # avoid overwriting normalized fields + if k in {"type", "retriever_weight", "chunk_profile", "edge_defaults"}: continue - return v - return None - - -# -------- main API -------- - -def make_note_payload(parsed_note: Any, **kwargs) -> Dict[str, Any]: - parsed = _coerce_mapping(parsed_note) - fm = _get_frontmatter(parsed) - - # external sources - types_registry = kwargs.get("types") or kwargs.get("types_registry") - types_from_yaml = _load_types_from_yaml(kwargs.get("types_file")) - # registry wins over YAML if provided - types_all: Dict[str, Any] = types_registry if isinstance(types_registry, dict) else types_from_yaml - - note_type: Optional[str] = _first_nonempty(parsed.get("type"), fm.get("type")) - title: Optional[str] = _first_nonempty(parsed.get("title"), fm.get("title")) - note_id: Optional[str] = _first_nonempty(parsed.get("note_id"), parsed.get("id"), fm.get("id")) - - type_defaults = _resolve_type_defaults(note_type, types_all) - - # --- resolve retriever_weight --- - env_default = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") - env_default_val = _to_float(env_default, 1.0) if env_default is not None else 1.0 - - effective_retriever_weight = _to_float( - _first_nonempty( - fm.get("retriever_weight"), - type_defaults.get("retriever_weight"), - env_default_val, - 1.0, - ), - 1.0, - ) - - # --- resolve chunk_profile --- - effective_chunk_profile = _first_nonempty( - fm.get("chunk_profile"), - fm.get("profile"), - type_defaults.get("chunk_profile"), - os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE"), - ) - - # --- resolve edge_defaults (list[str]) --- - edge_defaults = _first_nonempty( - fm.get("edge_defaults"), - type_defaults.get("edge_defaults"), - ) - if edge_defaults is None: - edge_defaults = [] - if isinstance(edge_defaults, str): - # allow "a,b,c" - edge_defaults = [s.strip() for s in edge_defaults.split(",") if s.strip()] - elif not isinstance(edge_defaults, list): - edge_defaults = [] - - # Start payload by preserving existing parsed keys (shallow copy); DO NOT drop fields - payload: Dict[str, Any] = dict(parsed) - - # Ensure canonical top-level fields - if note_id is not None: - payload["id"] = note_id - payload["note_id"] = note_id - if title is not None: - payload["title"] = title - if note_type is not None: - payload["type"] = note_type - - payload["retriever_weight"] = effective_retriever_weight - if effective_chunk_profile is not None: - payload["chunk_profile"] = effective_chunk_profile - if edge_defaults: - payload["edge_defaults"] = edge_defaults - - # keep frontmatter merged (without duplication) - if "frontmatter" in payload and isinstance(payload["frontmatter"], dict): - fm_out = dict(payload["frontmatter"]) - fm_out.setdefault("type", note_type) - fm_out["retriever_weight"] = effective_retriever_weight - if effective_chunk_profile is not None: - fm_out["chunk_profile"] = effective_chunk_profile - if edge_defaults: - fm_out["edge_defaults"] = edge_defaults - payload["frontmatter"] = fm_out + payload[f"fm_{k}"] = _safe_jsonable(v) + # Remove None values to keep payload clean + payload = {k: v for k, v in payload.items() if v is not None} return payload