All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
219 lines
7.5 KiB
Python
219 lines
7.5 KiB
Python
# app/core/chunk_payload.py
|
|
# Version: 1.2.0 (2025-11-08)
|
|
# Purpose:
|
|
# Build robust Qdrant payloads for CHUNK points.
|
|
#
|
|
# Highlights:
|
|
# - Works with dict-like chunks and simple objects; supports (text, idx) tuples.
|
|
# - Accepts legacy/extra kwargs (e.g., vault_root) without failing.
|
|
# - Copies canonical note fields onto each chunk (note_id/title/type/tags/path).
|
|
# - Sets 'text' and 'chunk_index' per chunk.
|
|
# - Reliably propagates `retriever_weight` onto every chunk if provided in
|
|
# frontmatter or explicitly.
|
|
#
|
|
# Usage:
|
|
# payloads = make_chunk_payloads(note, chunks, retriever_weight=None, base_payload=None, vault_root="/path/to/vault")
|
|
#
|
|
# Changelog:
|
|
# 1.2.0 (2025-11-08) Accept legacy kwargs, robust getters, propagate retriever_weight.
|
|
# 1.1.0 (2025-11-08) Initial robust rewrite with attribute/dict support.
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
|
|
|
|
def _get(obj: Any, key: str, default: Any = None) -> Any:
|
|
if obj is None:
|
|
return default
|
|
if hasattr(obj, key):
|
|
try:
|
|
val = getattr(obj, key)
|
|
return default if val is None else val
|
|
except Exception:
|
|
pass
|
|
if isinstance(obj, dict):
|
|
if key in obj:
|
|
val = obj.get(key, default)
|
|
return default if val is None else val
|
|
return default
|
|
|
|
|
|
def _get_frontmatter(note: Any) -> Dict[str, Any]:
    """Return the note's frontmatter dict, or an empty dict if none is found.

    Looks first at ``note.frontmatter`` / ``note["frontmatter"]``; if that is
    not a dict, falls back to a ``"frontmatter"`` dict nested under the note's
    ``meta`` dict.
    """
    direct = _get(note, "frontmatter", None)
    if isinstance(direct, dict):
        return direct

    meta = _get(note, "meta", None)
    if isinstance(meta, dict):
        nested = meta.get("frontmatter")
        if isinstance(nested, dict):
            return nested

    return {}
|
|
|
|
|
|
def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> Any:
|
|
if not isinstance(fm, dict):
|
|
return default
|
|
if key in fm:
|
|
val = fm.get(key, default)
|
|
return default if val is None else val
|
|
return default
|
|
|
|
|
|
def _coerce_tags(val: Any) -> List[str]:
|
|
if val is None:
|
|
return []
|
|
if isinstance(val, list):
|
|
return [str(x) for x in val]
|
|
if isinstance(val, str):
|
|
parts = [t.strip() for t in val.split(",")]
|
|
return [p for p in parts if p]
|
|
return []
|
|
|
|
|
|
def _resolve_retriever_weight(fm: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
|
|
if explicit is not None:
|
|
try:
|
|
return float(explicit)
|
|
except Exception:
|
|
return None
|
|
val = _get_from_frontmatter(fm, "retriever_weight", None)
|
|
if isinstance(val, (int, float)):
|
|
return float(val)
|
|
retr = fm.get("retriever")
|
|
if isinstance(retr, dict):
|
|
v = retr.get("weight")
|
|
if isinstance(v, (int, float)):
|
|
return float(v)
|
|
return None
|
|
|
|
|
|
def _resolve_path(note: Any, fm: Dict[str, Any], vault_root: Optional[str]) -> Optional[str]:
    """Pick the note's path and, if possible, express it relative to ``vault_root``.

    The path is taken from the frontmatter ``path`` entry; failing that, from
    the first truthy value among the note's ``path``, ``source`` and
    ``filepath`` fields. Returns ``None`` when no path is available.

    With a truthy ``vault_root`` the result is the path relative to that
    root, or the path itself (as rendered by ``pathlib.Path``) when it is not
    located under the root. Without ``vault_root`` the path is returned via
    ``str()`` unchanged.
    """
    candidate = _get_from_frontmatter(fm, "path", None)
    if candidate is None:
        candidate = (
            _get(note, "path", None)
            or _get(note, "source", None)
            or _get(note, "filepath", None)
        )
        if candidate is None:
            return None

    try:
        if vault_root:
            root = Path(vault_root)
            target = Path(candidate)
            try:
                return str(target.relative_to(root))
            except Exception:
                # Not under the vault root: keep the path as given.
                return str(target)
    except Exception:
        # Values that pathlib cannot handle fall through to plain str().
        pass
    return str(candidate)
|
|
|
|
|
|
def _resolve_note_fields(note: Any, vault_root: Optional[str]) -> Dict[str, Any]:
    """Collect the note-level fields that are copied onto each chunk payload.

    Each field prefers the frontmatter value and falls back to the note
    itself when the frontmatter value is missing or falsy. The returned dict
    has the keys ``note_id``, ``title``, ``type``, ``tags``, ``path`` and
    ``frontmatter`` (the frontmatter dict that was used).
    """
    frontmatter = _get_frontmatter(note)

    def from_frontmatter(name: str) -> Any:
        return _get_from_frontmatter(frontmatter, name, None)

    identifier = (
        from_frontmatter("id")
        or _get(note, "note_id", None)
        or _get(note, "id", None)
    )
    heading = from_frontmatter("title") or _get(note, "title", None)
    kind = from_frontmatter("type") or _get(note, "type", None)
    raw_tags = from_frontmatter("tags") or _get(note, "tags", None)

    return {
        "note_id": identifier,
        "title": heading,
        "type": kind,
        "tags": _coerce_tags(raw_tags),
        "path": _resolve_path(note, frontmatter, vault_root),
        "frontmatter": frontmatter,
    }
|
|
|
|
|
|
def _extract_chunk_text_and_index(chunk: Any, fallback_index: int) -> Tuple[str, int]:
|
|
# (text, idx) tuple
|
|
if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str):
|
|
txt, idx = chunk
|
|
try:
|
|
idx_int = int(idx)
|
|
except Exception:
|
|
idx_int = fallback_index
|
|
return txt, idx_int
|
|
# string
|
|
if isinstance(chunk, str):
|
|
return chunk, fallback_index
|
|
# dict
|
|
if isinstance(chunk, dict):
|
|
txt = chunk.get("text") or chunk.get("window") or chunk.get("body") or chunk.get("content")
|
|
if isinstance(txt, str):
|
|
idx = chunk.get("index")
|
|
try:
|
|
idx_int = int(idx) if idx is not None else fallback_index
|
|
except Exception:
|
|
idx_int = fallback_index
|
|
return txt, idx_int
|
|
# object with attributes
|
|
for attr in ("text", "window", "body", "content"):
|
|
if hasattr(chunk, attr):
|
|
try:
|
|
txt = getattr(chunk, attr)
|
|
except Exception:
|
|
txt = None
|
|
if isinstance(txt, str):
|
|
idx = None
|
|
if hasattr(chunk, "index"):
|
|
try:
|
|
idx = getattr(chunk, "index")
|
|
except Exception:
|
|
idx = None
|
|
try:
|
|
idx_int = int(idx) if idx is not None else fallback_index
|
|
except Exception:
|
|
idx_int = fallback_index
|
|
return txt, idx_int
|
|
raise ValueError("Unsupported chunk format: cannot extract text/index")
|
|
|
|
|
|
def make_chunk_payloads(
    note: Any,
    chunks,
    *,
    retriever_weight: Optional[float] = None,
    base_payload: Optional[Dict[str, Any]] = None,
    vault_root: Optional[str] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """Create one Qdrant payload dict per chunk of a parsed note.

    Every payload starts from the non-``None`` entries of ``base_payload``,
    is overlaid with the note-level fields that could be resolved
    (``note_id``, ``title``, ``type``, non-empty ``tags``, ``path``,
    ``retriever_weight``), and finally receives the chunk's own
    ``chunk_index`` and ``text``.

    Parameters
    ----------
    note : Any
        Parsed note (dict or object with attributes).
    chunks : Iterable[Any]
        Chunks as str, (text, idx) tuples, dicts, or objects carrying the
        text under 'text'/'window'/'body'/'content'.
    retriever_weight : Optional[float]
        Override for the weight; when None the frontmatter is consulted.
    base_payload : Optional[Dict[str, Any]]
        Additional fields copied onto every payload.
    vault_root : Optional[str]
        Base directory used to make 'path' relative where possible.
    **kwargs :
        Accepted and ignored, for compatibility with existing callers.

    Returns
    -------
    List[Dict[str, Any]]
        Payloads in the same order as ``chunks``.

    Raises
    ------
    ValueError
        Propagated from chunk extraction when a chunk has an unsupported format.
    """
    resolved = _resolve_note_fields(note, vault_root)
    weight = _resolve_retriever_weight(resolved["frontmatter"], retriever_weight)

    # Fields shared by every chunk of this note.
    shared: Dict[str, Any] = {}
    if isinstance(base_payload, dict):
        for name, value in base_payload.items():
            if value is not None:
                shared[name] = value

    for name in ("note_id", "title", "type"):
        value = resolved.get(name)
        if value is not None:
            shared[name] = value

    # Tags are only written when the list is non-empty.
    tags = resolved.get("tags")
    if tags:
        shared["tags"] = tags

    path = resolved.get("path")
    if path is not None:
        shared["path"] = path

    if weight is not None:
        shared["retriever_weight"] = weight

    payloads: List[Dict[str, Any]] = []
    for position, chunk in enumerate(chunks):
        text, index = _extract_chunk_text_and_index(chunk, position)
        payloads.append({**shared, "chunk_index": index, "text": text})
    return payloads
|