mindnet/app/core/chunk_payload.py
Lars b2043f4f84
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
Dateien nach "app/core" hochladen
2025-11-08 21:19:18 +01:00

159 lines
5.6 KiB
Python

from __future__ import annotations

import math
from typing import Any, Dict, Iterable, List, Optional, Union
# ---- Helpers ----
def _coerce_float(val: Any) -> Optional[float]:
if val is None:
return None
try:
if isinstance(val, (int, float)):
return float(val)
if isinstance(val, str):
v = val.strip()
if not v:
return None
return float(v.replace(",", "."))
except Exception:
return None
return None
def _extract_weight(frontmatter: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
if explicit is not None:
return _coerce_float(explicit)
if frontmatter is None:
return None
if "retriever_weight" in frontmatter:
return _coerce_float(frontmatter.get("retriever_weight"))
# also accept nested style: retriever: { weight: 0.8 }
retriever = frontmatter.get("retriever")
if isinstance(retriever, dict) and "weight" in retriever:
return _coerce_float(retriever.get("weight"))
return None
def _ensure_list(x: Any) -> List[Any]:
if x is None:
return []
if isinstance(x, list):
return x
return [x]
def _resolve_note_id(frontmatter: Dict[str, Any], kw_note_id: Optional[str]) -> Optional[str]:
if kw_note_id:
return kw_note_id
if not isinstance(frontmatter, dict):
return None
return frontmatter.get("id") or frontmatter.get("note_id")
def _base_fields(frontmatter: Dict[str, Any], note_id: Optional[str], path: str) -> Dict[str, Any]:
title = None
typ = None
tags = None
if isinstance(frontmatter, dict):
title = frontmatter.get("title")
typ = frontmatter.get("type") or frontmatter.get("note_type")
# tags can be list[str] or comma separated string
tags = frontmatter.get("tags")
if isinstance(tags, str):
tags = [t.strip() for t in tags.split(",") if t.strip()]
return {
"note_id": note_id,
"title": title,
"type": typ,
"tags": tags,
"path": path or None,
}
# ---- Public API ----
def make_chunk_payloads(
    frontmatter: Dict[str, Any],
    *args,
    note_id: Optional[str] = None,
    chunks: Optional[Iterable[Any]] = None,
    path: str = "",
    chunk_profile: Optional[str] = None,
    retriever_weight: Optional[float] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    """
    Assemble one Qdrant payload dict per chunk of a note.

    The function is deliberately lenient so that older call sites keep working.
    Extra positional arguments and ``chunk_profile`` are accepted but not used
    here. When ``chunks`` is None, the keyword aliases ``payloads``, ``pls``,
    ``items`` and ``chunk_items`` are consulted in that order.

    Each item is handled according to its shape:
    - dict containing ``text``, ``window`` or ``note_id``: copied, then missing
      note-level fields and ``chunk_index`` are filled in.
    - dict wrapping a dict under ``payload``: the inner dict is copied and
      completed the same way.
    - any other dict: a new payload is built, taking the text from ``text``,
      ``body`` or ``content``.
    - plain string: used as both ``text`` and ``window``.
    - anything else: a minimal payload with note-level fields and index only.

    ``retriever_weight`` is written into every payload whenever a weight can
    be determined from the explicit argument or the frontmatter, replacing any
    value the item already carried.
    """
    if chunks is None:
        # First truthy alias wins; if none is truthy the last lookup remains.
        for alias in ("payloads", "pls", "items", "chunk_items"):
            chunks = kwargs.get(alias)
            if chunks:
                break

    resolved_id = _resolve_note_id(frontmatter, note_id)
    weight = _extract_weight(frontmatter, retriever_weight)
    shared = _base_fields(frontmatter, resolved_id, path)

    def _fill_missing(record: Dict[str, Any], fallback_index: Any) -> Dict[str, Any]:
        # Add note-level fields and chunk_index only where the key is absent.
        for key, value in shared.items():
            record.setdefault(key, value)
        record.setdefault("chunk_index", fallback_index)
        return record

    payloads: List[Dict[str, Any]] = []
    for position, entry in enumerate(_ensure_list(chunks)):
        if isinstance(entry, dict):
            nested = entry.get("payload")
            if any(key in entry for key in ("text", "window", "note_id")):
                # Already payload-like: work on a shallow copy.
                record = _fill_missing(dict(entry), entry.get("index", position))
            elif isinstance(nested, dict):
                # Payload wrapped under a 'payload' key.
                record = dict(nested)
                record = _fill_missing(record, record.get("index", position))
            else:
                # Text stored under an alternative key name.
                body = entry.get("text") or entry.get("body") or entry.get("content") or ""
                record = {
                    **shared,
                    "chunk_index": entry.get("chunk_index", entry.get("index", position)),
                    "text": body,
                    "window": entry.get("window") or body,
                }
        elif isinstance(entry, str):
            # Bare text has no separate window context.
            record = {**shared, "chunk_index": position, "text": entry, "window": entry}
        else:
            record = {**shared, "chunk_index": position}

        if weight is not None:
            record["retriever_weight"] = weight
        payloads.append(record)

    return payloads