diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index 22cba73..703fee6 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,158 +1,220 @@ +# app/core/chunk_payload.py +# Line count: 214 + from __future__ import annotations -from typing import Any, Dict, Iterable, List, Optional, Union -# ---- Helpers ---- -def _coerce_float(val: Any) -> Optional[float]: +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + + +def _get(obj: Any, key: str, default: Any = None) -> Any: + if obj is None: + return default + if hasattr(obj, key): + try: + val = getattr(obj, key) + return val if val is not None else default + except Exception: + pass + if isinstance(obj, dict): + if key in obj: + val = obj.get(key, default) + return val if val is not None else default + return default + + +def _get_frontmatter(note: Any) -> Dict[str, Any]: + fm = _get(note, "frontmatter", None) + if isinstance(fm, dict): + return fm + meta = _get(note, "meta", None) + if isinstance(meta, dict) and isinstance(meta.get("frontmatter"), dict): + return meta["frontmatter"] + return {} + + +def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> Any: + if not isinstance(fm, dict): + return default + if key in fm: + val = fm.get(key, default) + return val if val is not None else default + return default + + +def _coerce_tags(val: Any) -> List[str]: if val is None: - return None - try: - if isinstance(val, (int, float)): - return float(val) - if isinstance(val, str): - v = val.strip() - if not v: - return None - return float(v.replace(",", ".")) - except Exception: - return None - return None - -def _extract_weight(frontmatter: Dict[str, Any], explicit: Optional[float]) -> Optional[float]: - if explicit is not None: - return _coerce_float(explicit) - if frontmatter is None: - return None - if "retriever_weight" in frontmatter: - return _coerce_float(frontmatter.get("retriever_weight")) - # also accept nested style: retriever: { weight: 0.8 } - retriever = frontmatter.get("retriever") - if isinstance(retriever, dict) and "weight" in retriever: - return _coerce_float(retriever.get("weight")) - return None - -def _ensure_list(x: Any) -> List[Any]: - if x is None: return [] - if isinstance(x, list): - return x - return [x] + if isinstance(val, list): + return [str(x) for x in val] + if isinstance(val, str): + parts = [t.strip() for t in val.split(",")] + return [p for p in parts if p] + return [] -def _resolve_note_id(frontmatter: Dict[str, Any], kw_note_id: Optional[str]) -> Optional[str]: - if kw_note_id: - return kw_note_id - if not isinstance(frontmatter, dict): - return None - return frontmatter.get("id") or frontmatter.get("note_id") -def _base_fields(frontmatter: Dict[str, Any], note_id: Optional[str], path: str) -> Dict[str, Any]: - title = None - typ = None - tags = None - if isinstance(frontmatter, dict): - title = frontmatter.get("title") - typ = frontmatter.get("type") or frontmatter.get("note_type") - # tags can be list[str] or comma separated string - tags = frontmatter.get("tags") - if isinstance(tags, str): - tags = [t.strip() for t in tags.split(",") if t.strip()] +def _resolve_retriever_weight( + fm: Dict[str, Any], + explicit: Optional[float], +) -> Optional[float]: + if explicit is not None: + return explicit + val = _get_from_frontmatter(fm, "retriever_weight", None) + if isinstance(val, (int, float)): + return float(val) + retr = fm.get("retriever") + if isinstance(retr, dict): + v = retr.get("weight") + if isinstance(v, (int, float)): + return float(v) + return None + + +def _resolve_note_fields(note: Any) -> Dict[str, Any]: + fm = _get_frontmatter(note) + + note_id = _get_from_frontmatter(fm, "id", None) + if note_id is None: + note_id = _get(note, "note_id", None) + if note_id is None: + note_id = _get(note, "id", None) + + title = _get_from_frontmatter(fm, "title", None) + if title is None: + title = _get(note, "title", None) + + ntype = _get_from_frontmatter(fm, "type", None) + if ntype is None: + ntype = _get(note, "type", None) + + tags = _get_from_frontmatter(fm, "tags", None) + if tags is None: + tags = _get(note, "tags", None) + tags = _coerce_tags(tags) + + path = _get_from_frontmatter(fm, "path", None) + if path is None: + path = _get(note, "path", None) + if path is None: + path = _get(note, "source", None) + if path is None: + path = _get(note, "filepath", None) + return { "note_id": note_id, "title": title, - "type": typ, + "type": ntype, "tags": tags, - "path": path or None, + "path": path, + "frontmatter": fm, } -# ---- Public API ---- + +def _extract_chunk_text_and_index( + chunk: Any, + fallback_index: int, +) -> Tuple[str, int]: + """ + Akzeptiert verschiedene Chunk-Formate: + - str (reiner Text) + - dict mit keys: text | window | body | content + - Objekt mit Attributen: text | window | body | content + - (text, idx) Tuple + """ + # Tuple (text, idx) + if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str): + txt, idx = chunk + try: + idx_int = int(idx) + except Exception: + idx_int = fallback_index + return txt, idx_int + + # String + if isinstance(chunk, str): + return chunk, fallback_index + + # Dict + if isinstance(chunk, dict): + txt = ( + chunk.get("text") + or chunk.get("window") + or chunk.get("body") + or chunk.get("content") + ) + if isinstance(txt, str): + idx = chunk.get("index") + try: + idx_int = int(idx) if idx is not None else fallback_index + except Exception: + idx_int = fallback_index + return txt, idx_int + + # Objekt mit Attributen + for attr in ("text", "window", "body", "content"): + if hasattr(chunk, attr): + try: + txt = getattr(chunk, attr) + except Exception: + txt = None + if isinstance(txt, str): + # Optionale "index"-Quelle + idx = None + if hasattr(chunk, "index"): + try: + idx = getattr(chunk, "index") + except Exception: + idx = None + try: + idx_int = int(idx) if idx is not None else fallback_index + except Exception: + idx_int = fallback_index + return txt, idx_int + + # Wenn nichts passt -> klarer Fehler + raise ValueError("Unsupported chunk format: cannot extract text/index") + + def make_chunk_payloads( - frontmatter: Dict[str, Any], - *args, - note_id: Optional[str] = None, - chunks: Optional[Iterable[Any]] = None, - path: str = "", - chunk_profile: Optional[str] = None, + note: Any, + chunks: Iterable[Any], + *, retriever_weight: Optional[float] = None, - **kwargs, + base_payload: Optional[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """ - Build chunk payload dictionaries for Qdrant. - - This function is intentionally permissive to stay compatible with older callers: - - If `chunks` is a list of dictionaries that already contain payload-like fields, - those are augmented. - - If `chunks` is a list of strings, minimal payloads are created. - - If `chunks` is a list of dicts with keys like `text`, `window`, or `index`, they are normalized. - - Always injects `retriever_weight` into each payload when available (from explicit arg or frontmatter). + Erzeugt Qdrant-Payloads für Chunk-Punkte. + - Kopiert Note-Metadaten (note_id/title/type/tags/path) + - Schreibt text + chunk_index je Chunk + - Setzt retriever_weight, wenn vorhanden/angegeben """ - # Backward-compat for callers that might pass via kwargs - if chunks is None: - chunks = kwargs.get("payloads") or kwargs.get("pls") or kwargs.get("items") or kwargs.get("chunk_items") - - note_id_resolved = _resolve_note_id(frontmatter, note_id) - weight = _extract_weight(frontmatter, retriever_weight) - base = _base_fields(frontmatter, note_id_resolved, path) - out: List[Dict[str, Any]] = [] - for idx, item in enumerate(_ensure_list(chunks)): - # Case A: already a full payload dict (heuristic: has 'text' or 'window' or 'note_id' keys) - if isinstance(item, dict) and ("text" in item or "window" in item or "note_id" in item): - pl = dict(item) # shallow copy - # ensure base fields exist if missing - for k, v in base.items(): - pl.setdefault(k, v) - # ensure chunk_index if not present - pl.setdefault("chunk_index", item.get("index", idx)) - # inject retriever_weight - if weight is not None: - pl["retriever_weight"] = weight - out.append(pl) - continue + note_fields = _resolve_note_fields(note) + fm = note_fields["frontmatter"] + rw = _resolve_retriever_weight(fm, retriever_weight) - # Case B: item is a dict with nested 'payload' - if isinstance(item, dict) and "payload" in item and isinstance(item["payload"], dict): - pl = dict(item["payload"]) - for k, v in base.items(): - pl.setdefault(k, v) - pl.setdefault("chunk_index", pl.get("index", idx)) - if weight is not None: - pl["retriever_weight"] = weight - out.append(pl) - continue + # Basisfelder, die jeder Chunk tragen soll + common: Dict[str, Any] = {} + if base_payload: + common.update({k: v for k, v in base_payload.items() if v is not None}) - # Case C: item is a plain string -> treat as text (no window context) - if isinstance(item, str): - text_val = item - pl = { - **base, - "chunk_index": idx, - "text": text_val, - "window": text_val, - } - if weight is not None: - pl["retriever_weight"] = weight - out.append(pl) - continue + if note_fields.get("note_id") is not None: + common["note_id"] = note_fields["note_id"] + if note_fields.get("title") is not None: + common["title"] = note_fields["title"] + if note_fields.get("type") is not None: + common["type"] = note_fields["type"] + if note_fields.get("tags"): + common["tags"] = note_fields["tags"] + if note_fields.get("path") is not None: + common["path"] = note_fields["path"] + if rw is not None: + common["retriever_weight"] = rw - # Case D: item has 'text'/'window' under different names - if isinstance(item, dict): - text_val = item.get("text") or item.get("body") or item.get("content") or "" - window_val = item.get("window") or text_val - pl = { - **base, - "chunk_index": item.get("chunk_index", item.get("index", idx)), - "text": text_val, - "window": window_val, - } - if weight is not None: - pl["retriever_weight"] = weight - out.append(pl) - continue - - # Fallback: minimal payload - pl = {**base, "chunk_index": idx} - if weight is not None: - pl["retriever_weight"] = weight - out.append(pl) + for i, ch in enumerate(chunks): + text, idx = _extract_chunk_text_and_index(ch, i) + payload = dict(common) # copy + payload["chunk_index"] = idx + payload["text"] = text + out.append(payload) return out