#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/chunk_payload.py (Mindnet V2 — robust v2)

Changes compared to v1:
- neighbors_prev / neighbors_next are persisted as **arrays** ([], [id]).
- retriever_weight / chunk_profile are resolved for every chunk payload
  (frontmatter > types.yaml > defaults).
- Loads config/types.yaml on its own when types_cfg is not passed in.
"""
from __future__ import annotations

import os
from typing import Any, Dict, List, Optional

try:
    import yaml
except ImportError:
    # PyYAML is only needed for the best-effort fallback in
    # _load_types_local(); callers that pass types_cfg do not need it.
    yaml = None  # type: ignore[assignment]


def _env(n: str, d: Optional[str] = None) -> str:
    """Return environment variable *n*; if unset, return *d* (or "" when *d* is falsy)."""
    v = os.getenv(n)
    return v if v is not None else (d or "")


def _deep_get(root: Any, path: str) -> Any:
    """Walk nested dicts along the dot-separated *path*.

    Returns None as soon as a segment is missing or the current value is
    not a dict.
    """
    cur = root
    for key in path.split("."):
        if not isinstance(cur, dict) or key not in cur:
            return None
        cur = cur[key]
    return cur


def _as_float(x: Any) -> Optional[float]:
    """Convert *x* to float, returning None when it cannot be converted."""
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None


def _load_types_local() -> dict:
    """Load the types registry from disk on a best-effort basis.

    The path is read from the MINDNET_TYPES_FILE environment variable and
    defaults to "./config/types.yaml". Any failure (PyYAML not installed,
    unreadable file, invalid YAML, top-level value that is not a mapping)
    results in an empty dict instead of an exception.
    """
    if yaml is None:
        return {}
    p = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception:
        # Deliberately broad: a missing or broken registry must never
        # prevent payloads from being built.
        return {}
    # Downstream code calls .get() on the registry, so anything other than
    # a mapping (e.g. a top-level YAML list) is treated as "no registry".
    return data if isinstance(data, dict) else {}


def _effective_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> Optional[str]:
    """Resolve the chunk profile for a note.

    Precedence: a string ``chunk_profile`` in the frontmatter *fm*, then the
    string ``chunk_profile`` of the *note_type* entry in the registry *reg*
    (looked up under ``reg["types"]`` when that is a dict, otherwise in
    *reg* itself). Returns None when neither provides a string.
    """
    if isinstance(fm.get("chunk_profile"), str):
        return fm.get("chunk_profile")
    types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
    if isinstance(types, dict):
        v = types.get(note_type, {})
        if isinstance(v, dict):
            cp = v.get("chunk_profile")
            if isinstance(cp, str):
                return cp
    return None


def _effective_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) -> float:
    """Resolve the retriever weight for a note.

    Precedence: a float-convertible ``retriever_weight`` in the frontmatter
    *fm*, then the first float-convertible value found under one of the
    candidate paths in the registry, then 1.0.

    The candidate paths are looked up under ``reg["types"]`` when that is a
    dict, otherwise in *reg* itself.
    """
    if fm.get("retriever_weight") is not None:
        v = _as_float(fm.get("retriever_weight"))
        if v is not None:
            return v
    types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
    candidates = [
        f"{note_type}.retriever_weight",
        f"{note_type}.retriever.weight",
        f"{note_type}.retrieval.weight",
        "defaults.retriever_weight",
        "defaults.retriever.weight",
        "global.retriever_weight",
        "global.retriever.weight",
    ]
    # NOTE(review): "defaults.*" / "global.*" are resolved relative to the
    # same mapping as the note types; confirm whether types.yaml keeps them
    # there or at the document root.
    for path in candidates:
        v = _as_float(_deep_get(types, path))
        if v is not None:
            return v
    return 1.0


def _as_list(x):
    """Normalize *x* to a list: None -> [], list -> unchanged, anything else -> [x]."""
    if x is None:
        return []
    if isinstance(x, list):
        return x
    return [x]


def _chunk_field(ch: Any, name: str) -> Any:
    """Read field *name* from a chunk that is either an object or a dict.

    The attribute is tried first; dict chunks fall back to the key of the
    same name. Returns None when the field is absent (or explicitly None).
    """
    val = getattr(ch, name, None)
    if val is None and isinstance(ch, dict):
        val = ch.get(name)
    return val


def make_chunk_payloads(
    note: Dict[str, Any],
    note_path: str,
    chunks_from_chunker: List[Any],
    *,
    note_text: str = "",
    types_cfg: Optional[dict] = None,
    file_path: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of a note.

    Args:
        note: Note dict; its "frontmatter" mapping supplies "type", "id",
            "tags", "chunk_profile" and "retriever_weight". None or a
            non-dict value is treated as an empty note.
        note_path: Stored as "path" and, when *file_path* is not given, as
            "source_path".
        chunks_from_chunker: Chunks as objects (attribute access) or dicts
            (key access). Fields read: id, note_id, index, text, window,
            neighbors_prev, neighbors_next, section.
        note_text: Accepted for interface compatibility; not used here.
        types_cfg: Types registry. When it is not a dict, the registry is
            loaded via _load_types_local().
        file_path: Optional value for "source_path".

    Returns:
        A list of payload dicts in chunk order. Missing chunk fields fall
        back to: note_id -> frontmatter "id", index -> position in the
        list, text -> "", window -> text, section -> "". The neighbor
        fields are always lists. "chunk_profile" is only present when one
        could be resolved.

    Raises:
        TypeError / ValueError: if a chunk supplies an "index" that cannot
            be converted to int.
    """
    note = note if isinstance(note, dict) else {}
    fm = note.get("frontmatter") or {}
    if not isinstance(fm, dict):
        fm = {}
    note_type = fm.get("type") or note.get("type") or "concept"

    reg = types_cfg if isinstance(types_cfg, dict) else _load_types_local()
    cp = _effective_chunk_profile(note_type, fm, reg)
    rw = _effective_retriever_weight(note_type, fm, reg)

    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker or []):
        # Chunks may be objects (attributes) or dicts (keys).
        index = _chunk_field(ch, "index")
        if index is None:
            # Only a truly missing index falls back to the position; an
            # explicit 0 is a valid index and must be kept.
            index = idx
        index = int(index)
        text = _chunk_field(ch, "text") or ""

        pl: Dict[str, Any] = {
            "note_id": _chunk_field(ch, "note_id") or fm.get("id"),
            "chunk_id": _chunk_field(ch, "id") or None,
            "index": index,
            "ord": index + 1,
            "type": note_type,
            # Copy so that payloads do not share one mutable list.
            "tags": list(tags) if isinstance(tags, list) else tags,
            "text": text,
            "window": _chunk_field(ch, "window") or text,
            "neighbors_prev": _as_list(_chunk_field(ch, "neighbors_prev") or None),
            "neighbors_next": _as_list(_chunk_field(ch, "neighbors_next") or None),
            "section": _chunk_field(ch, "section") or "",
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": float(rw),
        }
        if cp is not None:
            pl["chunk_profile"] = cp
        out.append(pl)
    return out