""" chunk_payload.py — v1.4.2 ------------------------- Robuste, abwärtskompatible Payload-Erzeugung für Chunks. Ziele - Setzt pro Chunk `text`, `retriever_weight`, `chunk_profile`, `note_id`. - Akzeptiert ParsedNote-Objekte *oder* Dicts, inklusive bereits vorsegmentierter .chunks. - Verträgt zusätzliche args/kwargs (kompatibel zu älteren Aufrufern). - Konfig-Auflösung identisch zu note_payload.py. Autor: ChatGPT Lizenz: MIT """ from __future__ import annotations import os import hashlib from pathlib import Path from typing import Any, Dict, List, Optional, Union try: import yaml # type: ignore except Exception: # pragma: no cover yaml = None # type: ignore def _as_dict(note: Any) -> Dict[str, Any]: if isinstance(note, dict): return dict(note) out: Dict[str, Any] = {} for attr in ("note_id", "id", "title", "type", "frontmatter", "meta", "body", "text", "content", "path", "chunks"): if hasattr(note, attr): out[attr] = getattr(note, attr) if hasattr(note, "__dict__"): for k, v in note.__dict__.items(): if k not in out: out[k] = v return out def _load_types_config(search_root: Optional[Union[str, Path]] = None, preloaded: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: if isinstance(preloaded, dict) and "types" in preloaded: return preloaded candidates: List[Path] = [] if search_root: root = Path(search_root) candidates.extend([root / "config.yaml", root / "config" / "config.yaml", root / "config" / "types.yaml"]) cwd = Path.cwd() candidates.extend([cwd / "config.yaml", cwd / "config" / "config.yaml", cwd / "config" / "types.yaml"]) for p in candidates: if p.exists() and p.is_file(): if yaml is None: break try: data = yaml.safe_load(p.read_text(encoding="utf-8")) or {} if isinstance(data, dict) and "types" in data: return data except Exception: pass return {"version": "1.0", "types": {}} def _safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any: if not isinstance(d, dict): return default return d.get(key, default) def _resolve_type(note_d: Dict[str, Any]) -> str: fm = note_d.get("frontmatter") or {} t = _safe_get(fm, "type") or note_d.get("type") if not t and isinstance(note_d.get("meta"), dict): t = note_d["meta"].get("type") return str(t or "concept") def _resolve_note_id(note_d: Dict[str, Any]) -> Optional[str]: for k in ("note_id", "id"): v = note_d.get(k) if isinstance(v, str) and v: return v return None def _resolve_body(note_d: Dict[str, Any]) -> str: for k in ("body", "text", "content"): v = note_d.get(k) if isinstance(v, str) and v.strip(): return v return "" def _resolve_defaults_for_type(types_cfg: Dict[str, Any], typ: str) -> Dict[str, Any]: if not isinstance(types_cfg, dict): return {} t = (types_cfg.get("types") or {}).get(typ) or {} return t if isinstance(t, dict) else {} def _coerce_float(val: Any, default: float) -> float: try: if val is None: return default if isinstance(val, (int, float)): return float(val) if isinstance(val, str): return float(val.strip()) except Exception: pass return default def _compute_retriever_weight(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> float: fm = note_d.get("frontmatter") or {} if "retriever_weight" in fm: return _coerce_float(fm.get("retriever_weight"), 1.0) tdef = _resolve_defaults_for_type(types_cfg, typ) if "retriever_weight" in tdef: return _coerce_float(tdef.get("retriever_weight"), 1.0) envv = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT") if envv: return _coerce_float(envv, 1.0) return 1.0 def _compute_chunk_profile(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> str: fm = note_d.get("frontmatter") or {} if "chunk_profile" in fm: return str(fm.get("chunk_profile")) tdef = _resolve_defaults_for_type(types_cfg, typ) if "chunk_profile" in tdef: return str(tdef.get("chunk_profile")) envv = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE") if envv: return str(envv) return "medium" def _norm_chunk_text(s: Any) -> str: if isinstance(s, str): return s.strip() return "" def _hash(s: str) -> str: return hashlib.sha1(s.encode("utf-8")).hexdigest()[:12] def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]: """Erzeugt Payloads für alle Chunks der Note. Akzeptierte zusätzliche kwargs: - types_config: dict wie in config.yaml - search_root / vault_root: für Konfigsuche *args werden ignoriert (Kompatibilität zu älteren Aufrufern). """ note_d = _as_dict(note) types_config = kwargs.get("types_config") search_root = kwargs.get("search_root") or kwargs.get("vault_root") types_cfg = _load_types_config(search_root, types_config) typ = _resolve_type(note_d) note_id = _resolve_note_id(note_d) or "" r_weight = _compute_retriever_weight(note_d, types_cfg, typ) c_profile = _compute_chunk_profile(note_d, types_cfg, typ) out: List[Dict[str, Any]] = [] # 1) Falls der Parser bereits Chunks liefert, nutzen pre = note_d.get("chunks") if isinstance(pre, list) and pre: for idx, c in enumerate(pre): if isinstance(c, dict): text = _norm_chunk_text(c.get("text") or c.get("body") or c.get("content")) else: text = _norm_chunk_text(getattr(c, "text", "")) if not text: # Fallback auf Note-Body, falls leer text = _resolve_body(note_d) if not text: continue chunk_id = f"{note_id}#{idx:03d}" if note_id else _hash(text)[:8] payload = { "note_id": note_id, "chunk_id": chunk_id, "text": text, "retriever_weight": float(r_weight), "chunk_profile": str(c_profile), "type": typ, } out.append(payload) # 2) Sonst als Single-Chunk aus Body/Text if not out: text = _resolve_body(note_d) if text: chunk_id = f"{note_id}#000" if note_id else _hash(text)[:8] out.append({ "note_id": note_id, "chunk_id": chunk_id, "text": text, "retriever_weight": float(r_weight), "chunk_profile": str(c_profile), "type": typ, }) return out