# chunk_payload.py """ Mindnet - Chunk Payload Builder Version: 1.4.3 Beschreibung: - Robust gegenüber alten/neuen Aufrufsignaturen (toleriert *args, **kwargs). - Liest Typ-Defaults aus ./config/config.yaml oder ./config/types.yaml. - Baut Chunks aus vorhandenen note.chunks (falls vorhanden) oder fällt auf eine einfache, profilabhängige Absatzbündelung zurück. - Setzt in jedem Chunk-Payload: - note_id, chunk_id (deterministisch), index, title, type, path - text (nie leer), retriever_weight, chunk_profile - Garantiert JSON-serialisierbare Payloads. """ from __future__ import annotations from typing import Any, Dict, List, Optional import os import json import pathlib import re import yaml import hashlib def _as_dict(note: Any) -> Dict[str, Any]: if isinstance(note, dict): return note d: Dict[str, Any] = {} for attr in ( "id", "note_id", "title", "path", "frontmatter", "meta", "body", "text", "type", "chunks", ): if hasattr(note, attr): d[attr] = getattr(note, attr) if "frontmatter" not in d and hasattr(note, "metadata"): d["frontmatter"] = getattr(note, "metadata") return d def _load_types_config(explicit: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: if isinstance(explicit, dict): return explicit for rel in ("config/config.yaml", "config/types.yaml"): p = pathlib.Path(rel) if p.exists(): with p.open("r", encoding="utf-8") as f: data = yaml.safe_load(f) or {} if isinstance(data, dict) and "types" in data and isinstance(data["types"], dict): return data["types"] return data if isinstance(data, dict) else {} return {} def _get_front(n: Dict[str, Any]) -> Dict[str, Any]: fm = n.get("frontmatter") or n.get("meta") or {} return fm if isinstance(fm, dict) else {} def _coalesce(*vals): for v in vals: if v is not None: return v return None def _body(n: Dict[str, Any]) -> str: b = n.get("body") if isinstance(b, str): return b t = n.get("text") return t if isinstance(t, str) else "" def _iter_chunks(n: Dict[str, Any], profile: str) -> List[Dict[str, Any]]: # 1) Bereits vorhandene Chunks bevorzugen existing = n.get("chunks") if isinstance(existing, list) and existing: out: List[Dict[str, Any]] = [] for i, c in enumerate(existing): if isinstance(c, dict): text = c.get("text") or "" else: text = str(c) if c is not None else "" if not text: continue out.append({"index": i, "text": text}) if out: return out # 2) Fallback: naive, profilabhängige Absatz-Bündelung size = {"short": 600, "medium": 1200, "long": 2400}.get(str(profile), 1200) text = _body(n) if not text: return [] paras = re.split(r"\n{2,}", text) chunks: List[str] = [] buf = "" for p in paras: p = (p or "").strip() if not p: continue if len(buf) + (2 if buf else 0) + len(p) <= size: buf = (buf + "\n\n" + p).strip() if buf else p else: if buf: chunks.append(buf) if len(p) <= size: buf = p else: for i in range(0, len(p), size): chunks.append(p[i : i + size]) buf = "" if buf: chunks.append(buf) return [{"index": i, "text": c} for i, c in enumerate(chunks)] def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]: """ Build payloads for chunks. Tolerates legacy positional arguments. Returns list[dict] (ein Payload pro Chunk). """ n = _as_dict(note) types_cfg = kwargs.get("types_config") or (args[0] if args else None) types_cfg = _load_types_config(types_cfg) fm = _get_front(n) note_type = str(fm.get("type") or n.get("type") or "note") cfg_for_type = types_cfg.get(note_type, {}) if isinstance(types_cfg, dict) else {} try: default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)) except Exception: default_rw = 1.0 retriever_weight = _coalesce( fm.get("retriever_weight"), cfg_for_type.get("retriever_weight"), default_rw, ) try: retriever_weight = float(retriever_weight) except Exception: retriever_weight = default_rw chunk_profile = _coalesce( fm.get("chunk_profile"), cfg_for_type.get("chunk_profile"), os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"), ) if not isinstance(chunk_profile, str): chunk_profile = "medium" note_id = n.get("note_id") or n.get("id") or fm.get("id") title = n.get("title") or fm.get("title") or "" path = n.get("path") if isinstance(path, pathlib.Path): path = str(path) chunks = _iter_chunks(n, chunk_profile) payloads: List[Dict[str, Any]] = [] for c in chunks: idx = c.get("index", len(payloads)) text = c.get("text") if isinstance(c, dict) else (str(c) if c is not None else "") if not isinstance(text, str): text = str(text or "") # deterministische chunk_id key = f"{note_id}|{idx}" h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:12] chunk_id = f"{note_id}-{idx:03d}-{h}" if note_id else h payload = { "note_id": note_id, "chunk_id": chunk_id, "index": idx, "title": title, "type": note_type, "path": path, "text": text, "retriever_weight": retriever_weight, "chunk_profile": chunk_profile, } # JSON-Serialisierbarkeit sicherstellen json.loads(json.dumps(payload, ensure_ascii=False)) payloads.append(payload) return payloads