All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
216 lines
6.7 KiB
Python
216 lines
6.7 KiB
Python
"""
|
|
chunk_payload.py — v1.4.2
|
|
-------------------------
|
|
Robuste, abwärtskompatible Payload-Erzeugung für Chunks.
|
|
|
|
Ziele
|
|
- Setzt pro Chunk `text`, `retriever_weight`, `chunk_profile`, `note_id`.
|
|
- Akzeptiert ParsedNote-Objekte *oder* Dicts, inklusive bereits vorsegmentierter .chunks.
|
|
- Verträgt zusätzliche args/kwargs (kompatibel zu älteren Aufrufern).
|
|
- Konfig-Auflösung identisch zu note_payload.py.
|
|
|
|
Autor: ChatGPT
|
|
Lizenz: MIT
|
|
"""
|
|
from __future__ import annotations

import hashlib
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

# PyYAML is optional: without it, config discovery degrades to built-in defaults.
try:
    import yaml  # type: ignore
except Exception:  # pragma: no cover
    yaml = None  # type: ignore
|
|
|
|
|
|
def _as_dict(note: Any) -> Dict[str, Any]:
|
|
if isinstance(note, dict):
|
|
return dict(note)
|
|
out: Dict[str, Any] = {}
|
|
for attr in ("note_id", "id", "title", "type", "frontmatter", "meta", "body", "text", "content", "path", "chunks"):
|
|
if hasattr(note, attr):
|
|
out[attr] = getattr(note, attr)
|
|
if hasattr(note, "__dict__"):
|
|
for k, v in note.__dict__.items():
|
|
if k not in out:
|
|
out[k] = v
|
|
return out
|
|
|
|
|
|
def _load_types_config(search_root: Optional[Union[str, Path]] = None,
|
|
preloaded: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
|
if isinstance(preloaded, dict) and "types" in preloaded:
|
|
return preloaded
|
|
|
|
candidates: List[Path] = []
|
|
if search_root:
|
|
root = Path(search_root)
|
|
candidates.extend([root / "config.yaml", root / "config" / "config.yaml", root / "config" / "types.yaml"])
|
|
cwd = Path.cwd()
|
|
candidates.extend([cwd / "config.yaml", cwd / "config" / "config.yaml", cwd / "config" / "types.yaml"])
|
|
|
|
for p in candidates:
|
|
if p.exists() and p.is_file():
|
|
if yaml is None:
|
|
break
|
|
try:
|
|
data = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
|
if isinstance(data, dict) and "types" in data:
|
|
return data
|
|
except Exception:
|
|
pass
|
|
return {"version": "1.0", "types": {}}
|
|
|
|
|
|
def _safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
|
|
if not isinstance(d, dict):
|
|
return default
|
|
return d.get(key, default)
|
|
|
|
|
|
def _resolve_type(note_d: Dict[str, Any]) -> str:
|
|
fm = note_d.get("frontmatter") or {}
|
|
t = _safe_get(fm, "type") or note_d.get("type")
|
|
if not t and isinstance(note_d.get("meta"), dict):
|
|
t = note_d["meta"].get("type")
|
|
return str(t or "concept")
|
|
|
|
|
|
def _resolve_note_id(note_d: Dict[str, Any]) -> Optional[str]:
|
|
for k in ("note_id", "id"):
|
|
v = note_d.get(k)
|
|
if isinstance(v, str) and v:
|
|
return v
|
|
return None
|
|
|
|
|
|
def _resolve_body(note_d: Dict[str, Any]) -> str:
|
|
for k in ("body", "text", "content"):
|
|
v = note_d.get(k)
|
|
if isinstance(v, str) and v.strip():
|
|
return v
|
|
return ""
|
|
|
|
|
|
def _resolve_defaults_for_type(types_cfg: Dict[str, Any], typ: str) -> Dict[str, Any]:
|
|
if not isinstance(types_cfg, dict):
|
|
return {}
|
|
t = (types_cfg.get("types") or {}).get(typ) or {}
|
|
return t if isinstance(t, dict) else {}
|
|
|
|
|
|
def _coerce_float(val: Any, default: float) -> float:
|
|
try:
|
|
if val is None:
|
|
return default
|
|
if isinstance(val, (int, float)):
|
|
return float(val)
|
|
if isinstance(val, str):
|
|
return float(val.strip())
|
|
except Exception:
|
|
pass
|
|
return default
|
|
|
|
|
|
def _compute_retriever_weight(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> float:
    """Resolve the retriever weight for a note.

    Precedence: frontmatter > per-type config default >
    MINDNET_DEFAULT_RETRIEVER_WEIGHT env var > 1.0.

    Fix: the previous version did `"retriever_weight" in fm` on a frontmatter
    value that may be a non-dict (e.g. a stray YAML scalar or list); the `in`
    check can succeed there while the subsequent `fm.get(...)` raises
    AttributeError. Non-dict frontmatter is now ignored, matching how
    `_resolve_type` treats it via `_safe_get`.
    """
    fm = note_d.get("frontmatter")
    if isinstance(fm, dict) and "retriever_weight" in fm:
        return _coerce_float(fm.get("retriever_weight"), 1.0)
    tdef = _resolve_defaults_for_type(types_cfg, typ)
    if "retriever_weight" in tdef:
        return _coerce_float(tdef.get("retriever_weight"), 1.0)
    envv = os.getenv("MINDNET_DEFAULT_RETRIEVER_WEIGHT")
    if envv:
        return _coerce_float(envv, 1.0)
    return 1.0
|
|
|
|
|
|
def _compute_chunk_profile(note_d: Dict[str, Any], types_cfg: Dict[str, Any], typ: str) -> str:
    """Resolve the chunking profile for a note.

    Precedence: frontmatter > per-type config default >
    MINDNET_DEFAULT_CHUNK_PROFILE env var > "medium".

    Fix: a non-dict frontmatter value (stray YAML scalar/list) previously
    passed the `in` membership test and then crashed on `fm.get(...)`;
    such frontmatter is now ignored, consistent with `_resolve_type`.
    """
    fm = note_d.get("frontmatter")
    if isinstance(fm, dict) and "chunk_profile" in fm:
        return str(fm.get("chunk_profile"))
    tdef = _resolve_defaults_for_type(types_cfg, typ)
    if "chunk_profile" in tdef:
        return str(tdef.get("chunk_profile"))
    envv = os.getenv("MINDNET_DEFAULT_CHUNK_PROFILE")
    if envv:
        return str(envv)
    return "medium"
|
|
|
|
|
|
def _norm_chunk_text(s: Any) -> str:
|
|
if isinstance(s, str):
|
|
return s.strip()
|
|
return ""
|
|
|
|
|
|
def _hash(s: str) -> str:
|
|
return hashlib.sha1(s.encode("utf-8")).hexdigest()[:12]
|
|
|
|
|
|
def make_chunk_payloads(note: Any, *args, **kwargs) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk of *note*.

    Accepts ParsedNote-like objects or plain dicts, with or without
    pre-segmented ``.chunks``. Extra positional arguments are ignored
    (backward compatibility with older callers).

    Recognized keyword arguments:
        types_config: dict shaped like config.yaml (``{"types": {...}}``).
        search_root / vault_root: root directory for config discovery.

    Returns:
        A list of dicts with keys ``note_id``, ``chunk_id``, ``text``,
        ``retriever_weight``, ``chunk_profile`` and ``type``. Empty when
        the note yields no usable text.

    Refactor: payload construction was duplicated in both branches (chunked
    and single-body); it is now a single inner helper so the two paths can
    no longer drift apart. Behavior is unchanged (``#000`` equals idx 0).
    """
    note_d = _as_dict(note)

    types_config = kwargs.get("types_config")
    search_root = kwargs.get("search_root") or kwargs.get("vault_root")
    types_cfg = _load_types_config(search_root, types_config)

    typ = _resolve_type(note_d)
    note_id = _resolve_note_id(note_d) or ""

    r_weight = _compute_retriever_weight(note_d, types_cfg, typ)
    c_profile = _compute_chunk_profile(note_d, types_cfg, typ)

    def _payload(text: str, idx: int) -> Dict[str, Any]:
        # Positional id when the note has an id, content-derived id otherwise.
        chunk_id = f"{note_id}#{idx:03d}" if note_id else _hash(text)[:8]
        return {
            "note_id": note_id,
            "chunk_id": chunk_id,
            "text": text,
            "retriever_weight": float(r_weight),
            "chunk_profile": str(c_profile),
            "type": typ,
        }

    out: List[Dict[str, Any]] = []

    # 1) Prefer chunks already produced by the parser.
    pre = note_d.get("chunks")
    if isinstance(pre, list) and pre:
        for idx, c in enumerate(pre):
            if isinstance(c, dict):
                text = _norm_chunk_text(c.get("text") or c.get("body") or c.get("content"))
            else:
                text = _norm_chunk_text(getattr(c, "text", ""))
            if not text:
                # Fall back to the note body for empty chunk entries.
                text = _resolve_body(note_d)
            if not text:
                continue
            out.append(_payload(text, idx))

    # 2) Otherwise emit a single chunk from the note body.
    if not out:
        text = _resolve_body(note_d)
        if text:
            out.append(_payload(text, 0))

    return out
|