#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
app/core/chunk_payload.py (Mindnet V2 — robust)

Purpose
-------
Builds chunk payloads from the "Chunk" objects delivered by the chunker.
- Mirrors `retriever_weight` and `chunk_profile` into **every** chunk payload.
- Value resolution: frontmatter > types.yaml > defaults.
- Loads `config/types.yaml` itself when `types_cfg` is not passed in.

Input
-----
- note: dict with at least { frontmatter: {...}, id, type, title, path }
- note_path: path of the note (used for the payload field `path`)
- chunks_from_chunker: list of objects (or dicts) with attributes/fields:
    id, note_id, index, text, window, neighbors_prev, neighbors_next
- note_text: full text of the note (optional, may be empty; currently unused)
- types_cfg: optional; when it is not a dict the config is loaded internally
- file_path: optional, for debugging/tracing in the payload

Output (per chunk)
------------------
- Required fields: note_id, chunk_id, index (0-based), ord (1-based), type, tags
- Texts: text, window
- Neighbours: neighbors_prev, neighbors_next
- Mirrored values: retriever_weight, chunk_profile (only when one is resolved)
- Meta: source_path, path, section (empty string when the chunk has none)

NOTE: `created` / `updated` from the frontmatter are not emitted by this
module at present.
"""
from __future__ import annotations

import os
from typing import Any, Dict, List, Optional, Sequence

try:
    import yaml
except ImportError:
    # PyYAML is only needed when the registry has to be read from disk;
    # callers that pass `types_cfg` explicitly can work without it.
    yaml = None

_DEFAULT_TYPES_FILE = "./config/types.yaml"


def _env(n: str, d: Optional[str] = None) -> str:
    """Return environment variable `n`, or `d` (or "") when it is not set."""
    v = os.getenv(n)
    return v if v is not None else (d or "")


def _walk(root: Any, keys: Sequence[str]) -> Any:
    """Follow `keys` through nested dicts; return None as soon as a step is missing."""
    cur = root
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return None
        cur = cur[key]
    return cur


def _deep_get(root: Any, path: str) -> Any:
    """Look up a dot-separated `path` in nested dicts; None when any segment is missing."""
    return _walk(root, path.split("."))


def _as_float(x: Any) -> Optional[float]:
    """Convert `x` to float, returning None when it is not convertible."""
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None


def _as_int(x: Any) -> Optional[int]:
    """Convert `x` to int, returning None when it is not convertible."""
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return None


def _load_types_local() -> dict:
    """Load the type registry from disk on a best-effort basis.

    The file is taken from the environment variable MINDNET_TYPES_FILE and
    defaults to ./config/types.yaml. Any problem (PyYAML missing, unreadable
    file, invalid YAML, a document whose root is not a mapping) yields {}.
    """
    if yaml is None:
        return {}
    # A blank environment value falls back to the default path as well.
    p = _env("MINDNET_TYPES_FILE", _DEFAULT_TYPES_FILE) or _DEFAULT_TYPES_FILE
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError):
        return {}
    # Callers use `.get` on the result, so a list/scalar root is treated as empty.
    return data if isinstance(data, dict) else {}


def _types_section(reg: Any) -> dict:
    """Return reg["types"] when it is a dict, else the registry root itself ({} if not a dict)."""
    if not isinstance(reg, dict):
        return {}
    types = reg.get("types")
    return types if isinstance(types, dict) else reg


def _effective_chunk_profile(note_type: str, fm: Dict[str, Any], reg: dict) -> Optional[str]:
    """Resolve the chunk profile: frontmatter first, then the registry entry of `note_type`.

    Returns None when neither source provides a string value.
    """
    # Frontmatter wins
    cp = fm.get("chunk_profile")
    if isinstance(cp, str):
        return cp
    # Registry
    entry = _types_section(reg).get(note_type, {})
    if isinstance(entry, dict):
        cp = entry.get("chunk_profile")
        if isinstance(cp, str):
            return cp
    return None


def _effective_retriever_weight(note_type: str, fm: Dict[str, Any], reg: dict) -> float:
    """Resolve the retriever weight: frontmatter > registry type entry > registry defaults > 1.0.

    Each candidate key path is tried in the types section, then under
    `types.` in the registry root, and finally in the registry root itself,
    so that `defaults` / `global` blocks placed next to `types` are honoured.
    Values that cannot be converted to float are skipped.
    """
    # Frontmatter wins
    v = _as_float(fm.get("retriever_weight"))
    if v is not None:
        return v

    types = _types_section(reg)
    root = reg if isinstance(reg, dict) else {}
    # Key tuples instead of dotted strings: a note type containing "." stays one key.
    nt = str(note_type)
    candidates = (
        (nt, "retriever_weight"),
        (nt, "retriever", "weight"),
        (nt, "retrieval", "weight"),
        ("defaults", "retriever_weight"),
        ("defaults", "retriever", "weight"),
        ("global", "retriever_weight"),
        ("global", "retriever", "weight"),
    )
    for keys in candidates:
        for base, full_keys in (
            (types, keys),
            (root, ("types",) + keys),
            (root, keys),
        ):
            v = _as_float(_walk(base, full_keys))
            if v is not None:
                return v
    return 1.0


def _chunk_get(ch: Any, name: str) -> Any:
    """Read `name` from a chunk: attribute first, then dict key; None when absent."""
    value = getattr(ch, name, None)
    if value is None and isinstance(ch, dict):
        value = ch.get(name)
    return value


def _normalize_tags(raw: Any) -> List[Any]:
    """Turn the frontmatter `tags` value into a list (a single value becomes a one-element list)."""
    if not raw:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return [raw]


def make_chunk_payloads(note: Dict[str, Any],
                        note_path: str,
                        chunks_from_chunker: List[Any],
                        *,
                        note_text: str = "",
                        types_cfg: Optional[dict] = None,
                        file_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk.

    Args:
        note: Note dict; `frontmatter`, `type` and `id` are read from it.
            None is treated as an empty note.
        note_path: Stored as `path` (and as `source_path` when `file_path`
            is not given).
        chunks_from_chunker: Chunks as attribute objects or dicts. None is
            treated as an empty list.
        note_text: Accepted for interface compatibility; not used here.
        types_cfg: Type registry. When it is not a dict, the registry is
            loaded from disk.
        file_path: Optional value for `source_path`.

    Returns:
        A list of payload dicts in chunk order. `index` is the chunk's own
        index when it is convertible to int, otherwise its position in
        `chunks_from_chunker`; `ord` is `index + 1`.
    """
    note = note or {}
    fm = note.get("frontmatter")
    if not isinstance(fm, dict):
        fm = {}
    note_type = fm.get("type") or note.get("type") or "concept"
    reg = types_cfg if isinstance(types_cfg, dict) else _load_types_local()

    # Resolve the values mirrored into every chunk once, outside the loop.
    cp = _effective_chunk_profile(note_type, fm, reg)
    rw = _effective_retriever_weight(note_type, fm, reg)
    tags = _normalize_tags(fm.get("tags"))
    default_note_id = fm.get("id") or note.get("id")

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker or []):
        # An explicit index of 0 is valid, so test for None/unconvertible
        # rather than falsiness before falling back to the list position.
        index = _as_int(_chunk_get(ch, "index"))
        if index is None:
            index = idx
        text = _chunk_get(ch, "text") or ""

        pl: Dict[str, Any] = {
            "note_id": _chunk_get(ch, "note_id") or default_note_id,
            "chunk_id": _chunk_get(ch, "id") or None,
            "index": index,
            "ord": index + 1,
            "type": note_type,
            # Fresh list per payload so later mutation of one does not leak into others.
            "tags": list(tags),
            "text": text,
            "window": _chunk_get(ch, "window") or text,
            "neighbors_prev": _chunk_get(ch, "neighbors_prev") or None,
            "neighbors_next": _chunk_get(ch, "neighbors_next") or None,
            "section": _chunk_get(ch, "section") or "",
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": rw,
        }
        if cp is not None:
            pl["chunk_profile"] = cp
        out.append(pl)
    return out