""" FILE: app/core/chunk_payload.py DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. FEATURES: - Inkludiert Nachbarschafts-IDs (prev/next) und Titel. - FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen. VERSION: 2.3.0 STATUS: Active DEPENDENCIES: yaml, os EXTERNAL_CONFIG: config/types.yaml """ from __future__ import annotations from typing import Any, Dict, List, Optional import os, yaml def _env(n: str, d: Optional[str]=None) -> str: v = os.getenv(n) return v if v is not None else (d or "") def _load_types() -> dict: p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") try: with open(p, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} except Exception: return {} def _get_types_map(reg: dict) -> dict: if isinstance(reg, dict) and isinstance(reg.get("types"), dict): return reg["types"] return reg if isinstance(reg, dict) else {} def _get_defaults(reg: dict) -> dict: if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict): return reg["defaults"] if isinstance(reg, dict) and isinstance(reg.get("global"), dict): return reg["global"] return {} def _as_float(x: Any): try: return float(x) except Exception: return None def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]: # 1. Type Level types = _get_types_map(reg) if isinstance(types, dict): t = types.get(note_type, {}) if isinstance(t, dict): cp = t.get("chunking_profile") or t.get("chunk_profile") if isinstance(cp, str) and cp: return cp # 2. Defaults Level defs = _get_defaults(reg) if isinstance(defs, dict): cp = defs.get("chunking_profile") or defs.get("chunk_profile") if isinstance(cp, str) and cp: return cp return None def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float: """ Liest Weight nur aus Config (Type > Default). Wird aufgerufen, wenn im Frontmatter nichts steht. """ # 1. Type Level types = _get_types_map(reg) if isinstance(types, dict): t = types.get(note_type, {}) if isinstance(t, dict) and (t.get("retriever_weight") is not None): v = _as_float(t.get("retriever_weight")) if v is not None: return float(v) # 2. Defaults Level defs = _get_defaults(reg) if isinstance(defs, dict) and (defs.get("retriever_weight") is not None): v = _as_float(defs.get("retriever_weight")) if v is not None: return float(v) return 1.0 def _as_list(x): if x is None: return [] if isinstance(x, list): return x return [x] def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], *, note_text: str = "", types_cfg: Optional[dict] = None, file_path: Optional[str] = None) -> List[Dict[str, Any]]: """ Erstellt die Payloads für die Chunks. Argument 'note' kann sein: A) Ein komplexes Objekt/Dict mit Key "frontmatter" (Legacy / Tests) B) Direkt das Frontmatter-Dictionary (Call aus ingestion.py) """ # --- FIX 3: Intelligente Erkennung der Input-Daten --- # Wir prüfen: Ist 'note' ein Container MIT 'frontmatter', oder IST es das 'frontmatter'? if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict): # Fall A: Container (wir müssen auspacken) fm = note["frontmatter"] else: # Fall B: Direktes Dict (so ruft ingestion.py es auf!) fm = note or {} note_type = fm.get("type") or note.get("type") or "concept" # Title Extraction (Fallback Chain) title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled" reg = types_cfg if isinstance(types_cfg, dict) else _load_types() # --- Profil-Ermittlung --- # Da wir 'fm' jetzt korrekt haben, funktionieren diese lookups: cp = fm.get("chunking_profile") or fm.get("chunk_profile") if not cp: cp = _resolve_chunk_profile_from_config(note_type, reg) if not cp: cp = "sliding_standard" # --- Retriever Weight Ermittlung --- rw = fm.get("retriever_weight") if rw is None: rw = _resolve_retriever_weight_from_config(note_type, reg) try: rw = float(rw) except Exception: rw = 1.0 tags = fm.get("tags") or [] if isinstance(tags, str): tags = [tags] out: List[Dict[str, Any]] = [] for idx, ch in enumerate(chunks_from_chunker): # Attribute extrahieren cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None) nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id")) index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx) text = getattr(ch, "text", None) or (ch.get("text") if isinstance(ch, dict) else "") window = getattr(ch, "window", None) or (ch.get("window") if isinstance(ch, dict) else text) prev_id = getattr(ch, "neighbors_prev", None) or (ch.get("neighbors_prev") if isinstance(ch, dict) else None) next_id = getattr(ch, "neighbors_next", None) or (ch.get("neighbors_next") if isinstance(ch, dict) else None) pl: Dict[str, Any] = { "note_id": nid, "chunk_id": cid, "title": title, "index": int(index), "ord": int(index) + 1, "type": note_type, "tags": tags, "text": text, "window": window, "neighbors_prev": _as_list(prev_id), "neighbors_next": _as_list(next_id), "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else ""), "path": note_path, "source_path": file_path or note_path, "retriever_weight": float(rw), "chunk_profile": cp, # Jetzt endlich mit dem Override-Wert! } # Cleanup for alias in ("chunk_num", "Chunk_Number"): pl.pop(alias, None) out.append(pl) return out