From b3833f2051228d1ea12946a13e6143f8b6912ea3 Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 16 Dec 2025 15:50:24 +0100 Subject: [PATCH] chunk_payload nimmt auch das richtige chunking_profile --- app/core/chunk_payload.py | 80 +++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index 5cc3000..d864b82 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -1,11 +1,13 @@ """ FILE: app/core/chunk_payload.py -DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. Inkludiert Nachbarschafts-IDs (prev/next) und Titel. -VERSION: 2.0.0 +DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. +FEATURES: + - Inkludiert Nachbarschafts-IDs (prev/next) und Titel. + - FIX: Korrektes Auslesen von 'chunking_profile' (Frontmatter > Type > Default). +VERSION: 2.1.0 STATUS: Active DEPENDENCIES: yaml, os EXTERNAL_CONFIG: config/types.yaml -LAST_ANALYSIS: 2025-12-15 """ from __future__ import annotations from typing import Any, Dict, List, Optional @@ -36,21 +38,32 @@ def _get_defaults(reg: dict) -> dict: return {} def _as_float(x: Any): - try: - return float(x) - except Exception: - return None + try: return float(x) + except Exception: return None -def _resolve_chunk_profile(note_type: str, reg: dict) -> str: +def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]: + """ + Liest das Profil aus der Config (Type > Default). + Prüft 'chunking_profile' UND 'chunk_profile'. + """ + # 1. Type Level types = _get_types_map(reg) if isinstance(types, dict): t = types.get(note_type, {}) - if isinstance(t, dict) and isinstance(t.get("chunk_profile"), str): - return t["chunk_profile"] + if isinstance(t, dict): + # Prüfe beide Schreibweisen + cp = t.get("chunking_profile") or t.get("chunk_profile") + if isinstance(cp, str) and cp: + return cp + + # 2. Defaults Level defs = _get_defaults(reg) - if isinstance(defs, dict) and isinstance(defs.get("chunk_profile"), str): - return defs["chunk_profile"] - return "default" + if isinstance(defs, dict): + cp = defs.get("chunking_profile") or defs.get("chunk_profile") + if isinstance(cp, str) and cp: + return cp + + return None def _resolve_retriever_weight(note_type: str, reg: dict) -> float: types = _get_types_map(reg) @@ -58,20 +71,16 @@ def _resolve_retriever_weight(note_type: str, reg: dict) -> float: t = types.get(note_type, {}) if isinstance(t, dict) and (t.get("retriever_weight") is not None): v = _as_float(t.get("retriever_weight")) - if v is not None: - return float(v) + if v is not None: return float(v) defs = _get_defaults(reg) if isinstance(defs, dict) and (defs.get("retriever_weight") is not None): v = _as_float(defs.get("retriever_weight")) - if v is not None: - return float(v) + if v is not None: return float(v) return 1.0 def _as_list(x): - if x is None: - return [] - if isinstance(x, list): - return x + if x is None: return [] + if isinstance(x, list): return x return [x] def make_chunk_payloads(note: Dict[str, Any], @@ -81,17 +90,29 @@ def make_chunk_payloads(note: Dict[str, Any], note_text: str = "", types_cfg: Optional[dict] = None, file_path: Optional[str] = None) -> List[Dict[str, Any]]: + """ + Erstellt die Payloads für die Chunks. + """ fm = (note or {}).get("frontmatter", {}) or {} note_type = fm.get("type") or note.get("type") or "concept" - # WP-11 FIX: Title Extraction für Discovery Service - # Wir holen den Titel aus Frontmatter oder Fallback ID/Untitled + # Title Extraction title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled" reg = types_cfg if isinstance(types_cfg, dict) else _load_types() - # types.yaml authoritative - cp = _resolve_chunk_profile(note_type, reg) + # --- FIX: Profil-Ermittlung --- + # 1. Frontmatter (Override) + cp = fm.get("chunking_profile") or fm.get("chunk_profile") + + # 2. Config (Type / Default) + if not cp: + cp = _resolve_chunk_profile_from_config(note_type, reg) + + # 3. Hard Fallback + if not cp: + cp = "sliding_standard" # Statt "default" + rw = _resolve_retriever_weight(note_type, reg) tags = fm.get("tags") or [] @@ -100,7 +121,7 @@ def make_chunk_payloads(note: Dict[str, Any], out: List[Dict[str, Any]] = [] for idx, ch in enumerate(chunks_from_chunker): - # Attribute oder Keys (Chunk-Objekt oder Dict) + # Attribute sicher extrahieren cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None) nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id")) index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx) @@ -112,7 +133,7 @@ def make_chunk_payloads(note: Dict[str, Any], pl: Dict[str, Any] = { "note_id": nid, "chunk_id": cid, - "title": title, # <--- HIER: Titel in Payload einfügen + "title": title, "index": int(index), "ord": int(index) + 1, "type": note_type, @@ -125,9 +146,10 @@ def make_chunk_payloads(note: Dict[str, Any], "path": note_path, "source_path": file_path or note_path, "retriever_weight": float(rw), - "chunk_profile": cp, + "chunk_profile": cp, # Jetzt korrekt } - # Aufräumen von Alt-Feldern + + # Cleanup for alias in ("chunk_num", "Chunk_Number"): pl.pop(alias, None)