""" FILE: app/core/chunking/chunking_utils.py DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration. """ import math import yaml import logging from pathlib import Path from typing import Dict, Any, Tuple, Optional logger = logging.getLogger(__name__) BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent CONFIG_PATH = BASE_DIR / "config" / "types.yaml" DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)} _CONFIG_CACHE = None def load_yaml_config() -> Dict[str, Any]: global _CONFIG_CACHE if _CONFIG_CACHE is not None: return _CONFIG_CACHE if not CONFIG_PATH.exists(): return {} try: with open(CONFIG_PATH, "r", encoding="utf-8") as f: data = yaml.safe_load(f) _CONFIG_CACHE = data return data except Exception: return {} def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Lädt die Chunking-Strategie basierend auf dem Note-Type. WP-24c v4.2.5: Frontmatter-Override für chunking_profile hat höchste Priorität. Args: note_type: Der Typ der Note (z.B. "decision", "experience") frontmatter: Optionales Frontmatter-Dict mit chunking_profile Override Returns: Dict mit Chunking-Konfiguration """ full_config = load_yaml_config() profiles = full_config.get("chunking_profiles", {}) type_def = full_config.get("types", {}).get(note_type.lower(), {}) # WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults profile_name = None if frontmatter and "chunking_profile" in frontmatter: profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile") if not profile_name: profile_name = type_def.get("chunking_profile") if not profile_name: profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard") config = profiles.get(profile_name, DEFAULT_PROFILE).copy() if "overlap" in config and isinstance(config["overlap"], list): config["overlap"] = tuple(config["overlap"]) return config def estimate_tokens(text: str) -> int: """Grobe Schätzung der Token-Anzahl.""" return max(1, math.ceil(len(text.strip()) / 4)) def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]: """Trennt YAML-Frontmatter vom Text.""" import re fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL) if not fm_match: return {}, md_text try: frontmatter = yaml.safe_load(fm_match.group(1)) if not isinstance(frontmatter, dict): frontmatter = {} except Exception: frontmatter = {} text_without_fm = re.sub(r'^\s*---\s*\n(.*?)\n---', '', md_text, flags=re.DOTALL) return frontmatter, text_without_fm.strip()