# mindnet/app/core/chunking/chunking_utils.py

"""
FILE: app/core/chunking/chunking_utils.py
DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration.
"""
import math
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Tuple, Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory four levels above this file; the config directory is resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent
# Location of the YAML file that load_yaml_config() reads.
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
# Fallback profile used by get_chunk_config() when the selected profile name is not
# defined in the loaded configuration.
# NOTE(review): target/max/overlap are presumably token counts - confirm with the chunker.
DEFAULT_PROFILE: Dict[str, Any] = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}

# Parsed YAML configuration; None until load_yaml_config() has loaded the file successfully.
_CONFIG_CACHE: Optional[Dict[str, Any]] = None

def load_yaml_config() -> Dict[str, Any]:
    """
    Load the YAML configuration from CONFIG_PATH and cache it for later calls.

    The file is parsed at most once per successful load; the parsed mapping is
    kept in the module-level ``_CONFIG_CACHE`` and returned on every later call.
    Failures are never cached, so a missing or broken file is retried next time.

    Returns:
        The parsed configuration mapping. An empty dict is returned when the
        file does not exist, cannot be read or parsed, is empty, or does not
        contain a mapping at its top level. The result is always a dict, so
        callers can use ``.get`` on it unconditionally.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE
    if not CONFIG_PATH.exists():
        return {}

    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception:
        # Best effort: a broken config must not take the caller down, but the
        # failure is logged instead of being swallowed silently.
        logger.warning("Could not load chunking config from %s", CONFIG_PATH, exc_info=True)
        return {}

    if data is None:
        # An empty YAML document parses to None; treat it as an empty config.
        data = {}
    elif not isinstance(data, dict):
        logger.warning(
            "Ignoring chunking config %s: expected a mapping at top level, got %s",
            CONFIG_PATH,
            type(data).__name__,
        )
        return {}

    _CONFIG_CACHE = data
    return data

def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Resolve the chunking configuration for a note.

    The profile name is taken from the first source that provides a non-empty
    value (WP-24c v4.2.5: the frontmatter override has the highest priority):

      1. ``chunking_profile`` in the frontmatter, or its alias ``chunk_profile``
      2. ``chunking_profile`` of the note type's entry under ``types``
      3. ``defaults.chunking_profile`` of the config, else ``"sliding_standard"``

    The name is then looked up under ``chunking_profiles``; if no usable
    profile is found there, ``DEFAULT_PROFILE`` is used.

    Args:
        note_type: Type of the note (e.g. "decision", "experience"); matched
            case-insensitively. A missing/empty value simply matches no type.
        frontmatter: Optional frontmatter dict that may carry the override.

    Returns:
        A shallow copy of the selected profile dict. An ``overlap`` given as a
        list is converted to a tuple.
    """
    full_config = load_yaml_config() or {}

    # `or {}` instead of a .get() default: a key that is present but empty in
    # the YAML file yields None, which would otherwise break the lookups below.
    profiles = full_config.get("chunking_profiles") or {}
    type_key = str(note_type or "").lower()
    type_def = (full_config.get("types") or {}).get(type_key) or {}

    profile_name = None
    if frontmatter:
        # Either key may be used on its own; `chunking_profile` wins if both are set.
        profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile")
    if not profile_name:
        profile_name = type_def.get("chunking_profile")
    if not profile_name:
        profile_name = (full_config.get("defaults") or {}).get("chunking_profile") or "sliding_standard"

    try:
        profile = profiles.get(profile_name)
    except TypeError:
        # Unhashable name, e.g. a YAML list written into the frontmatter.
        profile = None
    if not isinstance(profile, dict):
        profile = DEFAULT_PROFILE

    # Copy so that callers cannot mutate the cached config or the default profile.
    config = profile.copy()
    if isinstance(config.get("overlap"), list):
        config["overlap"] = tuple(config["overlap"])
    return config

def estimate_tokens(text: str) -> int:
    """Return a rough token estimate for ``text``.

    Counts one token per started group of four characters, ignoring leading
    and trailing whitespace. The result is never smaller than 1, even for
    empty or whitespace-only input.
    """
    char_count = len(text.strip())
    # Integer ceiling division by 4.
    estimate = (char_count + 3) // 4
    return estimate if estimate > 1 else 1

def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """
    Split a leading YAML frontmatter block off a Markdown text.

    A frontmatter block is an opening ``---`` line at the very start of the
    text (leading whitespace allowed), followed by YAML, followed by a line
    starting with ``---``. The block may be empty (``---`` directly followed
    by ``---``).

    Args:
        md_text: The full Markdown text.

    Returns:
        ``(frontmatter, body)``. If no frontmatter block is found, the result
        is ``({}, md_text)`` with the text unchanged. Otherwise ``body`` is the
        text after the closing fence with surrounding whitespace stripped, and
        ``frontmatter`` is the parsed mapping, or ``{}`` when the block is
        empty, is not valid YAML, or does not contain a mapping.
    """
    import re

    # The YAML part is optional and tried last (`??`): an empty block closes
    # at the very next `---` line instead of running on to a later `---`
    # (e.g. a horizontal rule) and swallowing body text as "frontmatter".
    fm_match = re.match(r'^\s*---\s*\n(?:(.*?)\n)??---', md_text, re.DOTALL)
    if not fm_match:
        return {}, md_text

    frontmatter: Dict[str, Any] = {}
    raw_yaml = fm_match.group(1)
    if raw_yaml:
        try:
            parsed = yaml.safe_load(raw_yaml)
        except Exception:
            # Deliberately best effort: unparsable frontmatter is treated as absent.
            parsed = None
        if isinstance(parsed, dict):
            frontmatter = parsed

    # The match is anchored at the start, so the body is everything after it.
    return frontmatter, md_text[fm_match.end():].strip()