# mindnet/app/core/chunking/chunking_utils.py
#
# 55 lines
# 2.1 KiB
# Python

"""
FILE: app/core/chunking/chunking_utils.py
DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration.
"""
import math
import yaml
import logging
from pathlib import Path
from typing import Dict, Any, Tuple
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Directory four levels above this file; the configuration directory is
# resolved relative to it.
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent

# Location of the YAML file holding type definitions and chunking profiles.
CONFIG_PATH = BASE_DIR.joinpath("config", "types.yaml")

# Profile used when the configured profile name is not found in the YAML file.
DEFAULT_PROFILE: Dict[str, Any] = {
    "strategy": "sliding_window",
    "target": 400,
    "max": 600,
    "overlap": (50, 80),
}

# Parsed configuration, filled in by the first successful load.
_CONFIG_CACHE = None
def load_yaml_config() -> Dict[str, Any]:
    """Load the YAML configuration from ``CONFIG_PATH`` and cache it.

    The parsed document is stored in the module-level ``_CONFIG_CACHE`` so the
    file is read at most once per successful load.

    Returns:
        The parsed configuration as a dict. An empty dict is returned when the
        file does not exist, cannot be read or parsed, or when its top-level
        content is not a mapping (for example an empty file, which YAML parses
        to ``None``).
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE
    if not CONFIG_PATH.exists():
        return {}

    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception as exc:
        # Best-effort loader: callers fall back to defaults, but the failure
        # is logged instead of being dropped silently. Not cached, so a later
        # call can retry.
        logger.warning("Could not load config %s: %s", CONFIG_PATH, exc)
        return {}

    if not isinstance(data, dict):
        # An empty file yields None and a list/scalar document is not a usable
        # configuration; callers rely on dict methods such as ``.get``.
        logger.warning(
            "Config %s does not contain a mapping at top level; ignoring it",
            CONFIG_PATH,
        )
        data = {}

    _CONFIG_CACHE = data
    return data
def _mapping_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise a new empty dict."""
    return value if isinstance(value, dict) else {}


def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """Load the chunking strategy for the given note type.

    The profile name is taken from ``types.<note_type>.chunking_profile`` in
    the YAML configuration, then from ``defaults.chunking_profile``, and
    finally falls back to ``"sliding_standard"``. If no usable profile with
    that name exists under ``chunking_profiles``, ``DEFAULT_PROFILE`` is used.

    Args:
        note_type: Note type name; it is lower-cased before the lookup. A
            falsy value (``None`` or ``""``) resolves via the defaults.

    Returns:
        A shallow copy of the selected profile. An ``overlap`` value given as
        a list is converted to a tuple.
    """
    # Sections written as a bare key in YAML (e.g. ``types:``) parse to None,
    # so every section is normalised to a dict before it is queried.
    full_config = _mapping_or_empty(load_yaml_config())
    profiles = _mapping_or_empty(full_config.get("chunking_profiles"))
    types = _mapping_or_empty(full_config.get("types"))
    defaults = _mapping_or_empty(full_config.get("defaults"))

    type_def = _mapping_or_empty(types.get((note_type or "").lower()))
    profile_name = type_def.get("chunking_profile") or defaults.get(
        "chunking_profile", "sliding_standard"
    )

    profile = profiles.get(profile_name)
    if not isinstance(profile, dict):
        # Unknown profile name or a null/malformed profile entry.
        profile = DEFAULT_PROFILE

    # Copy so callers cannot mutate the cached config or DEFAULT_PROFILE.
    config = profile.copy()
    if isinstance(config.get("overlap"), list):
        config["overlap"] = tuple(config["overlap"])
    return config
def estimate_tokens(text: str) -> int:
    """Return a rough token count for *text*.

    The estimate is one token per four characters of the whitespace-stripped
    text, rounded up, and never less than 1.
    """
    stripped_length = len(text.strip())
    full_groups, leftover = divmod(stripped_length, 4)
    # A partial group of characters still counts as one token (ceiling).
    estimate = full_groups + 1 if leftover else full_groups
    return estimate if estimate > 1 else 1
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Split YAML front matter from the remaining text.

    A front matter block starts with ``---`` at the beginning of the text
    (leading whitespace allowed) and ends at the next line that starts with
    ``---``. An empty block (``---`` directly followed by ``---``) is also
    recognised.

    Args:
        md_text: The full text, possibly starting with a front matter block.

    Returns:
        A ``(frontmatter, body)`` tuple. Without a front matter block the
        result is ``({}, md_text)`` with the text unchanged. Otherwise the
        block is removed and the rest is returned stripped; ``frontmatter`` is
        the parsed mapping, or ``{}`` when the YAML cannot be parsed or is not
        a mapping.
    """
    import re

    # The lazy optional group (``??``) lets the closing ``---`` follow the
    # opening line directly, so an empty front matter block is matched too.
    fm_match = re.match(r'^\s*---\s*\n(?:(.*?)\n)??---', md_text, re.DOTALL)
    if not fm_match:
        return {}, md_text

    # group(1) is None when the block is empty.
    yaml_source = fm_match.group(1) or ""
    try:
        frontmatter = yaml.safe_load(yaml_source)
    except Exception:
        # Best effort: unparsable front matter is treated as "no metadata";
        # the block is still removed from the body below.
        frontmatter = {}
    if not isinstance(frontmatter, dict):
        frontmatter = {}

    # The pattern is anchored at the start of the text, so everything after
    # the match is the body; no second regex pass is needed.
    return frontmatter, md_text[fm_match.end():].strip()