74 lines
2.8 KiB
Python
74 lines
2.8 KiB
Python
"""
FILE: app/core/chunking/chunking_utils.py
DESCRIPTION: Helper utilities for token estimation and YAML configuration.
"""
|
|
import math
|
|
import yaml
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Tuple, Optional
|
|
|
|
logger = logging.getLogger(__name__)

# Directory four levels above this file (presumably the project root, given
# the module lives at app/core/chunking/ -- confirm against the repo layout).
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent

# Location of the YAML file that load_yaml_config() reads.
CONFIG_PATH = BASE_DIR.joinpath("config", "types.yaml")

# Fallback chunking profile used when the requested profile name is not
# present in the YAML configuration.
DEFAULT_PROFILE = {
    "strategy": "sliding_window",
    "target": 400,
    "max": 600,
    "overlap": (50, 80),
}

# Parsed YAML configuration; None until the first successful load.
_CONFIG_CACHE: Optional[Dict[str, Any]] = None
|
|
|
|
def load_yaml_config() -> Dict[str, Any]:
    """Load the YAML configuration at ``CONFIG_PATH``, caching the result.

    After the first successful read the parsed mapping is kept in the
    module-level ``_CONFIG_CACHE`` and returned on later calls without
    touching the file again. A missing file or a read/parse failure is NOT
    cached, so the next call retries.

    Returns:
        The parsed configuration mapping. An empty dict is returned when the
        file is missing, cannot be read or parsed, is empty, or does not
        contain a mapping at the top level.
    """
    global _CONFIG_CACHE

    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE
    if not CONFIG_PATH.exists():
        return {}

    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception:
        # Best-effort loader: callers fall back to defaults, but the failure
        # is recorded instead of being swallowed silently.
        logger.exception("Failed to load chunking config from %s", CONFIG_PATH)
        return {}

    if not isinstance(data, dict):
        # safe_load yields None for an empty document and may yield a list or
        # scalar; callers rely on dict methods, so normalise to an empty dict.
        if data is not None:
            logger.warning(
                "Ignoring %s: expected a mapping at top level, got %s",
                CONFIG_PATH,
                type(data).__name__,
            )
        data = {}

    _CONFIG_CACHE = data
    return data
|
|
|
|
def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Resolve the chunking configuration for a note.

    The profile name is looked up in this order (WP-24c v4.2.5: the
    frontmatter override has the highest priority):

    1. ``chunking_profile`` (or its alias ``chunk_profile``) in ``frontmatter``
    2. ``chunking_profile`` of the note type's entry under ``types``
    3. ``chunking_profile`` under ``defaults``, else ``"sliding_standard"``

    If the resolved name has no usable entry under ``chunking_profiles``,
    ``DEFAULT_PROFILE`` is used.

    Args:
        note_type: Type of the note (e.g. "decision", "experience"); matched
            case-insensitively against the keys under ``types``.
        frontmatter: Optional frontmatter dict that may carry a profile
            override.

    Returns:
        A shallow copy of the selected profile dict. An ``overlap`` value
        given as a list is converted to a tuple.
    """
    # ``or {}`` guards against sections that are present but empty in the
    # YAML (parsed as None), which would otherwise break the ``.get`` chain.
    full_config = load_yaml_config() or {}
    profiles = full_config.get("chunking_profiles") or {}
    type_defs = full_config.get("types") or {}
    type_def = type_defs.get((note_type or "").lower()) or {}

    # Priority: frontmatter > type definition > defaults
    profile_name = None
    if frontmatter:
        # The alias is consulted even when the primary key is absent.
        profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile")
    if not profile_name:
        profile_name = type_def.get("chunking_profile")
    if not profile_name:
        defaults = full_config.get("defaults") or {}
        profile_name = defaults.get("chunking_profile", "sliding_standard")

    profile = profiles.get(profile_name)
    if not isinstance(profile, dict):
        # Unknown profile name or a malformed entry: use the built-in default.
        profile = DEFAULT_PROFILE

    # Copy so callers cannot mutate the cached config or DEFAULT_PROFILE.
    config = dict(profile)
    if isinstance(config.get("overlap"), list):
        config["overlap"] = tuple(config["overlap"])
    return config
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Return a rough token count for ``text``.

    Heuristic: one token per four characters of the whitespace-stripped
    text, rounded up. The result is never less than 1, even for empty or
    whitespace-only input.
    """
    stripped_length = len(text.strip())
    approx = math.ceil(stripped_length / 4)
    return approx if approx > 1 else 1
|
|
|
|
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Split a leading YAML frontmatter block off a Markdown text.

    A frontmatter block is an opening ``---`` line (optionally preceded by
    whitespace), followed by content, followed by a line starting with
    ``---``.

    Args:
        md_text: The full Markdown text.

    Returns:
        A ``(frontmatter, body)`` tuple. When no frontmatter block is found,
        ``({}, md_text)`` is returned with the text unchanged. Otherwise
        ``body`` is the text after the block, stripped of surrounding
        whitespace, and ``frontmatter`` is the parsed mapping, or ``{}`` if
        the block cannot be parsed or does not parse to a dict.
    """
    import re

    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if fm_match is None:
        return {}, md_text

    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
    except Exception:
        # Best-effort: a broken frontmatter must not prevent the body from
        # being processed, but the failure is logged rather than hidden.
        logger.warning("Could not parse YAML frontmatter; ignoring it", exc_info=True)
        frontmatter = {}
    if not isinstance(frontmatter, dict):
        frontmatter = {}

    # The pattern is anchored at the start of the string, so everything after
    # the match is the body; no second regex pass is needed.
    body = md_text[fm_match.end():]
    return frontmatter, body.strip()