mitai-jinkendo/backend/data_layer/utils.py
Lars ffa99f10fb
All checks were successful
Deploy Development / deploy (push) Successful in 54s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 14s
fix: correct confidence thresholds for 30-89 day range
Bug: 30 days with 29 data points returned 'insufficient' because
it fell into the 90+ day branch which requires >= 30 data points.

Fix: Changed condition from 'days_requested <= 28' to 'days_requested < 90'
so that 8-89 day ranges use the medium-term thresholds:
- high >= 18 data points
- medium >= 12
- low >= 8

This means 30 days with 29 entries now returns 'high' confidence.

Affects: nutrition_avg, and all other medium-term metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 21:03:22 +01:00

243 lines
6.3 KiB
Python

"""
Data Layer Utilities
Shared helper functions for all data layer modules.
Functions:
- calculate_confidence(): Determine data quality confidence level
- serialize_dates(): Convert Python date objects to ISO strings for JSON
- safe_float(): Safe conversion from Decimal/None to float
- safe_int(): Safe conversion to int
Phase 0c: Multi-Layer Architecture
Version: 1.0
"""
from typing import Any, Dict, List, Optional
from datetime import date
from decimal import Decimal
def calculate_confidence(
    data_points: int,
    days_requested: int,
    metric_type: str = "general"
) -> str:
    """
    Calculate confidence level based on data availability.
    Args:
        data_points: Number of actual data points available
        days_requested: Number of days in analysis window
        metric_type: Type of metric ("general", "correlation", "trend")
    Returns:
        Confidence level: "high" | "medium" | "low" | "insufficient"
    Confidence Rules:
        General (default):
            - 7d: high >= 4, medium >= 3, low >= 2
            - 8-89d: high >= 18, medium >= 12, low >= 8
            - 90d+: high >= 60, medium >= 40, low >= 30
        Correlation:
            - high >= 28, medium >= 21, low >= 14
        Trend:
            - high >= 70% of days, medium >= 50%, low >= 30%
    Example:
        >>> calculate_confidence(20, 28, "general")
        'high'
        >>> calculate_confidence(10, 28, "general")
        'low'
    """
    # No data at all is always insufficient, regardless of metric type.
    if data_points == 0:
        return "insufficient"

    if metric_type == "correlation":
        # Correlation needs more paired data points than other metrics.
        correlation_bands = ((28, "high"), (21, "medium"), (14, "low"))
        for minimum, level in correlation_bands:
            if data_points >= minimum:
                return level
        return "insufficient"

    if metric_type == "trend":
        # Trend quality is judged by the fraction of days actually covered.
        coverage = data_points / days_requested if days_requested > 0 else 0
        trend_bands = ((0.70, "high"), (0.50, "medium"), (0.30, "low"))
        for minimum, level in trend_bands:
            if coverage >= minimum:
                return level
        return "insufficient"

    # "general": absolute thresholds scale with the requested window size.
    if days_requested <= 7:
        bands = ((4, "high"), (3, "medium"), (2, "low"))
    elif days_requested < 90:
        # 8-89 days: medium-term analysis
        bands = ((18, "high"), (12, "medium"), (8, "low"))
    else:
        # 90+ days: long-term analysis
        bands = ((60, "high"), (40, "medium"), (30, "low"))

    for minimum, level in bands:
        if data_points >= minimum:
            return level
    return "insufficient"
def serialize_dates(data: Any) -> Any:
    """
    Convert Python date objects to ISO strings for JSON serialization.
    Recursively walks through dicts, lists, and tuples converting date objects.
    Args:
        data: Any data structure (dict, list, tuple, or primitive)
    Returns:
        Same structure with dates converted to ISO strings
    Example:
        >>> serialize_dates({"date": date(2026, 3, 28), "value": 85.0})
        {"date": "2026-03-28", "value": 85.0}
    """
    # Dates are leaves: convert directly to "YYYY-MM-DD".
    if isinstance(data, date):
        return data.isoformat()
    # Containers recurse, preserving their concrete type.
    if isinstance(data, dict):
        return {key: serialize_dates(val) for key, val in data.items()}
    if isinstance(data, (list, tuple)):
        converted = [serialize_dates(item) for item in data]
        return converted if isinstance(data, list) else tuple(converted)
    # Any other value passes through unchanged.
    return data
def safe_float(value: Any, default: float = 0.0) -> float:
    """
    Safely convert value to float.
    Handles Decimal, None, and invalid values.
    Args:
        value: Value to convert (can be Decimal, int, float, str, None)
        default: Default value if conversion fails
    Returns:
        Float value or default
    Example:
        >>> safe_float(Decimal('85.5'))
        85.5
        >>> safe_float(None)
        0.0
        >>> safe_float(None, -1.0)
        -1.0
    """
    if value is None:
        return default
    try:
        # float() accepts Decimal (and int/str) directly, so no special
        # isinstance branch is needed.
        return float(value)
    except (ValueError, TypeError):
        return default
def safe_int(value: Any, default: int = 0) -> int:
    """
    Safely convert value to int.
    Handles Decimal, None, and invalid values.
    Args:
        value: Value to convert
        default: Default value if conversion fails
    Returns:
        Int value or default
    Example:
        >>> safe_int(Decimal('42'))
        42
        >>> safe_int(None)
        0
    """
    if value is None:
        return default
    try:
        # int() accepts Decimal directly (truncating toward zero), so the
        # former isinstance(value, Decimal) branch was redundant.
        return int(value)
    except (ValueError, TypeError):
        return default
def calculate_baseline(
    values: List[float],
    method: str = "median"
) -> float:
    """
    Calculate baseline value from a list of measurements.
    Args:
        values: List of numeric values
        method: "median" (default) | "mean" | "trimmed_mean"
    Returns:
        Baseline value
    Example:
        >>> calculate_baseline([85.0, 84.5, 86.0, 84.8, 85.2])
        85.0
    """
    import statistics

    # An empty series has no meaningful baseline.
    if not values:
        return 0.0

    if method == "mean":
        return statistics.mean(values)

    if method == "trimmed_mean":
        # Trim the top and bottom 10%; too few points to trim -> plain mean.
        if len(values) < 10:
            return statistics.mean(values)
        cut = len(values) // 10
        ordered = sorted(values)
        core = ordered[cut:-cut] if cut > 0 else ordered
        return statistics.mean(core) if core else 0.0

    # "median" — also the fallback for any unrecognized method name.
    return statistics.median(values)