# mitai-jinkendo/backend/data_layer/correlations.py

"""
Correlation Metrics Data Layer

Provides structured correlation analysis and plateau detection functions.

Functions:
    - calculate_lag_correlation(): Lag correlation between variables
    - calculate_correlation_sleep_recovery(): Sleep-recovery correlation
    - calculate_plateau_detected(): Plateau detection (weight, strength, endurance)
    - calculate_top_drivers(): Top drivers for current goals
    - calculate_correlation_confidence(): Confidence level for correlations

All functions return structured data (dict) or simple values.
Use placeholder_resolver.py for formatted strings for AI.

Phase 0c: Multi-Layer Architecture
Version: 1.0
"""

from typing import Dict, List, Optional, Tuple
from datetime import datetime, timedelta, date
from db import get_db, get_cursor, r2d
import statistics

def calculate_lag_correlation(profile_id: str, var1: str, var2: str, max_lag_days: int = 14) -> Optional[Dict]:
    """
    Calculate lagged correlation between two variables

    Both variable names are matched case-insensitively and with surrounding
    whitespace ignored. Supported pairs:
        - 'energy' / 'energy_balance'  ->  'weight'
        - 'protein'                    ->  'lbm'
        - 'training_load' / 'load'     ->  'hrv' or 'rhr'

    Args:
        profile_id: Profile whose data is analysed
        var1: 'energy', 'protein', 'training_load' (aliases see above)
        var2: 'weight', 'lbm', 'hrv', 'rhr'
        max_lag_days: Maximum lag to test (forwarded to the pair-specific helper)

    Returns:
        None for an unsupported pair; otherwise the helper result passed
        through _normalize_lag_payload:
        {
            'best_lag': X,  # days
            'correlation': 0.XX,  # -1 to 1
            'direction': 'positive'/'negative'/'none',
            'confidence': 'high'/'medium'/'low',
            'data_points': N
        }
    """
    var1_aliases = {
        "energy": "energy",
        "energy_balance": "energy",
        "training_load": "training_load",
        "load": "training_load",
        "protein": "protein",
    }
    v1 = (var1 or "").strip().lower()
    v1n = var1_aliases.get(v1, v1)

    # Normalise var2 the same way as var1 so 'Weight' or ' hrv ' are accepted;
    # anything that is not a string can never match a supported pair.
    v2n = var2.strip().lower() if isinstance(var2, str) else ""

    if v1n == 'energy' and v2n == 'weight':
        return _normalize_lag_payload(_correlate_energy_weight(profile_id, max_lag_days))
    if v1n == 'protein' and v2n == 'lbm':
        return _normalize_lag_payload(_correlate_protein_lbm(profile_id, max_lag_days))
    if v1n == 'training_load' and v2n in ('hrv', 'rhr'):
        return _normalize_lag_payload(_correlate_load_vitals(profile_id, v2n, max_lag_days))
    return None


def _normalize_lag_payload(raw: Optional[Dict]) -> Optional[Dict]:
    """Charts erwarten u. a. ``best_lag_days``; Layer liefert teils ``best_lag``."""
    if not raw:
        return None
    out = dict(raw)
    if out.get("best_lag_days") is None and out.get("best_lag") is not None:
        out["best_lag_days"] = out["best_lag"]
    return out


def _correlate_energy_weight(profile_id: str, max_lag: int) -> Optional[Dict]:
    """
    Energy balance vs. weight change (placeholder implementation)

    Reads the profile's nutrition_log rows of the last 90 days together with
    the same-day weight_log weight (where one exists). With fewer than 30
    rows an 'insufficient data' payload is returned; otherwise a fixed
    placeholder result is returned whose only computed field is
    ``data_points``. ``max_lag`` is not used yet.

    Intended lags once implemented: 0, 3, 7, 10, 14 days
    """
    with get_db() as conn:
        cur = get_cursor(conn)

        # Daily kcal per nutrition entry, weight joined on the same date
        cur.execute("""
            SELECT n.date, n.kcal, w.weight
            FROM nutrition_log n
            LEFT JOIN weight_log w ON w.profile_id = n.profile_id
                AND w.date = n.date
            WHERE n.profile_id = %s
              AND n.date >= CURRENT_DATE - INTERVAL '90 days'
            ORDER BY n.date
        """, (profile_id,))

        rows = cur.fetchall()

    sample_size = len(rows)

    if sample_size >= 30:
        # A real implementation needs a TDEE estimate and a 7d rolling
        # energy balance; until then the values below are placeholders.
        return {
            'best_lag': 7,
            'correlation': -0.45,  # Placeholder
            'direction': 'negative',  # Higher deficit = lower weight (expected)
            'confidence': 'medium',
            'data_points': sample_size
        }

    return {
        'best_lag': None,
        'correlation': None,
        'direction': 'none',
        'confidence': 'low',
        'data_points': sample_size,
        'reason': 'Insufficient data (<30 days)'
    }


def _correlate_protein_lbm(profile_id: str, max_lag: int) -> Optional[Dict]:
    """Correlate protein intake with LBM trend"""
    # TODO: Implement full correlation calculation
    return {
        'best_lag': 0,
        'correlation': 0.32,  # Placeholder
        'direction': 'positive',
        'confidence': 'medium',
        'data_points': 28
    }


def _correlate_load_vitals(profile_id: str, vital: str, max_lag: int) -> Optional[Dict]:
    """
    Correlate training load with HRV or RHR
    Test lags: 1, 2, 3 days
    """
    # TODO: Implement full correlation calculation
    if vital == 'hrv':
        return {
            'best_lag': 1,
            'correlation': -0.38,  # Negative = high load reduces HRV (expected)
            'direction': 'negative',
            'confidence': 'medium',
            'data_points': 25
        }
    else:  # rhr
        return {
            'best_lag': 1,
            'correlation': 0.42,  # Positive = high load increases RHR (expected)
            'direction': 'positive',
            'confidence': 'medium',
            'data_points': 25
        }


# ============================================================================
# C4: Sleep vs. Recovery Correlation
# ============================================================================

def calculate_correlation_sleep_recovery(profile_id: str) -> Optional[Dict]:
    """
    Sleep quality/duration vs. recovery score (placeholder)

    No data is read yet; the same fixed result is returned for every profile.
    """
    # TODO: Implement full correlation
    result: Dict = {}
    result['correlation'] = 0.65  # Strong positive (expected)
    result['direction'] = 'positive'
    result['confidence'] = 'high'
    result['data_points'] = 28
    return result


# ============================================================================
# C6: Plateau Detector
# ============================================================================

def calculate_plateau_detected(profile_id: str) -> Optional[Dict]:
    """
    Run the plateau detector that matches the user's highest-weighted focus area

    Focus area -> detector:
        'körpergewicht' / 'körperfett' -> weight plateau
        'kraftaufbau'                  -> strength plateau
        'cardio'                       -> endurance plateau

    Returns:
        None when no focus weights exist or the top focus area has no
        detector. Otherwise the detector's dict, which in the positive case
        looks like
        {
            'plateau_detected': True,
            'plateau_type': 'weight_loss'/'strength'/'endurance',
            'confidence': 'high'/'medium'/'low',
            'duration_days': X,
            'top_factors': [list of potential causes]
        }
        and in the negative case carries 'plateau_detected': False
        (optionally with a 'reason').
    """
    from data_layer.scores import get_user_focus_weights

    weights = get_user_focus_weights(profile_id)
    if not weights:
        return None

    # Primary focus = key with the largest weight (first one wins on ties)
    primary = max(weights, key=lambda area: weights[area])

    if primary == 'kraftaufbau':
        return _detect_strength_plateau(profile_id)
    if primary == 'cardio':
        return _detect_endurance_plateau(profile_id)
    if primary in ('körpergewicht', 'körperfett'):
        return _detect_weight_plateau(profile_id)
    return None


def _detect_weight_plateau(profile_id: str) -> Dict:
    """
    Detect weight loss plateau

    A plateau is reported when the absolute 28-day weight slope is below
    0.02 while the nutrition score is above 70 (flat weight despite
    adherence). Possible causes are then collected from the nutrition score,
    the 7-day energy balance and the 28-day waist delta.

    Returns:
        {'plateau_detected': False, 'reason': 'Insufficient data'} when no
        slope is available, {'plateau_detected': False} when the plateau
        condition is not met, otherwise
        {
            'plateau_detected': True,
            'plateau_type': 'weight_loss',
            'confidence': 'high' (>= 2 factors) / 'medium',
            'duration_days': 28,
            'top_factors': [up to 3 German explanation strings]
        }
    """
    from data_layer.body_metrics import calculate_weight_28d_slope
    from data_layer.nutrition_metrics import calculate_nutrition_score

    slope = calculate_weight_28d_slope(profile_id)
    nutrition_score = calculate_nutrition_score(profile_id)

    if slope is None:
        return {'plateau_detected': False, 'reason': 'Insufficient data'}

    # Plateau = flat weight for 28 days despite adherence
    is_plateau = (
        abs(slope) < 0.02
        and nutrition_score is not None
        and nutrition_score > 70
    )
    if not is_plateau:
        return {'plateau_detected': False}

    factors = []

    # Check potential factors
    if nutrition_score > 85:
        factors.append('Hohe Adhärenz trotz Stagnation → mögliche Anpassung des Stoffwechsels')

    # Check if deficit is too small. Compare against None explicitly: a
    # balance of exactly 0 is valid data and is the clearest "no deficit" case.
    from data_layer.nutrition_metrics import calculate_energy_balance_7d
    balance = calculate_energy_balance_7d(profile_id)
    if balance is not None and balance > -200:
        factors.append('Energiedefizit zu gering (<200 kcal/Tag)')

    # Check water retention (if waist is shrinking but weight stable)
    from data_layer.body_metrics import calculate_waist_28d_delta
    waist_delta = calculate_waist_28d_delta(profile_id)
    if waist_delta is not None and waist_delta < -1:
        factors.append('Taillenumfang sinkt → mögliche Wasserretention maskiert Fettabbau')

    return {
        'plateau_detected': True,
        'plateau_type': 'weight_loss',
        'confidence': 'high' if len(factors) >= 2 else 'medium',
        'duration_days': 28,
        'top_factors': factors[:3]
    }


def _detect_strength_plateau(profile_id: str) -> Dict:
    """
    Detect strength training plateau

    A plateau is reported when the absolute 28-day LBM change is below 0.3
    while the activity score is above 75 (flat LBM despite high activity).
    Possible causes are then collected from the recovery score, the 28-day
    protein adequacy and the training monotony score.

    Returns:
        {'plateau_detected': False, 'reason': 'Insufficient data'} when no
        LBM change is available, {'plateau_detected': False} when the
        plateau condition is not met, otherwise
        {
            'plateau_detected': True,
            'plateau_type': 'strength',
            'confidence': 'medium',
            'duration_days': 28,
            'top_factors': [up to 3 German explanation strings]
        }
    """
    from data_layer.body_metrics import calculate_lbm_28d_change
    from data_layer.activity_metrics import calculate_activity_score
    from data_layer.recovery_metrics import calculate_recovery_score_v2

    lbm_change = calculate_lbm_28d_change(profile_id)
    activity_score = calculate_activity_score(profile_id)
    recovery_score = calculate_recovery_score_v2(profile_id)

    if lbm_change is None:
        return {'plateau_detected': False, 'reason': 'Insufficient data'}

    # Plateau = flat LBM despite high activity score
    is_plateau = (
        abs(lbm_change) < 0.3
        and activity_score is not None
        and activity_score > 75
    )
    if not is_plateau:
        return {'plateau_detected': False}

    factors = []

    # Scores are compared against None explicitly: a score of 0 is the worst
    # possible value and must not be mistaken for "no data".
    if recovery_score is not None and recovery_score < 60:
        factors.append('Recovery Score niedrig → möglicherweise Übertraining')

    from data_layer.nutrition_metrics import calculate_protein_adequacy_28d
    protein_score = calculate_protein_adequacy_28d(profile_id)
    if protein_score is not None and protein_score < 70:
        factors.append('Proteinzufuhr unter Zielbereich')

    from data_layer.activity_metrics import calculate_monotony_score
    monotony = calculate_monotony_score(profile_id)
    if monotony is not None and monotony > 2.0:
        factors.append('Hohe Trainingsmonotonie → Stimulus-Anpassung')

    return {
        'plateau_detected': True,
        'plateau_type': 'strength',
        'confidence': 'medium',
        'duration_days': 28,
        'top_factors': factors[:3]
    }


def _detect_endurance_plateau(profile_id: str) -> Dict:
    """Detect endurance plateau"""
    from data_layer.activity_metrics import calculate_training_minutes_week, calculate_monotony_score
    from data_layer.recovery_metrics import calculate_vo2max_trend_28d

    # TODO: Implement when vitals_baseline.vo2_max is populated
    return {'plateau_detected': False, 'reason': 'VO2max tracking not yet implemented'}


# ============================================================================
# C7: Multi-Factor Driver Panel
# ============================================================================

def calculate_top_drivers(profile_id: str) -> Optional[List[Dict]]:
    """
    Calculate top influencing factors for goal progress

    Eight metrics are evaluated with fixed thresholds: energy balance,
    protein adequacy, sleep duration, sleep regularity, training frequency,
    session quality, recovery score and rest day compliance. A metric whose
    helper returns None is skipped.

    Returns list of drivers (at most 8), ordered 'hinderlich' first, then
    'förderlich', then 'neutral'; within a status the evaluation order above
    is kept:
    [
        {
            'factor': 'Energiebilanz',
            'status': 'förderlich'/'neutral'/'hinderlich',
            'evidence': 'hoch'/'mittel'/'niedrig',
            'reason': '1-sentence explanation'
        },
        ...
    ]
    """
    drivers = []

    # 1. Energy balance (negative = deficit)
    from data_layer.nutrition_metrics import calculate_energy_balance_7d
    balance = calculate_energy_balance_7d(profile_id)
    if balance is not None:
        if balance < -800:
            status = 'hinderlich'
            reason = f'Sehr großes Defizit ({int(balance)} kcal/Tag) → Risiko für Magermasseverlust'
        elif balance < -500:
            # Deficit between 500 and 800 kcal: must not fall through to the
            # surplus branch below.
            status = 'neutral'
            reason = f'Großes Defizit ({int(balance)} kcal/Tag) → Magermasse im Blick behalten'
        elif balance <= -200:
            status = 'förderlich'
            reason = f'Moderates Defizit ({int(balance)} kcal/Tag) unterstützt Fettabbau'
        elif balance < 200:
            status = 'neutral'
            reason = 'Energiebilanz ausgeglichen'
        else:
            status = 'neutral'
            reason = f'Energieüberschuss ({int(balance)} kcal/Tag)'

        drivers.append({
            'factor': 'Energiebilanz',
            'status': status,
            'evidence': 'hoch',
            'reason': reason
        })

    # 2. Protein adequacy
    from data_layer.nutrition_metrics import calculate_protein_adequacy_28d
    protein_score = calculate_protein_adequacy_28d(profile_id)
    if protein_score is not None:
        if protein_score >= 80:
            status = 'förderlich'
            reason = f'Proteinzufuhr konstant im Zielbereich (Score: {protein_score})'
        elif protein_score >= 60:
            status = 'neutral'
            reason = f'Proteinzufuhr teilweise im Zielbereich (Score: {protein_score})'
        else:
            status = 'hinderlich'
            reason = f'Proteinzufuhr häufig unter Zielbereich (Score: {protein_score})'

        drivers.append({
            'factor': 'Proteinzufuhr',
            'status': status,
            'evidence': 'hoch',
            'reason': reason
        })

    # 3. Sleep duration
    from data_layer.recovery_metrics import calculate_sleep_avg_duration_7d
    sleep_hours = calculate_sleep_avg_duration_7d(profile_id)
    if sleep_hours is not None:
        if sleep_hours >= 7:
            status = 'förderlich'
            reason = f'Schlafdauer ausreichend ({sleep_hours:.1f}h/Nacht)'
        elif sleep_hours >= 6.5:
            status = 'neutral'
            reason = f'Schlafdauer knapp ausreichend ({sleep_hours:.1f}h/Nacht)'
        else:
            status = 'hinderlich'
            reason = f'Schlafdauer zu gering ({sleep_hours:.1f}h/Nacht < 7h Empfehlung)'

        drivers.append({
            'factor': 'Schlafdauer',
            'status': status,
            'evidence': 'hoch',
            'reason': reason
        })

    # 4. Sleep regularity (lower deviation = more regular)
    from data_layer.recovery_metrics import calculate_sleep_regularity_proxy
    regularity = calculate_sleep_regularity_proxy(profile_id)
    if regularity is not None:
        if regularity <= 45:
            status = 'förderlich'
            reason = f'Schlafrhythmus regelmäßig (Abweichung: {int(regularity)} min)'
        elif regularity <= 75:
            status = 'neutral'
            reason = f'Schlafrhythmus moderat variabel (Abweichung: {int(regularity)} min)'
        else:
            status = 'hinderlich'
            reason = f'Schlafrhythmus stark variabel (Abweichung: {int(regularity)} min)'

        drivers.append({
            'factor': 'Schlafregelmäßigkeit',
            'status': status,
            'evidence': 'mittel',
            'reason': reason
        })

    # 5. Training consistency
    from data_layer.activity_metrics import calculate_training_frequency_7d
    frequency = calculate_training_frequency_7d(profile_id)
    if frequency is not None:
        # '< 3' instead of '<= 2' so a non-integer value such as 2.5 is not
        # misreported as "very high".
        if frequency < 3:
            status = 'hinderlich'
            reason = f'Trainingsfrequenz zu niedrig ({frequency}× pro Woche)'
        elif frequency <= 6:
            status = 'förderlich'
            reason = f'Trainingsfrequenz im Zielbereich ({frequency}× pro Woche)'
        else:
            status = 'neutral'
            reason = f'Trainingsfrequenz sehr hoch ({frequency}× pro Woche) → Recovery beachten'

        drivers.append({
            'factor': 'Trainingskonsistenz',
            'status': status,
            'evidence': 'hoch',
            'reason': reason
        })

    # 6. Quality sessions
    from data_layer.activity_metrics import calculate_quality_sessions_pct
    quality_pct = calculate_quality_sessions_pct(profile_id)
    if quality_pct is not None:
        if quality_pct >= 75:
            status = 'förderlich'
            reason = f'{quality_pct}% der Trainings mit guter Qualität'
        elif quality_pct >= 50:
            status = 'neutral'
            reason = f'{quality_pct}% der Trainings mit guter Qualität'
        else:
            status = 'hinderlich'
            reason = f'Nur {quality_pct}% der Trainings mit guter Qualität'

        drivers.append({
            'factor': 'Trainingsqualität',
            'status': status,
            'evidence': 'mittel',
            'reason': reason
        })

    # 7. Recovery score
    from data_layer.recovery_metrics import calculate_recovery_score_v2
    recovery = calculate_recovery_score_v2(profile_id)
    if recovery is not None:
        if recovery >= 70:
            status = 'förderlich'
            reason = f'Recovery Score gut ({recovery}/100)'
        elif recovery >= 50:
            status = 'neutral'
            reason = f'Recovery Score moderat ({recovery}/100)'
        else:
            status = 'hinderlich'
            reason = f'Recovery Score niedrig ({recovery}/100) → mehr Erholung nötig'

        drivers.append({
            'factor': 'Recovery',
            'status': status,
            'evidence': 'hoch',
            'reason': reason
        })

    # 8. Rest day compliance
    from data_layer.activity_metrics import calculate_rest_day_compliance
    compliance = calculate_rest_day_compliance(profile_id)
    if compliance is not None:
        if compliance >= 80:
            status = 'förderlich'
            reason = f'Ruhetage gut eingehalten ({compliance}%)'
        elif compliance >= 60:
            status = 'neutral'
            reason = f'Ruhetage teilweise eingehalten ({compliance}%)'
        else:
            status = 'hinderlich'
            reason = f'Ruhetage häufig ignoriert ({compliance}%) → Übertrainingsrisiko'

        drivers.append({
            'factor': 'Ruhetagsrespekt',
            'status': status,
            'evidence': 'mittel',
            'reason': reason
        })

    # Sort by importance: hinderlich first, then förderlich, then neutral.
    # list.sort is stable, so the evaluation order is kept within a status.
    priority = {'hinderlich': 0, 'förderlich': 1, 'neutral': 2}
    drivers.sort(key=lambda d: priority[d['status']])

    return drivers[:8]  # Top 8 drivers


# ============================================================================
# Confidence/Evidence Levels
# ============================================================================

def calculate_correlation_confidence(data_points: int, correlation: Optional[float]) -> str:
    """
    Determine confidence level for correlation

    Rules (the sign of the correlation is ignored):
        - 'high':   at least 40 data points and |correlation| >= 0.5
        - 'medium': at least 30 data points and |correlation| >= 0.4
        - 'low':    everything else, including fewer than 20 data points

    Args:
        data_points: Number of observations behind the correlation
        correlation: Correlation coefficient; None (no correlation could be
            computed, e.g. an 'insufficient data' payload) is rated 'low'

    Returns: 'high', 'medium', or 'low'
    """
    # A missing value cannot support any confidence claim; also avoids
    # abs(None) raising TypeError.
    if data_points is None or correlation is None:
        return 'low'

    # Need sufficient data points
    if data_points < 20:
        return 'low'

    strength = abs(correlation)

    # Strong correlation with good data
    if data_points >= 40 and strength >= 0.5:
        return 'high'
    if data_points >= 30 and strength >= 0.4:
        return 'medium'
    return 'low'