From 3106ebedae37a8545c35fe93bb1067ae934d64e3 Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 21 Apr 2026 08:03:43 +0200 Subject: [PATCH] feat: enhance lag correlation calculations and chart metadata - Updated `calculate_lag_correlation` to include detailed interpretations and lag details for energy balance vs. weight change, protein vs. lean mass, and load vs. vital metrics. - Improved handling of insufficient data scenarios in correlation charts, providing clearer messages and metadata for user insights. - Refactored chart functions to utilize best lag values and correlation data more effectively, enhancing the visualization of relationships between metrics. --- backend/data_layer/correlations.py | 401 ++++++++++++++++++++++++----- backend/routers/charts.py | 100 ++++--- 2 files changed, 400 insertions(+), 101 deletions(-) diff --git a/backend/data_layer/correlations.py b/backend/data_layer/correlations.py index 5ab2ac2..a0ed2fc 100644 --- a/backend/data_layer/correlations.py +++ b/backend/data_layer/correlations.py @@ -17,28 +17,29 @@ Phase 0c: Multi-Layer Architecture Version: 1.0 """ -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple + from datetime import datetime, timedelta, date from db import get_db, get_cursor, r2d import statistics +from data_layer.nutrition_body_merge import build_merged_daily_nutrition_body_rows +from data_layer.nutrition_metrics import estimate_tdee_kcal_from_latest_weight + +# Lag-Korrelation (Issue #53): gleiche TDEE-Logik wie nutrition_metrics / nutrition_viz +MIN_PAIRS_LAG_CORR = 15 +LAG_CORR_LOOKBACK_DAYS = 120 + def calculate_lag_correlation(profile_id: str, var1: str, var2: str, max_lag_days: int = 14) -> Optional[Dict]: """ - Calculate lagged correlation between two variables + Pearson-Korrelation mit Lag-Sweep (Issue 53, Data-Layer). 
- Args: - var1: 'energy', 'protein', 'training_load' - var2: 'weight', 'lbm', 'hrv', 'rhr' - max_lag_days: Maximum lag to test + C1: Tagesbilanz (kcal − TDEE wie ``estimate_tdee_kcal_from_latest_weight``) vs. ΔGewicht [t→t+L], L≥1. + C2: Protein (g) vs. ΔMager [t→t+L] aus ``build_merged_daily_nutrition_body_rows``, L≥1. + C3: Summe ``duration_min`` pro Tag vs. HRV oder Ruhepuls am Tag t+L (L≥0). - Returns: - { - 'best_lag': X, # days - 'correlation': 0.XX, # -1 to 1 - 'direction': 'positive'/'negative'/'none', - 'confidence': 'high'/'medium'/'low', - 'data_points': N - } + Rückgabe enthält u. a. ``best_lag`` / ``best_lag_days``, ``correlation``, ``interpretation``, + optional ``lag_details`` (r, n je Lag), mindestens ``MIN_PAIRS_LAG_CORR`` Paare am besten Lag. """ v1 = (var1 or "").strip().lower() if v1 in ("energy", "energy_balance"): @@ -70,85 +71,349 @@ def _normalize_lag_payload(raw: Optional[Dict]) -> Optional[Dict]: return out +def _iso_date_key(d: Any) -> str: + if d is None: + return "" + if hasattr(d, "isoformat"): + return str(d.isoformat())[:10] + s = str(d) + return s[:10] if len(s) >= 10 else s + + +def _parse_iso_to_date(ds: str) -> Optional[date]: + if not ds or len(ds) < 10: + return None + try: + return date.fromisoformat(ds[:10]) + except ValueError: + return None + + +def _pearson_r(xs: List[float], ys: List[float]) -> Optional[float]: + """Pearson-Korrelation; mindestens ``MIN_PAIRS_LAG_CORR`` Paare.""" + n = len(xs) + if n < MIN_PAIRS_LAG_CORR or n != len(ys): + return None + mx = sum(xs) / n + my = sum(ys) / n + num = sum((xs[i] - mx) * (ys[i] - my) for i in range(n)) + dx = sum((xs[i] - mx) ** 2 for i in range(n)) + dy = sum((ys[i] - my) ** 2 for i in range(n)) + if dx <= 1e-12 or dy <= 1e-12: + return None + r = num / ((dx**0.5) * (dy**0.5)) + return float(max(-1.0, min(1.0, r))) + + +def _direction_from_r(r: float) -> str: + if r > 0.05: + return "positive" + if r < -0.05: + return "negative" + return "none" + + +def 
_lag_confidence(n_pairs: int, r: float) -> str: + return calculate_correlation_confidence(n_pairs, abs(r)) + + def _correlate_energy_weight(profile_id: str, max_lag: int) -> Optional[Dict]: """ - Correlate energy balance with weight change - Test lags: 0, 3, 7, 10, 14 days + Pearson: Tagesbilanz (kcal − TDEE wie nutrition_metrics) vs. Gewichtsdifferenz + vom Tag t zu Tag t+L (L = 0 … max_lag). Bestes Lag nach maximalem |r|. """ + tdee = estimate_tdee_kcal_from_latest_weight(profile_id) + if tdee is None or float(tdee) <= 0: + return { + "best_lag": None, + "correlation": None, + "direction": "none", + "confidence": "insufficient", + "data_points": 0, + "interpretation": "Keine TDEE-Schätzung möglich (Gewicht/Demografie).", + "reason": "no_tdee", + } + + tdee_f = float(tdee) + cutoff = (datetime.now() - timedelta(days=LAG_CORR_LOOKBACK_DAYS)).strftime("%Y-%m-%d") + with get_db() as conn: cur = get_cursor(conn) + cur.execute( + """ + SELECT date::date AS d, SUM(kcal)::float AS kcal + FROM nutrition_log + WHERE profile_id = %s AND date >= %s::date AND kcal IS NOT NULL + GROUP BY date + ORDER BY date + """, + (profile_id, cutoff), + ) + kcal_rows = cur.fetchall() + cur.execute( + """ + SELECT date::date AS d, weight::float AS weight + FROM weight_log + WHERE profile_id = %s AND date >= %s::date AND weight IS NOT NULL + ORDER BY date + """, + (profile_id, cutoff), + ) + w_rows = cur.fetchall() - # Get energy balance data (daily calories - estimated TDEE) - cur.execute(""" - SELECT n.date, n.kcal, w.weight - FROM nutrition_log n - LEFT JOIN weight_log w ON w.profile_id = n.profile_id - AND w.date = n.date - WHERE n.profile_id = %s - AND n.date >= CURRENT_DATE - INTERVAL '90 days' - ORDER BY n.date - """, (profile_id,)) + kcal_by: Dict[str, float] = {} + for r in kcal_rows: + kcal_by[_iso_date_key(r["d"])] = float(r["kcal"] or 0) + weight_by: Dict[str, float] = {} + for r in w_rows: + weight_by[_iso_date_key(r["d"])] = float(r["weight"]) - data = cur.fetchall() + 
balance_by = {d: kcal_by[d] - tdee_f for d in kcal_by} - if len(data) < 30: - return { - 'best_lag': None, - 'correlation': None, - 'direction': 'none', - 'confidence': 'low', - 'data_points': len(data), - 'reason': 'Insufficient data (<30 days)' - } + best: Optional[Tuple[int, float, int]] = None + lag_details: List[Dict[str, Any]] = [] - # Calculate 7d rolling energy balance - # (Simplified - actual implementation would need TDEE estimation) + max_l = max(0, min(int(max_lag), 28)) + # Lag 0: ΔGewicht am selben Tag ist immer 0 → sinnvoll erst ab Tag 1 + for lag in range(1, max_l + 1): + xs: List[float] = [] + ys: List[float] = [] + for ds in sorted(balance_by.keys()): + d0 = _parse_iso_to_date(ds) + if d0 is None: + continue + d1 = d0 + timedelta(days=lag) + ds1 = d1.isoformat() + w0 = weight_by.get(ds) + w1 = weight_by.get(ds1) + if w0 is None or w1 is None: + continue + xs.append(balance_by[ds]) + ys.append(w1 - w0) + r = _pearson_r(xs, ys) + n_p = len(xs) + lag_details.append({"lag": lag, "n_pairs": n_p, "r": None if r is None else round(r, 4)}) + if r is None: + continue + if best is None or abs(r) > abs(best[1]): + best = (lag, r, n_p) + + if best is None: + return { + "best_lag": None, + "correlation": None, + "direction": "none", + "confidence": "insufficient", + "data_points": 0, + "interpretation": "Zu wenige gepaarte Tage mit Ernährung, Gewicht und gewähltem Lag.", + "reason": "insufficient_pairs", + "lag_details": lag_details, + "tdee_kcal_used": round(tdee_f, 0), + } + + lag_b, r_b, n_b = best + direction = _direction_from_r(r_b) + conf = _lag_confidence(n_b, r_b) + interp = ( + f"Tagesbilanz (kcal − TDEE ~{tdee_f:.0f}) vs. Gewichtsänderung nach {lag_b} Tagen: " + f"r ≈ {r_b:.2f} ({direction}). " + f"Basierend auf {n_b} Kalendertagen mit vollständigen Paaren." 
+ ) - # For now, return placeholder return { - 'best_lag': 7, - 'correlation': -0.45, # Placeholder - 'direction': 'negative', # Higher deficit = lower weight (expected) - 'confidence': 'medium', - 'data_points': len(data) + "best_lag": lag_b, + "correlation": round(r_b, 4), + "direction": direction, + "confidence": conf, + "data_points": n_b, + "interpretation": interp, + "lag_details": lag_details, + "tdee_kcal_used": round(tdee_f, 0), } def _correlate_protein_lbm(profile_id: str, max_lag: int) -> Optional[Dict]: - """Correlate protein intake with LBM trend""" - # TODO: Implement full correlation calculation + """ + Pearson: Protein (g/Tag) vs. Magermasse-Differenz (kg) vom Tag t zu t+L. + Datenbasis: nutrition_body_merge (Caliper-LBM forward-filled wie Ernährungs-Verlauf). + """ + merged = build_merged_daily_nutrition_body_rows(profile_id) + if not merged: + return { + "best_lag": None, + "correlation": None, + "direction": "none", + "confidence": "insufficient", + "data_points": 0, + "interpretation": "Keine zusammengeführten Ernährungs-/Körperdaten.", + "reason": "no_merged_rows", + } + + protein_by: Dict[str, float] = {} + lbm_by: Dict[str, float] = {} + for row in merged: + ds = _iso_date_key(row.get("date")) + if not ds: + continue + pg = row.get("protein_g") + lm = row.get("lean_mass") + if pg is not None: + protein_by[ds] = float(pg) + if lm is not None: + lbm_by[ds] = float(lm) + + best: Optional[Tuple[int, float, int]] = None + lag_details: List[Dict[str, Any]] = [] + max_l = max(0, min(int(max_lag), 28)) + + for lag in range(1, max_l + 1): + xs: List[float] = [] + ys: List[float] = [] + for ds in sorted(protein_by.keys()): + if ds not in lbm_by: + continue + d0 = _parse_iso_to_date(ds) + if d0 is None: + continue + d1 = d0 + timedelta(days=lag) + ds1 = d1.isoformat() + if ds1 not in lbm_by: + continue + xs.append(protein_by[ds]) + ys.append(lbm_by[ds1] - lbm_by[ds]) + r = _pearson_r(xs, ys) + n_p = len(xs) + lag_details.append({"lag": lag, "n_pairs": 
n_p, "r": None if r is None else round(r, 4)}) + if r is None: + continue + if best is None or abs(r) > abs(best[1]): + best = (lag, r, n_p) + + if best is None: + return { + "best_lag": None, + "correlation": None, + "direction": "none", + "confidence": "insufficient", + "data_points": 0, + "interpretation": "Zu wenige Tage mit Protein und Magermasse (Caliper) für die gewählten Lags.", + "reason": "insufficient_pairs", + "lag_details": lag_details, + } + + lag_b, r_b, n_b = best + direction = _direction_from_r(r_b) + conf = _lag_confidence(n_b, r_b) + interp = ( + f"Protein (g/Tag) vs. Magermasse-Änderung nach {lag_b} Tagen: r ≈ {r_b:.2f} ({direction}). " + f"{n_b} gepaarte Tage." + ) + return { - 'best_lag': 0, - 'correlation': 0.32, # Placeholder - 'direction': 'positive', - 'confidence': 'medium', - 'data_points': 28 + "best_lag": lag_b, + "correlation": round(r_b, 4), + "direction": direction, + "confidence": conf, + "data_points": n_b, + "interpretation": interp, + "lag_details": lag_details, } def _correlate_load_vitals(profile_id: str, vital: str, max_lag: int) -> Optional[Dict]: """ - Correlate training load with HRV or RHR - Test lags: 1, 2, 3 days + Pearson: Tages-Trainingslast (Summe duration_min) vs. Vitals (HRV ms oder Ruhepuls) + am Kalendertag t+Lag (typisch: Belastung am Vortag, Vitalwert am Folgetag bei Lag ≥ 1). 
""" - # TODO: Implement full correlation calculation - if vital == 'hrv': + col = "hrv" if vital == "hrv" else "resting_hr" + cutoff = (datetime.now() - timedelta(days=LAG_CORR_LOOKBACK_DAYS)).strftime("%Y-%m-%d") + + with get_db() as conn: + cur = get_cursor(conn) + cur.execute( + """ + SELECT date::text AS d, COALESCE(SUM(duration_min), 0)::float AS minutes + FROM activity_log + WHERE profile_id = %s AND date >= %s::date + AND duration_min IS NOT NULL AND duration_min > 0 + GROUP BY date + ORDER BY date + """, + (profile_id, cutoff), + ) + load_rows = cur.fetchall() + cur.execute( + f""" + SELECT date::text AS d, {col}::float AS v + FROM vitals_baseline + WHERE profile_id = %s AND date >= %s::date AND {col} IS NOT NULL + ORDER BY date + """, + (profile_id, cutoff), + ) + vit_rows = cur.fetchall() + + load_by = {str(r["d"])[:10]: float(r["minutes"] or 0) for r in load_rows} + vital_by = {str(r["d"])[:10]: float(r["v"]) for r in vit_rows} + + best: Optional[Tuple[int, float, int]] = None + lag_details: List[Dict[str, Any]] = [] + max_l = max(0, min(int(max_lag), 28)) + vlabel = "HRV (ms)" if vital == "hrv" else "Ruhepuls (bpm)" + + for lag in range(0, max_l + 1): + xs: List[float] = [] + ys: List[float] = [] + for ds in sorted(load_by.keys()): + d0 = _parse_iso_to_date(ds) + if d0 is None: + continue + d1 = d0 + timedelta(days=lag) + ds1 = d1.isoformat() + if ds1 not in vital_by: + continue + xs.append(load_by[ds]) + ys.append(vital_by[ds1]) + r = _pearson_r(xs, ys) + n_p = len(xs) + lag_details.append({"lag": lag, "n_pairs": n_p, "r": None if r is None else round(r, 4)}) + if r is None: + continue + if best is None or abs(r) > abs(best[1]): + best = (lag, r, n_p) + + if best is None: return { - 'best_lag': 1, - 'correlation': -0.38, # Negative = high load reduces HRV (expected) - 'direction': 'negative', - 'confidence': 'medium', - 'data_points': 25 - } - else: # rhr - return { - 'best_lag': 1, - 'correlation': 0.42, # Positive = high load increases RHR (expected) 
- 'direction': 'positive', - 'confidence': 'medium', - 'data_points': 25 + "best_lag": None, + "correlation": None, + "direction": "none", + "confidence": "insufficient", + "data_points": 0, + "interpretation": f"Zu wenige gepaarte Tage mit Training und {vlabel}.", + "reason": "insufficient_pairs", + "lag_details": lag_details, + "vital": vital, } + lag_b, r_b, n_b = best + direction = _direction_from_r(r_b) + conf = _lag_confidence(n_b, r_b) + interp = ( + f"Trainingsminuten/Tag vs. {vlabel} nach {lag_b} Tagen Lag: r ≈ {r_b:.2f} ({direction}). " + f"{n_b} Paare." + ) + + return { + "best_lag": lag_b, + "correlation": round(r_b, 4), + "direction": direction, + "confidence": conf, + "data_points": n_b, + "interpretation": interp, + "lag_details": lag_details, + "vital": vital, + } + # ============================================================================ # C4: Sleep vs. Recovery Correlation diff --git a/backend/routers/charts.py b/backend/routers/charts.py index 8578beb..220f8c0 100644 --- a/backend/routers/charts.py +++ b/backend/routers/charts.py @@ -1115,6 +1115,9 @@ def get_weight_energy_correlation_chart( corr_data = calculate_lag_correlation(profile_id, "energy_balance", "weight", max_lag) if not corr_data or corr_data.get('correlation') is None: + msg = "Nicht genug Daten für Korrelationsanalyse" + if isinstance(corr_data, dict): + msg = str(corr_data.get("interpretation") or corr_data.get("reason") or msg) return { "chart_type": "scatter", "data": { @@ -1123,14 +1126,15 @@ def get_weight_energy_correlation_chart( }, "metadata": { "confidence": "insufficient", - "data_points": 0, - "message": "Nicht genug Daten für Korrelationsanalyse" + "data_points": corr_data.get("data_points", 0) if isinstance(corr_data, dict) else 0, + "message": msg, + "lag_details": corr_data.get("lag_details") if isinstance(corr_data, dict) else None, + "tdee_kcal_used": corr_data.get("tdee_kcal_used") if isinstance(corr_data, dict) else None, } } - # Create lag vs correlation 
data for chart - # For simplicity, show best lag point as single data point - best_lag = corr_data.get('best_lag_days', 0) + # Ein Punkt: bestes Lag (max. |r|) — Berechnung in data_layer.correlations (Issue 53) + best_lag = corr_data.get('best_lag_days', corr_data.get('best_lag', 0)) correlation = corr_data.get('correlation', 0) return { @@ -1150,10 +1154,13 @@ def get_weight_energy_correlation_chart( }, "metadata": { "confidence": corr_data.get('confidence', 'low'), - "correlation": round(correlation, 3), + "correlation": round(float(correlation), 3), "best_lag_days": best_lag, "interpretation": corr_data.get('interpretation', ''), - "data_points": corr_data.get('data_points', 0) + "data_points": corr_data.get('data_points', 0), + "lag_details": corr_data.get("lag_details"), + "tdee_kcal_used": corr_data.get("tdee_kcal_used"), + "layer_1": "correlations._correlate_energy_weight", } } @@ -1180,6 +1187,9 @@ def get_lbm_protein_correlation_chart( corr_data = calculate_lag_correlation(profile_id, "protein", "lbm", max_lag) if not corr_data or corr_data.get('correlation') is None: + msg = "Nicht genug Daten für LBM-Protein Korrelation" + if isinstance(corr_data, dict): + msg = str(corr_data.get("interpretation") or corr_data.get("reason") or msg) return { "chart_type": "scatter", "data": { @@ -1188,12 +1198,13 @@ def get_lbm_protein_correlation_chart( }, "metadata": { "confidence": "insufficient", - "data_points": 0, - "message": "Nicht genug Daten für LBM-Protein Korrelation" + "data_points": corr_data.get("data_points", 0) if isinstance(corr_data, dict) else 0, + "message": msg, + "lag_details": corr_data.get("lag_details") if isinstance(corr_data, dict) else None, } } - best_lag = corr_data.get('best_lag_days', 0) + best_lag = corr_data.get('best_lag_days', corr_data.get('best_lag', 0)) correlation = corr_data.get('correlation', 0) return { @@ -1213,10 +1224,12 @@ def get_lbm_protein_correlation_chart( }, "metadata": { "confidence": corr_data.get('confidence', 
'low'), - "correlation": round(correlation, 3), + "correlation": round(float(correlation), 3), "best_lag_days": best_lag, "interpretation": corr_data.get('interpretation', ''), - "data_points": corr_data.get('data_points', 0) + "data_points": corr_data.get('data_points', 0), + "lag_details": corr_data.get("lag_details"), + "layer_1": "correlations._correlate_protein_lbm", } } @@ -1240,35 +1253,54 @@ def get_load_vitals_correlation_chart( """ profile_id = session['profile_id'] - # Try HRV first corr_hrv = calculate_lag_correlation(profile_id, "load", "hrv", max_lag) corr_rhr = calculate_lag_correlation(profile_id, "load", "rhr", max_lag) - # Use whichever has stronger correlation - if corr_hrv and corr_rhr: - corr_data = corr_hrv if abs(corr_hrv.get('correlation', 0)) > abs(corr_rhr.get('correlation', 0)) else corr_rhr - metric_name = "HRV" if corr_data == corr_hrv else "RHR" - elif corr_hrv: - corr_data = corr_hrv - metric_name = "HRV" - elif corr_rhr: - corr_data = corr_rhr - metric_name = "RHR" - else: + def _abs_corr(c): + if not c or c.get("correlation") is None: + return -1.0 + try: + return abs(float(c["correlation"])) + except (TypeError, ValueError): + return -1.0 + + if _abs_corr(corr_hrv) < 0 and _abs_corr(corr_rhr) < 0: + msg = "Nicht genug Daten für Load-Vitals Korrelation" + h_msg = corr_hrv.get("interpretation") if isinstance(corr_hrv, dict) else None + r_msg = corr_rhr.get("interpretation") if isinstance(corr_rhr, dict) else None + if h_msg or r_msg: + msg = f"HRV: {h_msg or '—'} · RHR: {r_msg or '—'}" return { "chart_type": "scatter", - "data": { - "labels": [], - "datasets": [] - }, + "data": {"labels": [], "datasets": []}, "metadata": { "confidence": "insufficient", "data_points": 0, - "message": "Nicht genug Daten für Load-Vitals Korrelation" - } + "message": msg, + "lag_details_hrv": corr_hrv.get("lag_details") if isinstance(corr_hrv, dict) else None, + "lag_details_rhr": corr_rhr.get("lag_details") if isinstance(corr_rhr, dict) else None, + }, 
} - best_lag = corr_data.get('best_lag_days', 0) + if _abs_corr(corr_hrv) >= _abs_corr(corr_rhr): + corr_data = corr_hrv + metric_name = "HRV" + else: + corr_data = corr_rhr + metric_name = "RHR" + + if not corr_data or corr_data.get("correlation") is None: + return { + "chart_type": "scatter", + "data": {"labels": [], "datasets": []}, + "metadata": { + "confidence": "insufficient", + "data_points": 0, + "message": str(corr_data.get("interpretation") or "Nicht genug Daten für Load-Vitals Korrelation"), + }, + } + + best_lag = corr_data.get('best_lag_days', corr_data.get('best_lag', 0)) correlation = corr_data.get('correlation', 0) return { @@ -1288,11 +1320,13 @@ def get_load_vitals_correlation_chart( }, "metadata": { "confidence": corr_data.get('confidence', 'low'), - "correlation": round(correlation, 3), + "correlation": round(float(correlation), 3), "best_lag_days": best_lag, "metric": metric_name, "interpretation": corr_data.get('interpretation', ''), - "data_points": corr_data.get('data_points', 0) + "data_points": corr_data.get('data_points', 0), + "lag_details": corr_data.get("lag_details"), + "layer_1": "correlations._correlate_load_vitals", } }