"""
Enhanced Placeholder Metadata Extraction

Improved extraction logic that addresses quality issues:
1. Correct value_raw extraction
2. Accurate unit inference
3. Precise time_window detection
4. Real source provenance
5. Quality filter policies for activity placeholders
"""
import re
import json
from typing import Any, Optional, Tuple, Dict

from placeholder_metadata import (
    PlaceholderType,
    TimeWindow,
    OutputType,
    QualityFilterPolicy,
    ConfidenceLogic,
    ConfidenceLevel
)


# ── Enhanced Value Raw Extraction ─────────────────────────────────────────────

def extract_value_raw(value_display: str, output_type: OutputType, placeholder_type: PlaceholderType) -> Tuple[Any, bool]:
    """
    Extract the raw value from a display string.

    Args:
        value_display: Rendered placeholder value.
        output_type: Declared output type; selects the parsing strategy.
        placeholder_type: Accepted for interface compatibility; not used here.

    Returns:
        (raw_value, success). Empty input and the "not available" sentinels
        yield (None, True); a value that cannot be parsed for its declared
        type yields success=False.
    """
    if not value_display or value_display in ['nicht verfügbar', 'nicht genug Daten']:
        return None, True

    # JSON output type
    if output_type == OutputType.JSON:
        try:
            return json.loads(value_display), True
        except (json.JSONDecodeError, TypeError):
            # A non-string value can be neither parsed nor searched.
            if not isinstance(value_display, str):
                return None, False
        # Fall back to the first {...} or [...] span embedded in the text.
        json_match = re.search(r'(\{.*\}|\[.*\])', value_display, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(1)), True
            except json.JSONDecodeError:
                pass
        return None, False

    # Markdown output type
    if output_type == OutputType.MARKDOWN:
        return value_display, True

    # Number types: take the first number found in the string
    if output_type in [OutputType.NUMBER, OutputType.INTEGER]:
        match = re.search(r'([-+]?\d+\.?\d*)', value_display)
        if match is None:
            return None, False
        val = float(match.group(1))
        # INTEGER truncates towards zero (int() semantics).
        return int(val) if output_type == OutputType.INTEGER else val, True

    # Date: only a leading ISO date (YYYY-MM-DD) counts as a successful parse
    if output_type == OutputType.DATE:
        is_iso = re.match(r'\d{4}-\d{2}-\d{2}', value_display) is not None
        return value_display, is_iso

    # String/Enum - return as-is
    return value_display, True


# ── Enhanced Unit Inference ───────────────────────────────────────────────────

# 'hr' / 'age' must be a token of their own in the key; a plain substring test
# would also hit 'threshold', 'average', 'percentage', ...
_HR_KEY_RE = re.compile(r'(?<![a-z])hr(?![a-z])')
_AGE_KEY_RE = re.compile(r'(?<![a-z])age(?![a-z])')
# Standalone gram unit in a (lower-cased) description: "g", "gram", "gramm",
# "120g", "(g)". [^\W\d_] is "any letter", so "kg" or "tage" do not match.
_GRAM_DESC_RE = re.compile(r'(?<![^\W\d_])g(?:ramm?s?)?(?![^\W\d_])')


def infer_unit_strict(key: str, description: str, output_type: OutputType, placeholder_type: PlaceholderType) -> Optional[str]:
    """
    Strict unit inference - only return a unit if certain.

    NO units for:
    - Scores (dimensionless)
    - Correlations (dimensionless)
    - Percentages expressed as 0-100 scale
    - Classifications/enums
    - JSON/Markdown outputs

    Args:
        key: Placeholder key (matched case-insensitively).
        description: Placeholder description (matched case-insensitively).
        output_type: Declared output type.
        placeholder_type: Accepted for interface compatibility; not used here.

    Returns:
        The unit string, or None when no unit applies or it is unclear.
    """
    key_lower = key.lower()
    desc_lower = description.lower()

    # JSON/Markdown/Enum never have units
    if output_type in [OutputType.JSON, OutputType.MARKDOWN, OutputType.ENUM]:
        return None

    # Scores are dimensionless (0-100 scale)
    if 'score' in key_lower or 'adequacy' in key_lower:
        return None

    # Correlations are dimensionless
    if 'correlation' in key_lower:
        return None

    # Ratios/percentages on 0-100 scale
    if any(x in key_lower for x in ['pct', 'ratio', 'balance', 'compliance', 'consistency']):
        return None

    # Classifications/quadrants
    if 'quadrant' in key_lower or 'classification' in key_lower:
        return None

    # Weight/mass
    if any(x in key_lower for x in ['weight', 'gewicht', 'fm_', 'lbm_', 'masse']):
        return 'kg'

    # Circumferences/lengths (only when the description confirms it)
    if any(x in key_lower for x in ['umfang', 'waist', 'hip', 'chest', 'arm', 'leg', 'delta']) and 'circumference' in desc_lower:
        return 'cm'

    # Time durations
    if any(x in key_lower for x in ['duration', 'dauer', 'debt']):
        if 'hours' in desc_lower or 'stunden' in desc_lower:
            return 'Stunden'
        if 'minutes' in desc_lower or 'minuten' in desc_lower:
            return 'Minuten'
        return None  # Unclear

    # Heart rate (HRV keys are handled by the next rule)
    is_hr_key = _HR_KEY_RE.search(key_lower) is not None and 'hrv' not in key_lower
    if 'rhr' in key_lower or is_hr_key or 'puls' in key_lower:
        return 'bpm'

    # HRV
    if 'hrv' in key_lower:
        return 'ms'

    # VO2 Max
    if 'vo2' in key_lower:
        return 'ml/kg/min'

    # Calories/energy
    if 'kcal' in key_lower or 'energy' in key_lower or 'energie' in key_lower:
        return 'kcal'

    # Macros (protein, carbs, fat): require an explicit gram unit in the description
    if any(x in key_lower for x in ['protein', 'carb', 'fat', 'kohlenhydrat', 'fett']) and _GRAM_DESC_RE.search(desc_lower):
        return 'g'

    # Height
    if 'height' in key_lower or 'größe' in key_lower:
        return 'cm'

    # Age
    if _AGE_KEY_RE.search(key_lower) or 'alter' in key_lower:
        return 'Jahre'

    # BMI is dimensionless
    if 'bmi' in key_lower:
        return None

    # Default: No unit (conservative)
    return None


# ── Enhanced Time Window Detection ────────────────────────────────────────────

def _mentions_number(text: str, number: int) -> bool:
    """Return True if *text* contains *number* not embedded in a longer digit run."""
    return re.search(rf'(?<!\d){number}(?!\d)', text) is not None


def _mentions_day_window(text: str, days: int) -> bool:
    """Return True if *text* contains '<days> tag...' or '<days>d' as its own number."""
    return re.search(rf'(?<!\d){days}(?: tag|d)', text) is not None


def _describes_last_days(text: str, days: int) -> bool:
    """Return True if *text* contains 'letzte <days>' or '<days> tag...' as its own number."""
    return re.search(rf'letzte {days}(?!\d)|(?<!\d){days} tag', text) is not None


def detect_time_window_precise(
    key: str,
    description: str,
    resolver_name: str,
    semantic_contract: str
) -> Tuple[TimeWindow, bool, Optional[str]]:
    """
    Detect the time window with precision.

    Evidence is checked in decreasing order of confidence: explicit key
    suffix, "latest" keywords, semantic contract, description wording, then
    heuristics for averages, trends and week-based keys.

    Args:
        key: Placeholder key.
        description: Human-readable description.
        resolver_name: Accepted for interface compatibility; not used here.
        semantic_contract: Contract text describing the implementation.

    Returns:
        (time_window, is_certain, mismatch_note)
    """
    key_lower = key.lower()
    desc_lower = description.lower()
    contract_lower = semantic_contract.lower()

    # Explicit suffixes (highest confidence)
    suffix_windows = (
        ('_7d', TimeWindow.DAYS_7),
        ('_14d', TimeWindow.DAYS_14),
        ('_28d', TimeWindow.DAYS_28),
        ('_30d', TimeWindow.DAYS_30),
        ('_90d', TimeWindow.DAYS_90),
        ('_3d', TimeWindow.DAYS_7),  # Map 3d to closest standard
    )
    for suffix, window in suffix_windows:
        if suffix in key_lower:
            return window, True, None

    # Latest/current
    if any(x in key_lower for x in ['aktuell', 'latest', 'current', 'letzter']):
        return TimeWindow.LATEST, True, None

    # Semantic contract states the window; cross-check against the description
    if _mentions_day_window(contract_lower, 7):
        mismatch = None
        if _mentions_number(desc_lower, 30) or _mentions_number(desc_lower, 28):
            mismatch = "Description says 30d/28d but implementation is 7d"
        return TimeWindow.DAYS_7, True, mismatch

    if _mentions_day_window(contract_lower, 28):
        mismatch = None
        if _mentions_number(desc_lower, 7) and not _mentions_number(desc_lower, 28):
            mismatch = "Description says 7d but implementation is 28d"
        return TimeWindow.DAYS_28, True, mismatch

    if _mentions_day_window(contract_lower, 30):
        return TimeWindow.DAYS_30, True, None

    if _mentions_day_window(contract_lower, 90):
        return TimeWindow.DAYS_90, True, None

    # Description patterns (documentation only, hence not certain)
    if _describes_last_days(desc_lower, 7):
        return TimeWindow.DAYS_7, False, None

    if _describes_last_days(desc_lower, 30):
        return TimeWindow.DAYS_30, False, None

    # Averages typically 30d unless specified
    if 'avg' in key_lower or 'durchschn' in key_lower:
        if _mentions_number(desc_lower, 7):
            return TimeWindow.DAYS_7, False, None
        return TimeWindow.DAYS_30, False, "Assumed 30d for average (not explicit)"

    # Trends typically 28d
    if 'trend' in key_lower:
        return TimeWindow.DAYS_28, False, "Assumed 28d for trend"

    # Week-based
    if 'week' in key_lower or 'woche' in key_lower:
        return TimeWindow.DAYS_7, False, None

    # Profile data is latest
    if key_lower in ['name', 'age', 'height', 'geschlecht']:
        return TimeWindow.LATEST, True, None

    # Unknown
    return TimeWindow.UNKNOWN, False, "Could not determine time window from code or documentation"


# ── Enhanced Source Provenance ────────────────────────────────────────────────

# Safe wrappers are not real sources.
_WRAPPER_RESOLVERS = frozenset(['_safe_int', '_safe_float', '_safe_json', '_safe_str'])

# resolver name -> (function, data_layer_module, source_tables, source_kind)
_SOURCE_MAP = {
    # Body metrics
    'get_latest_weight': ('get_latest_weight_data', 'body_metrics', ('weight_log',), 'direct'),
    'get_weight_trend': ('get_weight_trend_data', 'body_metrics', ('weight_log',), 'computed'),
    'get_latest_bf': ('get_body_composition_data', 'body_metrics', ('caliper_log',), 'direct'),
    'get_circ_summary': ('get_circumference_summary_data', 'body_metrics', ('circumference_log',), 'aggregated'),
    'get_caliper_summary': ('get_body_composition_data', 'body_metrics', ('caliper_log',), 'aggregated'),
    'calculate_bmi': (None, None, ('weight_log', 'profiles'), 'computed'),

    # Nutrition
    'get_nutrition_avg': ('get_nutrition_average_data', 'nutrition_metrics', ('nutrition_log',), 'aggregated'),
    'get_protein_per_kg': ('get_protein_targets_data', 'nutrition_metrics', ('nutrition_log', 'weight_log'), 'computed'),
    'get_nutrition_days': ('get_nutrition_days_data', 'nutrition_metrics', ('nutrition_log',), 'computed'),

    # Activity
    'get_activity_summary': ('get_activity_summary_data', 'activity_metrics', ('activity_log', 'training_types'), 'aggregated'),
    'get_activity_detail': ('get_activity_detail_data', 'activity_metrics', ('activity_log', 'training_types'), 'aggregated'),
    'get_training_type_dist': ('get_training_type_distribution_data', 'activity_metrics', ('activity_log', 'training_types'), 'aggregated'),

    # Sleep
    'get_sleep_duration': ('get_sleep_duration_data', 'recovery_metrics', ('sleep_log',), 'aggregated'),
    'get_sleep_quality': ('get_sleep_quality_data', 'recovery_metrics', ('sleep_log',), 'computed'),

    # Vitals
    'get_resting_hr': ('get_resting_heart_rate_data', 'health_metrics', ('vitals_baseline',), 'direct'),
    'get_hrv': ('get_heart_rate_variability_data', 'health_metrics', ('vitals_baseline',), 'direct'),
    'get_vo2_max': ('get_vo2_max_data', 'health_metrics', ('vitals_baseline',), 'direct'),

    # Profile
    'get_profile_data': (None, None, ('profiles',), 'direct'),
    'calculate_age': (None, None, ('profiles',), 'computed'),

    # Goals
    'get_goal_weight': (None, None, ('goals',), 'direct'),
    'get_goal_bf_pct': (None, None, ('goals',), 'direct'),
}


def resolve_real_source(resolver_name: str) -> Tuple[Optional[str], Optional[str], list, str]:
    """
    Resolve the real source function (not safe wrappers).

    Args:
        resolver_name: Name of the resolver function.

    Returns:
        (function, data_layer_module, source_tables, source_kind).
        source_kind is "wrapper" for safe wrappers and "unknown" for
        unmapped (or non-string) names. source_tables is a fresh list on
        every call, so callers may modify it freely.
    """
    if not isinstance(resolver_name, str):
        return None, None, [], "unknown"

    # Skip safe wrappers - they're not real sources
    if resolver_name in _WRAPPER_RESOLVERS:
        return None, None, [], "wrapper"

    # Direct mappings to data layer
    entry = _SOURCE_MAP.get(resolver_name)
    if entry is not None:
        function, module, tables, kind = entry
        return function, module, list(tables), kind

    # Goals formatting functions
    if resolver_name.startswith('_format_goals'):
        return (None, None, ['goals', 'goal_focus_contributions'], 'interpreted')

    # Unknown
    return None, None, [], "unknown"


# ── Quality Filter Policy for Activity Placeholders ───────────────────────────

# 'ability' must start a token, otherwise 'variability' / 'availability' keys
# would be tagged as activity placeholders.
_ABILITY_KEY_RE = re.compile(r'(?<![a-z])ability')


def create_activity_quality_policy(key: str) -> Optional[QualityFilterPolicy]:
    """
    Create the quality filter policy for activity-related placeholders.

    Args:
        key: Placeholder key (matched case-insensitively).

    Returns:
        A QualityFilterPolicy when the key looks activity-related,
        otherwise None.
    """
    key_lower = key.lower()

    # Activity-related placeholders need quality policies
    activity_markers = ['activity', 'training', 'load', 'volume', 'quality_session']
    is_activity = (
        any(x in key_lower for x in activity_markers)
        or _ABILITY_KEY_RE.search(key_lower) is not None
    )
    if not is_activity:
        return None

    return QualityFilterPolicy(
        enabled=True,
        default_filter_level="quality",
        null_quality_handling="exclude",
        includes_poor=False,
        includes_excluded=False,
        notes="Activity metrics filter for quality='quality' by default. NULL quality_label excluded."
    )


# ── Confidence Logic Creation ─────────────────────────────────────────────────

def create_confidence_logic(key: str, data_layer_module: Optional[str]) -> Optional[ConfidenceLogic]:
    """
    Create confidence logic if applicable.

    A data layer module takes precedence over key-based rules; score and
    correlation keys are only considered when no module is given.

    Args:
        key: Placeholder key (matched case-insensitively).
        data_layer_module: Name of the backing data layer module, if any.

    Returns:
        A ConfidenceLogic description, or None when none applies.
    """
    key_lower = key.lower()

    # Data layer functions typically have confidence
    if data_layer_module:
        return ConfidenceLogic(
            supported=True,
            calculation="Based on data availability and quality thresholds",
            thresholds={"min_data_points": 1},
            notes=f"Confidence determined by {data_layer_module}"
        )

    # Scores have implicit confidence
    if 'score' in key_lower:
        return ConfidenceLogic(
            supported=True,
            calculation="Based on data completeness for score components",
            notes="Score confidence correlates with input data availability"
        )

    # Correlations have confidence
    if 'correlation' in key_lower:
        return ConfidenceLogic(
            supported=True,
            calculation="Pearson correlation with significance testing",
            thresholds={"min_data_points": 7},
            notes="Requires minimum 7 data points for meaningful correlation"
        )

    return None


# ── Metadata Completeness Score ───────────────────────────────────────────────

def calculate_completeness_score(metadata_dict: Dict) -> int:
    """
    Calculate the metadata completeness score (0-100).

    Checks:
    - Required fields filled (30 points)
    - Type and time window not unknown (20 points)
    - Output type not unknown, format hint present (20 points)
    - Source provenance complete (20 points)
    - Quality/confidence policies present (10 points)

    Args:
        metadata_dict: Serialized placeholder metadata. A missing or None
            'source' entry is treated as empty.

    Returns:
        Integer score, capped at 100.
    """
    score = 0
    max_score = 100

    # A present-but-None 'source' must not break the nested lookups below.
    source = metadata_dict.get('source') or {}

    # Required fields (30 points)
    if metadata_dict.get('category') and metadata_dict['category'] != 'Unknown':
        score += 5
    if metadata_dict.get('description') and 'No description' not in metadata_dict['description']:
        score += 5
    if metadata_dict.get('semantic_contract'):
        score += 10
    if source.get('resolver') and source['resolver'] != 'unknown':
        score += 10

    # Type specification (20 points)
    if metadata_dict.get('type') and metadata_dict['type'] != 'legacy_unknown':
        score += 10
    if metadata_dict.get('time_window') and metadata_dict['time_window'] != 'unknown':
        score += 10

    # Output specification (20 points)
    if metadata_dict.get('output_type') and metadata_dict['output_type'] != 'unknown':
        score += 10
    if metadata_dict.get('format_hint'):
        score += 10

    # Source provenance (20 points)
    if source.get('data_layer_module'):
        score += 10
    if source.get('source_tables'):
        score += 10

    # Quality policies (10 points)
    if metadata_dict.get('quality_filter_policy'):
        score += 5
    if metadata_dict.get('confidence_logic'):
        score += 5

    return min(score, max_score)