"""
Script to generate complete metadata for all 116 placeholders.

This script combines:
1. Automatic extraction from PLACEHOLDER_MAP
2. Manual curation of known metadata
3. Gap identification for unresolved fields

Output: Complete metadata JSON ready for export
"""

import sys
import json
from datetime import datetime, timezone
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))

from placeholder_metadata import (
    PlaceholderMetadata,
    PlaceholderType,
    TimeWindow,
    OutputType,
    SourceInfo,
    ConfidenceLogic,
    ConfidenceLevel,
    METADATA_REGISTRY
)
from placeholder_metadata_extractor import build_complete_metadata_registry


# ── Manual Metadata Corrections ──────────────────────────────────────────────

def apply_manual_corrections(registry):
    """
    Apply manual corrections to automatically extracted metadata.

    This ensures 100% accuracy for fields that cannot be reliably extracted.

    Args:
        registry: PlaceholderMetadataRegistry whose entries are patched
            in place via setattr.

    Returns:
        The same registry object, with corrections applied.
    """
    corrections = {
        # ── Profil ────────────────────────────────────────────────────────────
        "name": {
            "semantic_contract": "Name des Profils aus der Datenbank, keine Transformation",
        },
        "age": {
            "semantic_contract": "Berechnet aus Geburtsdatum (dob) im Profil via calculate_age()",
            "unit": "Jahre",
        },
        "height": {
            "semantic_contract": "Körpergröße aus Profil in cm, unverändert",
        },
        "geschlecht": {
            "semantic_contract": "Geschlecht aus Profil: m='männlich', w='weiblich'",
            "output_type": OutputType.ENUM,
        },
        # ── Körper ────────────────────────────────────────────────────────────
        "weight_aktuell": {
            "semantic_contract": "Letzter verfügbarer Gewichtseintrag aus weight_log, keine Mittelung oder Glättung",
            "confidence_logic": ConfidenceLogic(
                supported=True,
                calculation="Confidence = 'high' if data exists, else 'insufficient'",
                thresholds={"min_data_points": 1},
            ),
        },
        "weight_trend": {
            "semantic_contract": "Gewichtstrend-Beschreibung über 28 Tage: stabil, steigend (+X kg), sinkend (-X kg)",
            "known_issues": ["time_window_inconsistent: Description says 7d/30d, implementation uses 28d"],
            "notes": ["Consider splitting into weight_trend_7d and weight_trend_28d"],
        },
        "kf_aktuell": {
            "semantic_contract": "Letzter berechneter Körperfettanteil aus caliper_log (JPL-7 oder JPL-3 Formel)",
        },
        "caliper_summary": {
            "semantic_contract": "Strukturierte Zusammenfassung der letzten Caliper-Messungen mit Körperfettanteil und Methode",
            "notes": ["Returns formatted text summary, not JSON"],
        },
        "circ_summary": {
            "semantic_contract": "Best-of-Each Strategie: neueste Messung pro Körperstelle mit Altersangabe in Tagen",
            "time_window": TimeWindow.MIXED,
            "notes": ["Different body parts may have different timestamps"],
        },
        "recomposition_quadrant": {
            "semantic_contract": "Klassifizierung basierend auf FM/LBM Änderungen: Optimal Recomposition (FM↓ LBM↑), Fat Loss (FM↓ LBM→), Muscle Gain (FM→ LBM↑), Weight Gain (FM↑ LBM↑)",
            "type": PlaceholderType.INTERPRETED,
        },
        # ── Ernährung ─────────────────────────────────────────────────────────
        "kcal_avg": {
            "semantic_contract": "Durchschnittliche Kalorienaufnahme über 30 Tage aus nutrition_log",
        },
        "protein_avg": {
            "semantic_contract": "Durchschnittliche Proteinaufnahme in g über 30 Tage aus nutrition_log",
        },
        "carb_avg": {
            "semantic_contract": "Durchschnittliche Kohlenhydrataufnahme in g über 30 Tage aus nutrition_log",
        },
        "fat_avg": {
            "semantic_contract": "Durchschnittliche Fettaufnahme in g über 30 Tage aus nutrition_log",
        },
        "nutrition_days": {
            "semantic_contract": "Anzahl der Tage mit Ernährungsdaten in den letzten 30 Tagen",
            "output_type": OutputType.INTEGER,
        },
        "protein_ziel_low": {
            "semantic_contract": "Untere Grenze der Protein-Zielspanne (1.6 g/kg Körpergewicht)",
        },
        "protein_ziel_high": {
            "semantic_contract": "Obere Grenze der Protein-Zielspanne (2.2 g/kg Körpergewicht)",
        },
        "protein_g_per_kg": {
            "semantic_contract": "Aktuelle Proteinaufnahme normiert auf kg Körpergewicht (protein_avg / weight)",
        },
        # ── Training ──────────────────────────────────────────────────────────
        "activity_summary": {
            "semantic_contract": "Strukturierte Zusammenfassung der Trainingsaktivität der letzten 7 Tage",
            "type": PlaceholderType.RAW_DATA,
            "known_issues": ["time_window_ambiguous: Function name suggests variable window, actual implementation unclear"],
        },
        "activity_detail": {
            "semantic_contract": "Detaillierte Liste aller Trainingseinheiten mit Typ, Dauer, Intensität",
            "type": PlaceholderType.RAW_DATA,
            "known_issues": ["time_window_ambiguous: No clear time window specified"],
        },
        "trainingstyp_verteilung": {
            "semantic_contract": "Verteilung der Trainingstypen über einen Zeitraum (Anzahl Sessions pro Typ)",
            "type": PlaceholderType.RAW_DATA,
        },
        # ── Zeitraum ──────────────────────────────────────────────────────────
        "datum_heute": {
            "semantic_contract": "Aktuelles Datum im Format YYYY-MM-DD",
            "output_type": OutputType.DATE,
            "format_hint": "2026-03-29",
        },
        "zeitraum_7d": {
            "semantic_contract": "Zeitraum der letzten 7 Tage als Text",
            "format_hint": "letzte 7 Tage (2026-03-22 bis 2026-03-29)",
        },
        "zeitraum_30d": {
            "semantic_contract": "Zeitraum der letzten 30 Tage als Text",
            "format_hint": "letzte 30 Tage (2026-02-27 bis 2026-03-29)",
        },
        "zeitraum_90d": {
            "semantic_contract": "Zeitraum der letzten 90 Tage als Text",
            "format_hint": "letzte 90 Tage (2025-12-29 bis 2026-03-29)",
        },
        # ── Goals & Focus ─────────────────────────────────────────────────────
        "active_goals_json": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.JSON,
            "semantic_contract": "JSON-Array aller aktiven Ziele mit vollständigen Details",
        },
        "active_goals_md": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.MARKDOWN,
            "semantic_contract": "Markdown-formatierte Liste aller aktiven Ziele",
        },
        "focus_areas_weighted_json": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.JSON,
            "semantic_contract": "JSON-Array der gewichteten Focus Areas mit Progress",
        },
        "top_3_goals_behind_schedule": {
            "type": PlaceholderType.INTERPRETED,
            "semantic_contract": "Top 3 Ziele mit größter negativer Abweichung vom Zeitplan (Zeit-basiert)",
        },
        "top_3_goals_on_track": {
            "type": PlaceholderType.INTERPRETED,
            "semantic_contract": "Top 3 Ziele mit größter positiver Abweichung vom Zeitplan oder am besten im Plan",
        },
        # ── Scores ────────────────────────────────────────────────────────────
        "goal_progress_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Gewichteter Durchschnitts-Fortschritt aller aktiven Ziele (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "body_progress_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Body Progress Score basierend auf Gewicht/KFA-Ziel-Erreichung (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "nutrition_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Nutrition Score basierend auf Protein Adequacy, Makro-Konsistenz (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "activity_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Activity Score basierend auf Trainingsfrequenz, Qualitätssessions (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "recovery_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Recovery Score basierend auf Schlaf, HRV, Ruhepuls (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        # ── Correlations ──────────────────────────────────────────────────────
        "correlation_energy_weight_lag": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Lag-Korrelation zwischen Energiebilanz und Gewichtsänderung (3d/7d/14d)",
        },
        "correlation_protein_lbm": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Korrelation zwischen Proteinaufnahme und Magermasse-Änderung",
        },
        "plateau_detected": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Plateau-Erkennung: Gewichtsstagnation trotz Kaloriendefizit",
        },
        "top_drivers": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Top Einflussfaktoren auf Ziel-Fortschritt (sortiert nach Impact)",
        },
    }

    # Patch each registered placeholder; keys missing from the registry are
    # skipped silently so the correction table may lead the extractor.
    for key, updates in corrections.items():
        metadata = registry.get(key)
        if metadata:
            for field, value in updates.items():
                setattr(metadata, field, value)

    return registry


def export_complete_metadata(registry, output_path=None):
    """
    Export complete metadata to JSON file.

    Args:
        registry: PlaceholderMetadataRegistry
        output_path: Optional output file path (str or Path). Defaults to
            ../docs/placeholder_metadata_complete.json relative to this file.

    Returns:
        Path of the written JSON file.
    """
    all_metadata = registry.get_all()

    # Convert to dict
    export_data = {
        "schema_version": "1.0.0",
        # Use the real generation time — a hard-coded timestamp goes stale
        # immediately and misrepresents when the export was produced.
        "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "total_placeholders": len(all_metadata),
        "placeholders": {},
    }

    for key, metadata in all_metadata.items():
        export_data["placeholders"][key] = metadata.to_dict()

    # Write to file
    if not output_path:
        output_path = Path(__file__).parent.parent / "docs" / "placeholder_metadata_complete.json"

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)

    print(f"✓ Exported complete metadata to: {output_path}")
    return output_path


def generate_gap_report(registry):
    """
    Generate gap report showing unresolved metadata fields.

    Returns:
        Dict mapping gap category -> list of placeholder keys affected.
    """
    gaps = {
        "unknown_time_window": [],
        "unknown_output_type": [],
        "legacy_unknown_type": [],
        "missing_semantic_contract": [],
        "missing_data_layer_module": [],
        "missing_source_tables": [],
        "validation_issues": [],
    }

    for key, metadata in registry.get_all().items():
        if metadata.time_window == TimeWindow.UNKNOWN:
            gaps["unknown_time_window"].append(key)

        if metadata.output_type == OutputType.UNKNOWN:
            gaps["unknown_output_type"].append(key)

        if metadata.type == PlaceholderType.LEGACY_UNKNOWN:
            gaps["legacy_unknown_type"].append(key)

        # A contract identical to the auto-extracted description means nobody
        # actually curated it — treat it as missing.
        if not metadata.semantic_contract or metadata.semantic_contract == metadata.description:
            gaps["missing_semantic_contract"].append(key)

        if not metadata.source.data_layer_module:
            gaps["missing_data_layer_module"].append(key)

        if not metadata.source.source_tables:
            gaps["missing_source_tables"].append(key)

    # Validation: only severity == "error" counts as a gap; warnings pass.
    violations = registry.validate_all()
    for key, issues in violations.items():
        error_count = len([i for i in issues if i.severity == "error"])
        if error_count > 0:
            gaps["validation_issues"].append(key)

    return gaps


def print_summary(registry, gaps):
    """Print summary statistics (counts by type/category, gaps, coverage)."""
    all_metadata = registry.get_all()
    total = len(all_metadata)

    print("\n" + "="*60)
    print("PLACEHOLDER METADATA EXTRACTION SUMMARY")
    print("="*60)
    print(f"\nTotal Placeholders: {total}")

    # Guard: an empty registry would otherwise divide by zero below.
    if total == 0:
        print("\n(no placeholders registered — nothing to summarize)")
        return

    # Count by type
    by_type = {}
    for metadata in all_metadata.values():
        ptype = metadata.type.value
        by_type[ptype] = by_type.get(ptype, 0) + 1

    # Count by category
    by_category = {}
    for metadata in all_metadata.values():
        cat = metadata.category
        by_category[cat] = by_category.get(cat, 0) + 1

    print("\nBy Type:")
    for ptype, count in sorted(by_type.items()):
        print(f"  {ptype:20} {count:3} ({count/total*100:5.1f}%)")

    print("\nBy Category:")
    for cat, count in sorted(by_category.items()):
        print(f"  {cat:20} {count:3} ({count/total*100:5.1f}%)")

    print("\nGaps & Unresolved Fields:")
    for gap_type, placeholders in gaps.items():
        if placeholders:
            print(f"  {gap_type:30} {len(placeholders):3} placeholders")

    # Coverage score: fraction of (placeholder, gap-category) cells that are
    # clean. Denominator derives from the actual number of gap categories —
    # the previous hard-coded 6 undercounted the 7 categories reported.
    gap_count = sum(len(v) for v in gaps.values())
    coverage = (1 - gap_count / (total * len(gaps))) * 100 if gaps else 100.0
    print(f"\n  Metadata Coverage: {coverage:5.1f}%")


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """Main execution function. Returns process exit code (0 ok, 1 error)."""
    print("Building complete placeholder metadata registry...")
    print("(This requires database access)")

    try:
        # Build registry with automatic extraction
        registry = build_complete_metadata_registry()

        # Apply manual corrections
        print("\nApplying manual corrections...")
        registry = apply_manual_corrections(registry)

        # Generate gap report
        print("\nGenerating gap report...")
        gaps = generate_gap_report(registry)

        # Print summary
        print_summary(registry, gaps)

        # Export to JSON
        print("\nExporting complete metadata...")
        output_path = export_complete_metadata(registry)

        print("\n" + "="*60)
        print("✓ COMPLETE")
        print("="*60)
        print("\nNext steps:")
        print("1. Review gaps in gap report")
        print("2. Manually fill remaining unresolved fields")
        print("3. Run validation: python -m backend.placeholder_metadata_complete")
        print("4. Generate catalog files: python -m backend.generate_placeholder_catalog")

        return 0

    except Exception as e:
        # Top-level boundary: report and convert to a nonzero exit code.
        print(f"\n✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())