mitai-jinkendo/backend/generate_complete_metadata.py
Lars a04e7cc042
All checks were successful
Deploy Development / deploy (push) Successful in 44s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 13s
feat: Complete Placeholder Metadata System (Normative Standard v1.0.0)
Implements comprehensive metadata system for all 116 placeholders according to
PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE standard.

Backend:
- placeholder_metadata.py: Complete schema (PlaceholderMetadata, Registry, Validation)
- placeholder_metadata_extractor.py: Automatic extraction with heuristics
- placeholder_metadata_complete.py: Hand-curated metadata for all 116 placeholders
- generate_complete_metadata.py: Metadata generation with manual corrections
- generate_placeholder_catalog.py: Documentation generator (4 output files)
- routers/prompts.py: New extended export endpoint (non-breaking)
- tests/test_placeholder_metadata.py: Comprehensive test suite

Documentation:
- PLACEHOLDER_GOVERNANCE.md: Mandatory governance guidelines
- PLACEHOLDER_METADATA_IMPLEMENTATION_SUMMARY.md: Complete implementation docs

Features:
- Normative compliant metadata for all 116 placeholders
- Non-breaking extended export API endpoint
- Automatic + manual metadata curation
- Validation framework with error/warning levels
- Gap reporting for unresolved fields
- Catalog generator (JSON, Markdown, Gap Report, Export Spec)
- Test suite (20+ tests)
- Governance rules for future placeholders

API:
- GET /api/prompts/placeholders/export-values-extended (NEW)
- GET /api/prompts/placeholders/export-values (unchanged, backward compatible)

Architecture:
- PlaceholderType enum: atomic, raw_data, interpreted, legacy_unknown
- TimeWindow enum: latest, 7d, 14d, 28d, 30d, 90d, custom, mixed, unknown
- OutputType enum: string, number, integer, boolean, json, markdown, date, enum
- Complete source tracking (resolver, data_layer, tables)
- Runtime value resolution
- Usage tracking (prompts, pipelines, charts)

Statistics:
- 6 new Python modules (~2,500 lines)
- 1 modified module (extended)
- 2 new documentation files
- 4 generated documentation files (to be created in Docker)
- 20+ test cases
- 116 placeholders inventoried

Next Steps:
1. Run in Docker: python /app/generate_placeholder_catalog.py
2. Test extended export endpoint
3. Verify all 116 placeholders have complete metadata

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 20:32:37 +02:00

397 lines
16 KiB
Python

"""
Script to generate complete metadata for all 116 placeholders.
This script combines:
1. Automatic extraction from PLACEHOLDER_MAP
2. Manual curation of known metadata
3. Gap identification for unresolved fields
Output: Complete metadata JSON ready for export
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from placeholder_metadata import (
PlaceholderMetadata,
PlaceholderType,
TimeWindow,
OutputType,
SourceInfo,
ConfidenceLogic,
ConfidenceLevel,
METADATA_REGISTRY
)
from placeholder_metadata_extractor import build_complete_metadata_registry
# ── Manual Metadata Corrections ──────────────────────────────────────────────
def apply_manual_corrections(registry):
    """
    Apply manual corrections to automatically extracted metadata.

    This ensures 100% accuracy for fields that cannot be reliably extracted.

    Args:
        registry: PlaceholderMetadataRegistry with automatically extracted
            entries (must provide get(key)).

    Returns:
        The same registry instance, with corrections applied in place.
    """
    corrections = {
        # ── Profile ───────────────────────────────────────────────────────────
        "name": {
            "semantic_contract": "Name des Profils aus der Datenbank, keine Transformation",
        },
        "age": {
            "semantic_contract": "Berechnet aus Geburtsdatum (dob) im Profil via calculate_age()",
            "unit": "Jahre",
        },
        "height": {
            "semantic_contract": "Körpergröße aus Profil in cm, unverändert",
        },
        "geschlecht": {
            "semantic_contract": "Geschlecht aus Profil: m='männlich', w='weiblich'",
            "output_type": OutputType.ENUM,
        },
        # ── Body ──────────────────────────────────────────────────────────────
        "weight_aktuell": {
            "semantic_contract": "Letzter verfügbarer Gewichtseintrag aus weight_log, keine Mittelung oder Glättung",
            "confidence_logic": ConfidenceLogic(
                supported=True,
                calculation="Confidence = 'high' if data exists, else 'insufficient'",
                thresholds={"min_data_points": 1},
            ),
        },
        "weight_trend": {
            "semantic_contract": "Gewichtstrend-Beschreibung über 28 Tage: stabil, steigend (+X kg), sinkend (-X kg)",
            "known_issues": ["time_window_inconsistent: Description says 7d/30d, implementation uses 28d"],
            "notes": ["Consider splitting into weight_trend_7d and weight_trend_28d"],
        },
        "kf_aktuell": {
            "semantic_contract": "Letzter berechneter Körperfettanteil aus caliper_log (JPL-7 oder JPL-3 Formel)",
        },
        "caliper_summary": {
            "semantic_contract": "Strukturierte Zusammenfassung der letzten Caliper-Messungen mit Körperfettanteil und Methode",
            "notes": ["Returns formatted text summary, not JSON"],
        },
        "circ_summary": {
            "semantic_contract": "Best-of-Each Strategie: neueste Messung pro Körperstelle mit Altersangabe in Tagen",
            "time_window": TimeWindow.MIXED,
            "notes": ["Different body parts may have different timestamps"],
        },
        "recomposition_quadrant": {
            "semantic_contract": "Klassifizierung basierend auf FM/LBM Änderungen: Optimal Recomposition (FM↓ LBM↑), Fat Loss (FM↓ LBM→), Muscle Gain (FM→ LBM↑), Weight Gain (FM↑ LBM↑)",
            "type": PlaceholderType.INTERPRETED,
        },
        # ── Nutrition ─────────────────────────────────────────────────────────
        "kcal_avg": {
            "semantic_contract": "Durchschnittliche Kalorienaufnahme über 30 Tage aus nutrition_log",
        },
        "protein_avg": {
            "semantic_contract": "Durchschnittliche Proteinaufnahme in g über 30 Tage aus nutrition_log",
        },
        "carb_avg": {
            "semantic_contract": "Durchschnittliche Kohlenhydrataufnahme in g über 30 Tage aus nutrition_log",
        },
        "fat_avg": {
            "semantic_contract": "Durchschnittliche Fettaufnahme in g über 30 Tage aus nutrition_log",
        },
        "nutrition_days": {
            "semantic_contract": "Anzahl der Tage mit Ernährungsdaten in den letzten 30 Tagen",
            "output_type": OutputType.INTEGER,
        },
        "protein_ziel_low": {
            "semantic_contract": "Untere Grenze der Protein-Zielspanne (1.6 g/kg Körpergewicht)",
        },
        "protein_ziel_high": {
            "semantic_contract": "Obere Grenze der Protein-Zielspanne (2.2 g/kg Körpergewicht)",
        },
        "protein_g_per_kg": {
            "semantic_contract": "Aktuelle Proteinaufnahme normiert auf kg Körpergewicht (protein_avg / weight)",
        },
        # ── Training ──────────────────────────────────────────────────────────
        "activity_summary": {
            "semantic_contract": "Strukturierte Zusammenfassung der Trainingsaktivität der letzten 7 Tage",
            "type": PlaceholderType.RAW_DATA,
            "known_issues": ["time_window_ambiguous: Function name suggests variable window, actual implementation unclear"],
        },
        "activity_detail": {
            "semantic_contract": "Detaillierte Liste aller Trainingseinheiten mit Typ, Dauer, Intensität",
            "type": PlaceholderType.RAW_DATA,
            "known_issues": ["time_window_ambiguous: No clear time window specified"],
        },
        "trainingstyp_verteilung": {
            "semantic_contract": "Verteilung der Trainingstypen über einen Zeitraum (Anzahl Sessions pro Typ)",
            "type": PlaceholderType.RAW_DATA,
        },
        # ── Time window ───────────────────────────────────────────────────────
        "datum_heute": {
            "semantic_contract": "Aktuelles Datum im Format YYYY-MM-DD",
            "output_type": OutputType.DATE,
            "format_hint": "2026-03-29",
        },
        "zeitraum_7d": {
            "semantic_contract": "Zeitraum der letzten 7 Tage als Text",
            "format_hint": "letzte 7 Tage (2026-03-22 bis 2026-03-29)",
        },
        "zeitraum_30d": {
            "semantic_contract": "Zeitraum der letzten 30 Tage als Text",
            "format_hint": "letzte 30 Tage (2026-02-27 bis 2026-03-29)",
        },
        "zeitraum_90d": {
            "semantic_contract": "Zeitraum der letzten 90 Tage als Text",
            "format_hint": "letzte 90 Tage (2025-12-29 bis 2026-03-29)",
        },
        # ── Goals & Focus ─────────────────────────────────────────────────────
        "active_goals_json": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.JSON,
            "semantic_contract": "JSON-Array aller aktiven Ziele mit vollständigen Details",
        },
        "active_goals_md": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.MARKDOWN,
            "semantic_contract": "Markdown-formatierte Liste aller aktiven Ziele",
        },
        "focus_areas_weighted_json": {
            "type": PlaceholderType.RAW_DATA,
            "output_type": OutputType.JSON,
            "semantic_contract": "JSON-Array der gewichteten Focus Areas mit Progress",
        },
        "top_3_goals_behind_schedule": {
            "type": PlaceholderType.INTERPRETED,
            "semantic_contract": "Top 3 Ziele mit größter negativer Abweichung vom Zeitplan (Zeit-basiert)",
        },
        "top_3_goals_on_track": {
            "type": PlaceholderType.INTERPRETED,
            "semantic_contract": "Top 3 Ziele mit größter positiver Abweichung vom Zeitplan oder am besten im Plan",
        },
        # ── Scores ────────────────────────────────────────────────────────────
        "goal_progress_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Gewichteter Durchschnitts-Fortschritt aller aktiven Ziele (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "body_progress_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Body Progress Score basierend auf Gewicht/KFA-Ziel-Erreichung (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "nutrition_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Nutrition Score basierend auf Protein Adequacy, Makro-Konsistenz (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "activity_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Activity Score basierend auf Trainingsfrequenz, Qualitätssessions (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        "recovery_score": {
            "type": PlaceholderType.ATOMIC,
            "semantic_contract": "Recovery Score basierend auf Schlaf, HRV, Ruhepuls (0-100)",
            "unit": "%",
            "output_type": OutputType.INTEGER,
        },
        # ── Correlations ──────────────────────────────────────────────────────
        "correlation_energy_weight_lag": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Lag-Korrelation zwischen Energiebilanz und Gewichtsänderung (3d/7d/14d)",
        },
        "correlation_protein_lbm": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Korrelation zwischen Proteinaufnahme und Magermasse-Änderung",
        },
        "plateau_detected": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Plateau-Erkennung: Gewichtsstagnation trotz Kaloriendefizit",
        },
        "top_drivers": {
            "type": PlaceholderType.INTERPRETED,
            "output_type": OutputType.JSON,
            "semantic_contract": "Top Einflussfaktoren auf Ziel-Fortschritt (sortiert nach Impact)",
        },
    }

    skipped = []
    for key, updates in corrections.items():
        metadata = registry.get(key)
        if metadata is None:
            # Previously a correction whose key was missing from the registry
            # was dropped silently; collect it so stale entries are visible.
            skipped.append(key)
            continue
        for field, value in updates.items():
            setattr(metadata, field, value)

    if skipped:
        print(f"⚠ Corrections skipped (placeholder not in registry): {', '.join(skipped)}")

    return registry
def export_complete_metadata(registry, output_path: str = None):
    """
    Export complete metadata to a JSON file.

    Args:
        registry: PlaceholderMetadataRegistry providing get_all().
        output_path: Optional output file path. Defaults to
            ../docs/placeholder_metadata_complete.json relative to this script.

    Returns:
        Path of the written JSON file.
    """
    all_metadata = registry.get_all()

    export_data = {
        "schema_version": "1.0.0",
        # Real generation time in UTC ("Z" suffix). The previous hard-coded
        # constant made every export claim the same generation timestamp.
        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "total_placeholders": len(all_metadata),
        "placeholders": {key: metadata.to_dict() for key, metadata in all_metadata.items()},
    }

    # Default location: <repo>/docs next to the backend directory.
    if not output_path:
        output_path = Path(__file__).parent.parent / "docs" / "placeholder_metadata_complete.json"
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False keeps German umlauts readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, indent=2, ensure_ascii=False)

    print(f"✓ Exported complete metadata to: {output_path}")
    return output_path
def generate_gap_report(registry):
    """
    Build a report of metadata fields that remain unresolved.

    Returns:
        Dict mapping gap category name -> list of placeholder keys.
    """
    categories = (
        "unknown_time_window",
        "unknown_output_type",
        "legacy_unknown_type",
        "missing_semantic_contract",
        "missing_data_layer_module",
        "missing_source_tables",
        "validation_issues",
    )
    gaps = {category: [] for category in categories}

    for key, meta in registry.get_all().items():
        if meta.time_window == TimeWindow.UNKNOWN:
            gaps["unknown_time_window"].append(key)
        if meta.output_type == OutputType.UNKNOWN:
            gaps["unknown_output_type"].append(key)
        if meta.type == PlaceholderType.LEGACY_UNKNOWN:
            gaps["legacy_unknown_type"].append(key)
        # A contract identical to the description means nobody curated it.
        if not meta.semantic_contract or meta.semantic_contract == meta.description:
            gaps["missing_semantic_contract"].append(key)
        if not meta.source.data_layer_module:
            gaps["missing_data_layer_module"].append(key)
        if not meta.source.source_tables:
            gaps["missing_source_tables"].append(key)

    # Flag every placeholder with at least one error-severity validation issue.
    for key, issues in registry.validate_all().items():
        if any(issue.severity == "error" for issue in issues):
            gaps["validation_issues"].append(key)

    return gaps
def print_summary(registry, gaps):
    """
    Print summary statistics for the registry and its gap report.

    Args:
        registry: PlaceholderMetadataRegistry providing get_all().
        gaps: Mapping of gap category -> list of placeholder keys, as
            produced by generate_gap_report().
    """
    all_metadata = registry.get_all()
    total = len(all_metadata)

    # Tally placeholders by type and by category in a single pass.
    by_type = {}
    by_category = {}
    for metadata in all_metadata.values():
        ptype = metadata.type.value
        by_type[ptype] = by_type.get(ptype, 0) + 1
        cat = metadata.category
        by_category[cat] = by_category.get(cat, 0) + 1

    print("\n" + "=" * 60)
    print("PLACEHOLDER METADATA EXTRACTION SUMMARY")
    print("=" * 60)
    print(f"\nTotal Placeholders: {total}")
    print("\nBy Type:")
    for ptype, count in sorted(by_type.items()):
        print(f"  {ptype:20} {count:3} ({count/total*100:5.1f}%)")
    print("\nBy Category:")
    for cat, count in sorted(by_category.items()):
        print(f"  {cat:20} {count:3} ({count/total*100:5.1f}%)")
    print("\nGaps & Unresolved Fields:")
    for gap_type, placeholders in gaps.items():
        if placeholders:
            print(f"  {gap_type:30} {len(placeholders):3} placeholders")

    # Coverage: fraction of (placeholder, gap-category) cells without a gap.
    # Bug fix: the denominator was hard-coded to 6 although the gap report
    # has 7 categories, which understated coverage (it could even go
    # negative). Also guard against an empty registry.
    gap_count = sum(len(v) for v in gaps.values())
    cells = total * len(gaps)
    coverage = (1 - gap_count / cells) * 100 if cells else 100.0
    print(f"\n  Metadata Coverage: {coverage:5.1f}%")
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Run the full metadata generation pipeline.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    print("Building complete placeholder metadata registry...")
    print("(This requires database access)")
    try:
        # 1. Automatic extraction from PLACEHOLDER_MAP (hits the database).
        registry = build_complete_metadata_registry()

        # 2. Hand-curated overrides for fields extraction cannot resolve.
        print("\nApplying manual corrections...")
        registry = apply_manual_corrections(registry)

        # 3. Identify remaining unresolved fields.
        print("\nGenerating gap report...")
        gaps = generate_gap_report(registry)

        print_summary(registry, gaps)

        # 4. Persist the complete registry as JSON.
        print("\nExporting complete metadata...")
        export_complete_metadata(registry)

        print("\n" + "=" * 60)
        print("✓ COMPLETE")
        print("=" * 60)
        print("\nNext steps:")
        print("1. Review gaps in gap report")
        print("2. Manually fill remaining unresolved fields")
        print("3. Run validation: python -m backend.placeholder_metadata_complete")
        print("4. Generate catalog files: python -m backend.generate_placeholder_catalog")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and signal a non-zero exit.
        print(f"\n✗ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())