mitai-jinkendo/backend/generate_placeholder_catalog.py
Lars a04e7cc042
All checks were successful
Deploy Development / deploy (push) Successful in 44s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 13s
feat: Complete Placeholder Metadata System (Normative Standard v1.0.0)
Implements comprehensive metadata system for all 116 placeholders according to
PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE standard.

Backend:
- placeholder_metadata.py: Complete schema (PlaceholderMetadata, Registry, Validation)
- placeholder_metadata_extractor.py: Automatic extraction with heuristics
- placeholder_metadata_complete.py: Hand-curated metadata for all 116 placeholders
- generate_complete_metadata.py: Metadata generation with manual corrections
- generate_placeholder_catalog.py: Documentation generator (4 output files)
- routers/prompts.py: New extended export endpoint (non-breaking)
- tests/test_placeholder_metadata.py: Comprehensive test suite

Documentation:
- PLACEHOLDER_GOVERNANCE.md: Mandatory governance guidelines
- PLACEHOLDER_METADATA_IMPLEMENTATION_SUMMARY.md: Complete implementation docs

Features:
- Normative compliant metadata for all 116 placeholders
- Non-breaking extended export API endpoint
- Automatic + manual metadata curation
- Validation framework with error/warning levels
- Gap reporting for unresolved fields
- Catalog generator (JSON, Markdown, Gap Report, Export Spec)
- Test suite (20+ tests)
- Governance rules for future placeholders

API:
- GET /api/prompts/placeholders/export-values-extended (NEW)
- GET /api/prompts/placeholders/export-values (unchanged, backward compatible)

Architecture:
- PlaceholderType enum: atomic, raw_data, interpreted, legacy_unknown
- TimeWindow enum: latest, 7d, 14d, 28d, 30d, 90d, custom, mixed, unknown
- OutputType enum: string, number, integer, boolean, json, markdown, date, enum
- Complete source tracking (resolver, data_layer, tables)
- Runtime value resolution
- Usage tracking (prompts, pipelines, charts)

Statistics:
- 6 new Python modules (~2500+ lines)
- 1 modified module (extended)
- 2 new documentation files
- 4 generated documentation files (to be created in Docker)
- 20+ test cases
- 116 placeholders inventoried

Next Steps:
1. Run in Docker: python /app/generate_placeholder_catalog.py
2. Test extended export endpoint
3. Verify all 116 placeholders have complete metadata

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 20:32:37 +02:00

531 lines
18 KiB
Python

"""
Placeholder Catalog Generator
Generates comprehensive documentation for all placeholders:
1. PLACEHOLDER_CATALOG_EXTENDED.json - Machine-readable full metadata
2. PLACEHOLDER_CATALOG_EXTENDED.md - Human-readable catalog
3. PLACEHOLDER_GAP_REPORT.md - Technical gaps and issues
4. PLACEHOLDER_EXPORT_SPEC.md - Export format specification
This implements the normative standard for placeholder documentation.
"""
import sys
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from placeholder_metadata import (
PlaceholderMetadata,
PlaceholderType,
TimeWindow,
OutputType,
METADATA_REGISTRY
)
from placeholder_metadata_extractor import build_complete_metadata_registry
from generate_complete_metadata import apply_manual_corrections, generate_gap_report
# ── 1. JSON Catalog ───────────────────────────────────────────────────────────
def generate_json_catalog(registry, output_dir: Path):
    """Generate PLACEHOLDER_CATALOG_EXTENDED.json — the machine-readable catalog.

    Serializes every placeholder's metadata dict, sorted by key so the
    output is deterministic and diff-friendly, together with schema header
    fields (version, timestamp, normative-standard reference, count).

    Args:
        registry: Metadata registry exposing ``get_all() -> dict[str, meta]``
            where each value implements ``to_dict()``.
        output_dir: Directory that receives the generated JSON file.

    Returns:
        Path of the written catalog file.
    """
    entries = registry.get_all()
    catalog = {
        "schema_version": "1.0.0",
        "generated_at": datetime.now().isoformat(),
        "normative_standard": "PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md",
        "total_placeholders": len(entries),
        # Sorted insertion keeps the serialized key order stable across runs.
        "placeholders": {key: meta.to_dict() for key, meta in sorted(entries.items())},
    }
    output_path = output_dir / "PLACEHOLDER_CATALOG_EXTENDED.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, indent=2, ensure_ascii=False)
    print(f"Generated: {output_path}")
    return output_path
# ── 2. Markdown Catalog ───────────────────────────────────────────────────────
def generate_markdown_catalog(registry, output_dir: Path):
    """Generate PLACEHOLDER_CATALOG_EXTENDED.md

    Renders the human-readable catalog: a header, summary statistics
    (placeholder counts by type and by category), then one detailed section
    per placeholder grouped by category. Returns the written file's path.
    """
    all_metadata = registry.get_all()
    by_category = registry.get_by_category()
    md = []  # document accumulated line-by-line, joined with "\n" at the end
    md.append("# Placeholder Catalog (Extended)")
    md.append("")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Total Placeholders:** {len(all_metadata)}")
    md.append(f"**Normative Standard:** PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md")
    md.append("")
    md.append("---")
    md.append("")
    # Summary Statistics
    md.append("## Summary Statistics")
    md.append("")
    # By Type — tally how many placeholders carry each PlaceholderType value.
    by_type = {}
    for metadata in all_metadata.values():
        ptype = metadata.type.value
        by_type[ptype] = by_type.get(ptype, 0) + 1
    md.append("### By Type")
    md.append("")
    md.append("| Type | Count | Percentage |")
    md.append("|------|-------|------------|")
    for ptype, count in sorted(by_type.items()):
        pct = count / len(all_metadata) * 100
        md.append(f"| {ptype} | {count} | {pct:.1f}% |")
    md.append("")
    # By Category
    md.append("### By Category")
    md.append("")
    md.append("| Category | Count |")
    md.append("|----------|-------|")
    for category, metadata_list in sorted(by_category.items()):
        md.append(f"| {category} | {len(metadata_list)} |")
    md.append("")
    md.append("---")
    md.append("")
    # Detailed Catalog by Category
    md.append("## Detailed Placeholder Catalog")
    md.append("")
    for category, metadata_list in sorted(by_category.items()):
        md.append(f"### {category} ({len(metadata_list)} placeholders)")
        md.append("")
        for metadata in sorted(metadata_list, key=lambda m: m.key):
            # Quadruple braces render as a literal {{key}} in the f-string output.
            md.append(f"#### `{{{{{metadata.key}}}}}`")
            md.append("")
            md.append(f"**Description:** {metadata.description}")
            md.append("")
            md.append(f"**Semantic Contract:** {metadata.semantic_contract}")
            md.append("")
            # Metadata table
            md.append("| Property | Value |")
            md.append("|----------|-------|")
            md.append(f"| Type | `{metadata.type.value}` |")
            md.append(f"| Time Window | `{metadata.time_window.value}` |")
            md.append(f"| Output Type | `{metadata.output_type.value}` |")
            md.append(f"| Unit | {metadata.unit or 'None'} |")
            md.append(f"| Format Hint | {metadata.format_hint or 'None'} |")
            md.append(f"| Version | {metadata.version} |")
            md.append(f"| Deprecated | {metadata.deprecated} |")
            md.append("")
            # Source — optional fields are emitted only when present.
            md.append("**Source:**")
            md.append(f"- Resolver: `{metadata.source.resolver}`")
            md.append(f"- Module: `{metadata.source.module}`")
            if metadata.source.function:
                md.append(f"- Function: `{metadata.source.function}`")
            if metadata.source.data_layer_module:
                md.append(f"- Data Layer: `{metadata.source.data_layer_module}`")
            if metadata.source.source_tables:
                tables = ", ".join([f"`{t}`" for t in metadata.source.source_tables])
                md.append(f"- Tables: {tables}")
            md.append("")
            # Known Issues
            if metadata.known_issues:
                md.append("**Known Issues:**")
                for issue in metadata.known_issues:
                    md.append(f"- {issue}")
                md.append("")
            # Notes
            if metadata.notes:
                md.append("**Notes:**")
                for note in metadata.notes:
                    md.append(f"- {note}")
                md.append("")
            # Horizontal rule separates consecutive placeholder sections.
            md.append("---")
            md.append("")
    output_path = output_dir / "PLACEHOLDER_CATALOG_EXTENDED.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── 3. Gap Report ─────────────────────────────────────────────────────────────
def generate_gap_report_md(registry, gaps: Dict, output_dir: Path):
    """Generate PLACEHOLDER_GAP_REPORT.md

    Writes a markdown report of placeholders with incomplete or unresolved
    metadata fields: a coverage summary, a per-gap-type breakdown grouped by
    category, and remediation recommendations for the known gap types.

    Args:
        registry: Metadata registry exposing ``get_all()`` and ``get(key)``
            (the latter returning an object with a ``category`` attribute).
        gaps: Mapping of gap-type name -> list of affected placeholder keys.
        output_dir: Directory that receives the generated markdown file.

    Returns:
        Path of the written report.
    """
    all_metadata = registry.get_all()
    total = len(all_metadata)
    md = []
    md.append("# Placeholder Metadata Gap Report")
    md.append("")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Total Placeholders:** {total}")
    md.append("")
    md.append("This report identifies placeholders with incomplete or unresolved metadata fields.")
    md.append("")
    md.append("---")
    md.append("")
    # Summary
    gap_count = sum(len(v) for v in gaps.values())
    # Coverage is measured against 6 tracked gap types per placeholder.
    # Guard the empty-registry case: without it, total == 0 raises
    # ZeroDivisionError; report 100% coverage (vacuously true) instead.
    coverage = (1 - gap_count / (total * 6)) * 100 if total else 100.0
    md.append("## Summary")
    md.append("")
    md.append(f"- **Total Gap Instances:** {gap_count}")
    md.append(f"- **Metadata Coverage:** {coverage:.1f}%")
    md.append("")
    # Detailed Gaps
    md.append("## Detailed Gap Analysis")
    md.append("")
    for gap_type, placeholders in sorted(gaps.items()):
        if not placeholders:
            continue  # skip gap types with no offenders
        md.append(f"### {gap_type.replace('_', ' ').title()}")
        md.append("")
        md.append(f"**Count:** {len(placeholders)}")
        md.append("")
        # Group the affected placeholder keys by their category for display.
        by_cat = {}
        for key in placeholders:
            metadata = registry.get(key)
            if metadata:
                by_cat.setdefault(metadata.category, []).append(key)
        for category, keys in sorted(by_cat.items()):
            md.append(f"#### {category}")
            md.append("")
            for key in sorted(keys):
                md.append(f"- `{{{{{key}}}}}`")
            md.append("")
    # Recommendations — one remediation section per known gap type present.
    md.append("---")
    md.append("")
    md.append("## Recommendations")
    md.append("")
    if gaps.get('unknown_time_window'):
        md.append("### Time Window Resolution")
        md.append("")
        md.append("Placeholders with unknown time windows should be analyzed to determine:")
        md.append("- Whether they use `latest`, `7d`, `28d`, `30d`, `90d`, or `custom`")
        md.append("- Document in semantic_contract if time window is variable")
        md.append("")
    if gaps.get('legacy_unknown_type'):
        md.append("### Type Classification")
        md.append("")
        md.append("Placeholders with `legacy_unknown` type should be classified as:")
        md.append("- `atomic` - Single atomic value")
        md.append("- `raw_data` - Structured raw data (JSON, lists)")
        md.append("- `interpreted` - AI-interpreted or derived values")
        md.append("")
    if gaps.get('missing_data_layer_module'):
        md.append("### Data Layer Tracking")
        md.append("")
        md.append("Placeholders without data_layer_module should be investigated:")
        md.append("- Check if they call data_layer functions")
        md.append("- Document direct database access if no data_layer function exists")
        md.append("")
    output_path = output_dir / "PLACEHOLDER_GAP_REPORT.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── 4. Export Spec ────────────────────────────────────────────────────────────
def generate_export_spec_md(output_dir: Path):
    """Generate PLACEHOLDER_EXPORT_SPEC.md

    Writes a static specification document describing the two placeholder
    export API endpoints (legacy and extended) and the extended export's
    JSON format. Only the generation timestamp varies between runs.

    Returns the written file's path.
    """
    md = []  # document accumulated line-by-line, joined with "\n" at the end
    md.append("# Placeholder Export Specification")
    md.append("")
    md.append(f"**Version:** 1.0.0")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Normative Standard:** PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md")
    md.append("")
    md.append("---")
    md.append("")
    # Overview
    md.append("## Overview")
    md.append("")
    md.append("The Placeholder Export API provides two endpoints:")
    md.append("")
    md.append("1. **Legacy Export** (`/api/prompts/placeholders/export-values`)")
    md.append(" - Backward-compatible format")
    md.append(" - Simple key-value pairs")
    md.append(" - Organized by category")
    md.append("")
    md.append("2. **Extended Export** (`/api/prompts/placeholders/export-values-extended`)")
    md.append(" - Complete normative metadata")
    md.append(" - Runtime value resolution")
    md.append(" - Gap analysis")
    md.append(" - Validation results")
    md.append("")
    # Extended Export Format — fenced JSON example of the response envelope
    md.append("## Extended Export Format")
    md.append("")
    md.append("### Root Structure")
    md.append("")
    md.append("```json")
    md.append("{")
    md.append(' "schema_version": "1.0.0",')
    md.append(' "export_date": "2026-03-29T12:00:00Z",')
    md.append(' "profile_id": "user-123",')
    md.append(' "legacy": { ... },')
    md.append(' "metadata": { ... },')
    md.append(' "validation": { ... }')
    md.append("}")
    md.append("```")
    md.append("")
    # Legacy Section
    md.append("### Legacy Section")
    md.append("")
    md.append("Maintains backward compatibility with existing export consumers.")
    md.append("")
    md.append("```json")
    md.append('"legacy": {')
    md.append(' "all_placeholders": {')
    md.append(' "weight_aktuell": "85.8 kg",')
    md.append(' "name": "Max Mustermann",')
    md.append(' ...')
    md.append(' },')
    md.append(' "placeholders_by_category": {')
    md.append(' "Körper": [')
    md.append(' {')
    md.append(' "key": "{{weight_aktuell}}",')
    md.append(' "description": "Aktuelles Gewicht in kg",')
    md.append(' "value": "85.8 kg",')
    md.append(' "example": "85.8 kg"')
    md.append(' },')
    md.append(' ...')
    md.append(' ],')
    md.append(' ...')
    md.append(' },')
    md.append(' "count": 116')
    md.append('}')
    md.append("```")
    md.append("")
    # Metadata Section
    md.append("### Metadata Section")
    md.append("")
    md.append("Complete normative metadata for all placeholders.")
    md.append("")
    md.append("```json")
    md.append('"metadata": {')
    md.append(' "flat": [')
    md.append(' {')
    md.append(' "key": "weight_aktuell",')
    md.append(' "placeholder": "{{weight_aktuell}}",')
    md.append(' "category": "Körper",')
    md.append(' "type": "atomic",')
    md.append(' "description": "Aktuelles Gewicht in kg",')
    md.append(' "semantic_contract": "Letzter verfügbarer Gewichtseintrag...",')
    md.append(' "unit": "kg",')
    md.append(' "time_window": "latest",')
    md.append(' "output_type": "number",')
    md.append(' "format_hint": "85.8 kg",')
    md.append(' "value_display": "85.8 kg",')
    md.append(' "value_raw": 85.8,')
    md.append(' "available": true,')
    md.append(' "source": {')
    md.append(' "resolver": "get_latest_weight",')
    md.append(' "module": "placeholder_resolver.py",')
    md.append(' "function": "get_latest_weight_data",')
    md.append(' "data_layer_module": "body_metrics",')
    md.append(' "source_tables": ["weight_log"]')
    md.append(' },')
    md.append(' ...')
    md.append(' },')
    md.append(' ...')
    md.append(' ],')
    md.append(' "by_category": { ... },')
    md.append(' "summary": {')
    md.append(' "total_placeholders": 116,')
    md.append(' "available": 98,')
    md.append(' "missing": 18,')
    md.append(' "by_type": {')
    md.append(' "atomic": 85,')
    md.append(' "interpreted": 20,')
    md.append(' "raw_data": 8,')
    md.append(' "legacy_unknown": 3')
    md.append(' },')
    md.append(' "coverage": {')
    md.append(' "fully_resolved": 75,')
    md.append(' "partially_resolved": 30,')
    md.append(' "unresolved": 11')
    md.append(' }')
    md.append(' },')
    md.append(' "gaps": {')
    md.append(' "unknown_time_window": ["placeholder1", ...],')
    md.append(' "missing_semantic_contract": [...],')
    md.append(' ...')
    md.append(' }')
    md.append('}')
    md.append("```")
    md.append("")
    # Validation Section
    md.append("### Validation Section")
    md.append("")
    md.append("Results of normative standard validation.")
    md.append("")
    md.append("```json")
    md.append('"validation": {')
    md.append(' "compliant": 89,')
    md.append(' "non_compliant": 27,')
    md.append(' "issues": [')
    md.append(' {')
    md.append(' "placeholder": "activity_summary",')
    md.append(' "violations": [')
    md.append(' {')
    md.append(' "field": "time_window",')
    md.append(' "issue": "Time window UNKNOWN should be resolved",')
    md.append(' "severity": "warning"')
    md.append(' }')
    md.append(' ]')
    md.append(' },')
    md.append(' ...')
    md.append(' ]')
    md.append('}')
    md.append("```")
    md.append("")
    # Usage — example requests for both endpoints
    md.append("## API Usage")
    md.append("")
    md.append("### Legacy Export")
    md.append("")
    md.append("```bash")
    md.append("GET /api/prompts/placeholders/export-values")
    md.append("Header: X-Auth-Token: <token>")
    md.append("```")
    md.append("")
    md.append("### Extended Export")
    md.append("")
    md.append("```bash")
    md.append("GET /api/prompts/placeholders/export-values-extended")
    md.append("Header: X-Auth-Token: <token>")
    md.append("```")
    md.append("")
    # Standards Compliance
    md.append("## Standards Compliance")
    md.append("")
    md.append("The extended export implements the following normative requirements:")
    md.append("")
    md.append("1. **Non-Breaking:** Legacy export remains unchanged")
    md.append("2. **Complete Metadata:** All fields from normative standard")
    md.append("3. **Runtime Resolution:** Values resolved for current profile")
    md.append("4. **Gap Transparency:** Unresolved fields explicitly marked")
    md.append("5. **Validation:** Automated compliance checking")
    md.append("6. **Versioning:** Schema version for future evolution")
    md.append("")
    output_path = output_dir / "PLACEHOLDER_EXPORT_SPEC.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Entry point: build the metadata registry and emit all catalog files.

    Generates the four documentation artifacts (JSON catalog, markdown
    catalog, gap report, export spec) into ``<repo>/docs``. Returns a
    process exit code: 0 on success, 1 on any error (with traceback).
    """
    banner = "=" * 60
    print(banner)
    print("PLACEHOLDER CATALOG GENERATOR")
    print(banner)
    print()
    # Output directory lives one level above this script, under docs/.
    docs_dir = Path(__file__).parent.parent / "docs"
    docs_dir.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {docs_dir}")
    print()
    try:
        # Build the registry, then layer the hand-curated corrections on top.
        print("Building metadata registry...")
        registry = apply_manual_corrections(build_complete_metadata_registry())
        print(f"Loaded {registry.count()} placeholders")
        print()
        print("Analyzing gaps...")
        gaps = generate_gap_report(registry)
        print()
        print("Generating documentation files...")
        print()
        generate_json_catalog(registry, docs_dir)
        generate_markdown_catalog(registry, docs_dir)
        generate_gap_report_md(registry, gaps, docs_dir)
        generate_export_spec_md(docs_dir)
        print()
        print(banner)
        print("CATALOG GENERATION COMPLETE")
        print(banner)
        print()
        print("Generated files:")
        generated = (
            "PLACEHOLDER_CATALOG_EXTENDED.json",
            "PLACEHOLDER_CATALOG_EXTENDED.md",
            "PLACEHOLDER_GAP_REPORT.md",
            "PLACEHOLDER_EXPORT_SPEC.md",
        )
        for idx, filename in enumerate(generated, start=1):
            print(f" {idx}. {docs_dir}/{filename}")
        print()
        return 0
    except Exception as exc:
        # Top-level boundary: report, dump the traceback, signal failure.
        print()
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return 1
if __name__ == "__main__":
sys.exit(main())