mitai-jinkendo/backend/generate_placeholder_catalog.py
Lars a04e7cc042
All checks were successful
Deploy Development / deploy (push) Successful in 44s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 13s
feat: Complete Placeholder Metadata System (Normative Standard v1.0.0)
Implements comprehensive metadata system for all 116 placeholders according to
PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE standard.

Backend:
- placeholder_metadata.py: Complete schema (PlaceholderMetadata, Registry, Validation)
- placeholder_metadata_extractor.py: Automatic extraction with heuristics
- placeholder_metadata_complete.py: Hand-curated metadata for all 116 placeholders
- generate_complete_metadata.py: Metadata generation with manual corrections
- generate_placeholder_catalog.py: Documentation generator (4 output files)
- routers/prompts.py: New extended export endpoint (non-breaking)
- tests/test_placeholder_metadata.py: Comprehensive test suite

Documentation:
- PLACEHOLDER_GOVERNANCE.md: Mandatory governance guidelines
- PLACEHOLDER_METADATA_IMPLEMENTATION_SUMMARY.md: Complete implementation docs

Features:
- Normative compliant metadata for all 116 placeholders
- Non-breaking extended export API endpoint
- Automatic + manual metadata curation
- Validation framework with error/warning levels
- Gap reporting for unresolved fields
- Catalog generator (JSON, Markdown, Gap Report, Export Spec)
- Test suite (20+ tests)
- Governance rules for future placeholders

API:
- GET /api/prompts/placeholders/export-values-extended (NEW)
- GET /api/prompts/placeholders/export-values (unchanged, backward compatible)

Architecture:
- PlaceholderType enum: atomic, raw_data, interpreted, legacy_unknown
- TimeWindow enum: latest, 7d, 14d, 28d, 30d, 90d, custom, mixed, unknown
- OutputType enum: string, number, integer, boolean, json, markdown, date, enum
- Complete source tracking (resolver, data_layer, tables)
- Runtime value resolution
- Usage tracking (prompts, pipelines, charts)

Statistics:
- 6 new Python modules (~2500+ lines)
- 1 modified module (extended)
- 2 new documentation files
- 4 generated documentation files (to be created in Docker)
- 20+ test cases
- 116 placeholders inventoried

Next Steps:
1. Run in Docker: python /app/generate_placeholder_catalog.py
2. Test extended export endpoint
3. Verify all 116 placeholders have complete metadata

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 20:32:37 +02:00

531 lines
18 KiB
Python

"""
Placeholder Catalog Generator
Generates comprehensive documentation for all placeholders:
1. PLACEHOLDER_CATALOG_EXTENDED.json - Machine-readable full metadata
2. PLACEHOLDER_CATALOG_EXTENDED.md - Human-readable catalog
3. PLACEHOLDER_GAP_REPORT.md - Technical gaps and issues
4. PLACEHOLDER_EXPORT_SPEC.md - Export format specification
This implements the normative standard for placeholder documentation.
"""
import sys
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any
# Add backend to path
sys.path.insert(0, str(Path(__file__).parent))
from placeholder_metadata import (
PlaceholderMetadata,
PlaceholderType,
TimeWindow,
OutputType,
METADATA_REGISTRY
)
from placeholder_metadata_extractor import build_complete_metadata_registry
from generate_complete_metadata import apply_manual_corrections, generate_gap_report
# ── 1. JSON Catalog ───────────────────────────────────────────────────────────
def generate_json_catalog(registry, output_dir: Path):
    """Generate PLACEHOLDER_CATALOG_EXTENDED.json — the machine-readable catalog.

    Serializes every placeholder's metadata dict, sorted by key so the
    output is deterministic and diff-friendly, together with schema header
    fields (version, timestamp, normative-standard reference, count).

    Args:
        registry: Metadata registry exposing ``get_all() -> dict[str, meta]``
            where each value implements ``to_dict()``.
        output_dir: Directory that receives the generated JSON file.

    Returns:
        Path of the written catalog file.
    """
    entries = registry.get_all()
    catalog = {
        "schema_version": "1.0.0",
        "generated_at": datetime.now().isoformat(),
        "normative_standard": "PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md",
        "total_placeholders": len(entries),
        # Sorted insertion keeps the serialized key order stable across runs.
        "placeholders": {key: meta.to_dict() for key, meta in sorted(entries.items())},
    }
    output_path = output_dir / "PLACEHOLDER_CATALOG_EXTENDED.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(catalog, f, indent=2, ensure_ascii=False)
    print(f"Generated: {output_path}")
    return output_path
# ── 2. Markdown Catalog ───────────────────────────────────────────────────────
def generate_markdown_catalog(registry, output_dir: Path):
    """Generate PLACEHOLDER_CATALOG_EXTENDED.md

    Renders the human-readable catalog: a header, summary statistics
    (placeholder counts by type and by category), then one detailed section
    per placeholder grouped by category. Returns the written file's path.
    """
    all_metadata = registry.get_all()
    by_category = registry.get_by_category()
    md = []  # document accumulated line-by-line, joined with "\n" at the end
    md.append("# Placeholder Catalog (Extended)")
    md.append("")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Total Placeholders:** {len(all_metadata)}")
    md.append(f"**Normative Standard:** PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md")
    md.append("")
    md.append("---")
    md.append("")
    # Summary Statistics
    md.append("## Summary Statistics")
    md.append("")
    # By Type — tally how many placeholders carry each PlaceholderType value.
    by_type = {}
    for metadata in all_metadata.values():
        ptype = metadata.type.value
        by_type[ptype] = by_type.get(ptype, 0) + 1
    md.append("### By Type")
    md.append("")
    md.append("| Type | Count | Percentage |")
    md.append("|------|-------|------------|")
    for ptype, count in sorted(by_type.items()):
        pct = count / len(all_metadata) * 100
        md.append(f"| {ptype} | {count} | {pct:.1f}% |")
    md.append("")
    # By Category
    md.append("### By Category")
    md.append("")
    md.append("| Category | Count |")
    md.append("|----------|-------|")
    for category, metadata_list in sorted(by_category.items()):
        md.append(f"| {category} | {len(metadata_list)} |")
    md.append("")
    md.append("---")
    md.append("")
    # Detailed Catalog by Category
    md.append("## Detailed Placeholder Catalog")
    md.append("")
    for category, metadata_list in sorted(by_category.items()):
        md.append(f"### {category} ({len(metadata_list)} placeholders)")
        md.append("")
        for metadata in sorted(metadata_list, key=lambda m: m.key):
            # Quadruple braces render as a literal {{key}} in the f-string output.
            md.append(f"#### `{{{{{metadata.key}}}}}`")
            md.append("")
            md.append(f"**Description:** {metadata.description}")
            md.append("")
            md.append(f"**Semantic Contract:** {metadata.semantic_contract}")
            md.append("")
            # Metadata table
            md.append("| Property | Value |")
            md.append("|----------|-------|")
            md.append(f"| Type | `{metadata.type.value}` |")
            md.append(f"| Time Window | `{metadata.time_window.value}` |")
            md.append(f"| Output Type | `{metadata.output_type.value}` |")
            md.append(f"| Unit | {metadata.unit or 'None'} |")
            md.append(f"| Format Hint | {metadata.format_hint or 'None'} |")
            md.append(f"| Version | {metadata.version} |")
            md.append(f"| Deprecated | {metadata.deprecated} |")
            md.append("")
            # Source — optional fields are emitted only when present.
            md.append("**Source:**")
            md.append(f"- Resolver: `{metadata.source.resolver}`")
            md.append(f"- Module: `{metadata.source.module}`")
            if metadata.source.function:
                md.append(f"- Function: `{metadata.source.function}`")
            if metadata.source.data_layer_module:
                md.append(f"- Data Layer: `{metadata.source.data_layer_module}`")
            if metadata.source.source_tables:
                tables = ", ".join([f"`{t}`" for t in metadata.source.source_tables])
                md.append(f"- Tables: {tables}")
            md.append("")
            # Known Issues
            if metadata.known_issues:
                md.append("**Known Issues:**")
                for issue in metadata.known_issues:
                    md.append(f"- {issue}")
                md.append("")
            # Notes
            if metadata.notes:
                md.append("**Notes:**")
                for note in metadata.notes:
                    md.append(f"- {note}")
                md.append("")
            # Horizontal rule separates consecutive placeholder sections.
            md.append("---")
            md.append("")
    output_path = output_dir / "PLACEHOLDER_CATALOG_EXTENDED.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── 3. Gap Report ─────────────────────────────────────────────────────────────
def generate_gap_report_md(registry, gaps: Dict, output_dir: Path):
    """Generate PLACEHOLDER_GAP_REPORT.md

    Writes a markdown report of placeholders with incomplete or unresolved
    metadata fields: a coverage summary, a per-gap-type breakdown grouped by
    category, and remediation recommendations for the known gap types.

    Args:
        registry: Metadata registry exposing ``get_all()`` and ``get(key)``
            (the latter returning an object with a ``category`` attribute).
        gaps: Mapping of gap-type name -> list of affected placeholder keys.
        output_dir: Directory that receives the generated markdown file.

    Returns:
        Path of the written report.
    """
    all_metadata = registry.get_all()
    total = len(all_metadata)
    md = []
    md.append("# Placeholder Metadata Gap Report")
    md.append("")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Total Placeholders:** {total}")
    md.append("")
    md.append("This report identifies placeholders with incomplete or unresolved metadata fields.")
    md.append("")
    md.append("---")
    md.append("")
    # Summary
    gap_count = sum(len(v) for v in gaps.values())
    # Coverage is measured against 6 tracked gap types per placeholder.
    # Guard the empty-registry case: without it, total == 0 raises
    # ZeroDivisionError; report 100% coverage (vacuously true) instead.
    coverage = (1 - gap_count / (total * 6)) * 100 if total else 100.0
    md.append("## Summary")
    md.append("")
    md.append(f"- **Total Gap Instances:** {gap_count}")
    md.append(f"- **Metadata Coverage:** {coverage:.1f}%")
    md.append("")
    # Detailed Gaps
    md.append("## Detailed Gap Analysis")
    md.append("")
    for gap_type, placeholders in sorted(gaps.items()):
        if not placeholders:
            continue  # skip gap types with no offenders
        md.append(f"### {gap_type.replace('_', ' ').title()}")
        md.append("")
        md.append(f"**Count:** {len(placeholders)}")
        md.append("")
        # Group the affected placeholder keys by their category for display.
        by_cat = {}
        for key in placeholders:
            metadata = registry.get(key)
            if metadata:
                by_cat.setdefault(metadata.category, []).append(key)
        for category, keys in sorted(by_cat.items()):
            md.append(f"#### {category}")
            md.append("")
            for key in sorted(keys):
                md.append(f"- `{{{{{key}}}}}`")
            md.append("")
    # Recommendations — one remediation section per known gap type present.
    md.append("---")
    md.append("")
    md.append("## Recommendations")
    md.append("")
    if gaps.get('unknown_time_window'):
        md.append("### Time Window Resolution")
        md.append("")
        md.append("Placeholders with unknown time windows should be analyzed to determine:")
        md.append("- Whether they use `latest`, `7d`, `28d`, `30d`, `90d`, or `custom`")
        md.append("- Document in semantic_contract if time window is variable")
        md.append("")
    if gaps.get('legacy_unknown_type'):
        md.append("### Type Classification")
        md.append("")
        md.append("Placeholders with `legacy_unknown` type should be classified as:")
        md.append("- `atomic` - Single atomic value")
        md.append("- `raw_data` - Structured raw data (JSON, lists)")
        md.append("- `interpreted` - AI-interpreted or derived values")
        md.append("")
    if gaps.get('missing_data_layer_module'):
        md.append("### Data Layer Tracking")
        md.append("")
        md.append("Placeholders without data_layer_module should be investigated:")
        md.append("- Check if they call data_layer functions")
        md.append("- Document direct database access if no data_layer function exists")
        md.append("")
    output_path = output_dir / "PLACEHOLDER_GAP_REPORT.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── 4. Export Spec ────────────────────────────────────────────────────────────
def generate_export_spec_md(output_dir: Path):
    """Generate PLACEHOLDER_EXPORT_SPEC.md

    Writes a static specification document describing the two placeholder
    export API endpoints (legacy and extended) and the extended export's
    JSON format. Only the generation timestamp varies between runs.

    Returns the written file's path.
    """
    md = []  # document accumulated line-by-line, joined with "\n" at the end
    md.append("# Placeholder Export Specification")
    md.append("")
    md.append(f"**Version:** 1.0.0")
    md.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    md.append(f"**Normative Standard:** PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md")
    md.append("")
    md.append("---")
    md.append("")
    # Overview
    md.append("## Overview")
    md.append("")
    md.append("The Placeholder Export API provides two endpoints:")
    md.append("")
    md.append("1. **Legacy Export** (`/api/prompts/placeholders/export-values`)")
    md.append(" - Backward-compatible format")
    md.append(" - Simple key-value pairs")
    md.append(" - Organized by category")
    md.append("")
    md.append("2. **Extended Export** (`/api/prompts/placeholders/export-values-extended`)")
    md.append(" - Complete normative metadata")
    md.append(" - Runtime value resolution")
    md.append(" - Gap analysis")
    md.append(" - Validation results")
    md.append("")
    # Extended Export Format — fenced JSON example of the response envelope
    md.append("## Extended Export Format")
    md.append("")
    md.append("### Root Structure")
    md.append("")
    md.append("```json")
    md.append("{")
    md.append(' "schema_version": "1.0.0",')
    md.append(' "export_date": "2026-03-29T12:00:00Z",')
    md.append(' "profile_id": "user-123",')
    md.append(' "legacy": { ... },')
    md.append(' "metadata": { ... },')
    md.append(' "validation": { ... }')
    md.append("}")
    md.append("```")
    md.append("")
    # Legacy Section
    md.append("### Legacy Section")
    md.append("")
    md.append("Maintains backward compatibility with existing export consumers.")
    md.append("")
    md.append("```json")
    md.append('"legacy": {')
    md.append(' "all_placeholders": {')
    md.append(' "weight_aktuell": "85.8 kg",')
    md.append(' "name": "Max Mustermann",')
    md.append(' ...')
    md.append(' },')
    md.append(' "placeholders_by_category": {')
    md.append(' "Körper": [')
    md.append(' {')
    md.append(' "key": "{{weight_aktuell}}",')
    md.append(' "description": "Aktuelles Gewicht in kg",')
    md.append(' "value": "85.8 kg",')
    md.append(' "example": "85.8 kg"')
    md.append(' },')
    md.append(' ...')
    md.append(' ],')
    md.append(' ...')
    md.append(' },')
    md.append(' "count": 116')
    md.append('}')
    md.append("```")
    md.append("")
    # Metadata Section
    md.append("### Metadata Section")
    md.append("")
    md.append("Complete normative metadata for all placeholders.")
    md.append("")
    md.append("```json")
    md.append('"metadata": {')
    md.append(' "flat": [')
    md.append(' {')
    md.append(' "key": "weight_aktuell",')
    md.append(' "placeholder": "{{weight_aktuell}}",')
    md.append(' "category": "Körper",')
    md.append(' "type": "atomic",')
    md.append(' "description": "Aktuelles Gewicht in kg",')
    md.append(' "semantic_contract": "Letzter verfügbarer Gewichtseintrag...",')
    md.append(' "unit": "kg",')
    md.append(' "time_window": "latest",')
    md.append(' "output_type": "number",')
    md.append(' "format_hint": "85.8 kg",')
    md.append(' "value_display": "85.8 kg",')
    md.append(' "value_raw": 85.8,')
    md.append(' "available": true,')
    md.append(' "source": {')
    md.append(' "resolver": "get_latest_weight",')
    md.append(' "module": "placeholder_resolver.py",')
    md.append(' "function": "get_latest_weight_data",')
    md.append(' "data_layer_module": "body_metrics",')
    md.append(' "source_tables": ["weight_log"]')
    md.append(' },')
    md.append(' ...')
    md.append(' },')
    md.append(' ...')
    md.append(' ],')
    md.append(' "by_category": { ... },')
    md.append(' "summary": {')
    md.append(' "total_placeholders": 116,')
    md.append(' "available": 98,')
    md.append(' "missing": 18,')
    md.append(' "by_type": {')
    md.append(' "atomic": 85,')
    md.append(' "interpreted": 20,')
    md.append(' "raw_data": 8,')
    md.append(' "legacy_unknown": 3')
    md.append(' },')
    md.append(' "coverage": {')
    md.append(' "fully_resolved": 75,')
    md.append(' "partially_resolved": 30,')
    md.append(' "unresolved": 11')
    md.append(' }')
    md.append(' },')
    md.append(' "gaps": {')
    md.append(' "unknown_time_window": ["placeholder1", ...],')
    md.append(' "missing_semantic_contract": [...],')
    md.append(' ...')
    md.append(' }')
    md.append('}')
    md.append("```")
    md.append("")
    # Validation Section
    md.append("### Validation Section")
    md.append("")
    md.append("Results of normative standard validation.")
    md.append("")
    md.append("```json")
    md.append('"validation": {')
    md.append(' "compliant": 89,')
    md.append(' "non_compliant": 27,')
    md.append(' "issues": [')
    md.append(' {')
    md.append(' "placeholder": "activity_summary",')
    md.append(' "violations": [')
    md.append(' {')
    md.append(' "field": "time_window",')
    md.append(' "issue": "Time window UNKNOWN should be resolved",')
    md.append(' "severity": "warning"')
    md.append(' }')
    md.append(' ]')
    md.append(' },')
    md.append(' ...')
    md.append(' ]')
    md.append('}')
    md.append("```")
    md.append("")
    # Usage — example requests for both endpoints
    md.append("## API Usage")
    md.append("")
    md.append("### Legacy Export")
    md.append("")
    md.append("```bash")
    md.append("GET /api/prompts/placeholders/export-values")
    md.append("Header: X-Auth-Token: <token>")
    md.append("```")
    md.append("")
    md.append("### Extended Export")
    md.append("")
    md.append("```bash")
    md.append("GET /api/prompts/placeholders/export-values-extended")
    md.append("Header: X-Auth-Token: <token>")
    md.append("```")
    md.append("")
    # Standards Compliance
    md.append("## Standards Compliance")
    md.append("")
    md.append("The extended export implements the following normative requirements:")
    md.append("")
    md.append("1. **Non-Breaking:** Legacy export remains unchanged")
    md.append("2. **Complete Metadata:** All fields from normative standard")
    md.append("3. **Runtime Resolution:** Values resolved for current profile")
    md.append("4. **Gap Transparency:** Unresolved fields explicitly marked")
    md.append("5. **Validation:** Automated compliance checking")
    md.append("6. **Versioning:** Schema version for future evolution")
    md.append("")
    output_path = output_dir / "PLACEHOLDER_EXPORT_SPEC.md"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md))
    print(f"Generated: {output_path}")
    return output_path
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """Entry point: build the metadata registry and emit all catalog files.

    Generates the four documentation artifacts (JSON catalog, markdown
    catalog, gap report, export spec) into ``<repo>/docs``. Returns a
    process exit code: 0 on success, 1 on any error (with traceback).
    """
    banner = "=" * 60
    print(banner)
    print("PLACEHOLDER CATALOG GENERATOR")
    print(banner)
    print()
    # Output directory lives one level above this script, under docs/.
    docs_dir = Path(__file__).parent.parent / "docs"
    docs_dir.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {docs_dir}")
    print()
    try:
        # Build the registry, then layer the hand-curated corrections on top.
        print("Building metadata registry...")
        registry = apply_manual_corrections(build_complete_metadata_registry())
        print(f"Loaded {registry.count()} placeholders")
        print()
        print("Analyzing gaps...")
        gaps = generate_gap_report(registry)
        print()
        print("Generating documentation files...")
        print()
        generate_json_catalog(registry, docs_dir)
        generate_markdown_catalog(registry, docs_dir)
        generate_gap_report_md(registry, gaps, docs_dir)
        generate_export_spec_md(docs_dir)
        print()
        print(banner)
        print("CATALOG GENERATION COMPLETE")
        print(banner)
        print()
        print("Generated files:")
        generated = (
            "PLACEHOLDER_CATALOG_EXTENDED.json",
            "PLACEHOLDER_CATALOG_EXTENDED.md",
            "PLACEHOLDER_GAP_REPORT.md",
            "PLACEHOLDER_EXPORT_SPEC.md",
        )
        for idx, filename in enumerate(generated, start=1):
            print(f" {idx}. {docs_dir}/{filename}")
        print()
        return 0
    except Exception as exc:
        # Top-level boundary: report, dump the traceback, signal failure.
        print()
        print(f"ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return 1
if __name__ == "__main__":
sys.exit(main())