""" Complete Metadata Generation V2 - Quality Assured This version applies strict quality controls and enhanced extraction logic. """ import sys import json from pathlib import Path from datetime import datetime sys.path.insert(0, str(Path(__file__).parent)) from placeholder_metadata import ( PlaceholderType, TimeWindow, OutputType, SourceInfo, QualityFilterPolicy, ConfidenceLogic, METADATA_REGISTRY ) from placeholder_metadata_extractor import build_complete_metadata_registry from placeholder_metadata_enhanced import ( extract_value_raw, infer_unit_strict, detect_time_window_precise, resolve_real_source, create_activity_quality_policy, create_confidence_logic, calculate_completeness_score ) def apply_enhanced_corrections(registry): """ Apply enhanced corrections with strict quality controls. This replaces heuristic guessing with deterministic derivation. """ all_metadata = registry.get_all() for key, metadata in all_metadata.items(): unresolved = [] # ── 1. Fix value_raw ────────────────────────────────────────────────── if metadata.value_display and metadata.value_display not in ['nicht verfügbar', '']: raw_val, success = extract_value_raw( metadata.value_display, metadata.output_type, metadata.type ) if success: metadata.value_raw = raw_val else: metadata.value_raw = None unresolved.append('value_raw') # ── 2. Fix unit (strict) ────────────────────────────────────────────── strict_unit = infer_unit_strict( key, metadata.description, metadata.output_type, metadata.type ) # Only overwrite if we have a confident answer or existing is clearly wrong if strict_unit is not None: metadata.unit = strict_unit elif metadata.output_type in [OutputType.JSON, OutputType.MARKDOWN, OutputType.ENUM]: metadata.unit = None # These never have units elif 'score' in key.lower() or 'correlation' in key.lower(): metadata.unit = None # Dimensionless # ── 3. Fix time_window (precise detection) ──────────────────────────── tw, is_certain, mismatch = detect_time_window_precise( key, metadata.description, metadata.source.resolver, metadata.semantic_contract ) if is_certain: metadata.time_window = tw if mismatch: metadata.legacy_contract_mismatch = True if mismatch not in metadata.known_issues: metadata.known_issues.append(mismatch) else: metadata.time_window = tw if tw == TimeWindow.UNKNOWN: unresolved.append('time_window') else: # Inferred but not certain if mismatch and mismatch not in metadata.notes: metadata.notes.append(f"Time window inferred: {mismatch}") # ── 4. Fix source provenance ────────────────────────────────────────── func, dl_module, tables, source_kind = resolve_real_source(metadata.source.resolver) if func: metadata.source.function = func if dl_module: metadata.source.data_layer_module = dl_module if tables: metadata.source.source_tables = tables metadata.source.source_kind = source_kind if source_kind == "wrapper" or source_kind == "unknown": unresolved.append('source') # ── 5. Add quality_filter_policy for activity placeholders ──────────── if not metadata.quality_filter_policy: qfp = create_activity_quality_policy(key) if qfp: metadata.quality_filter_policy = qfp # ── 6. Add confidence_logic ──────────────────────────────────────────── if not metadata.confidence_logic: cl = create_confidence_logic(key, metadata.source.data_layer_module) if cl: metadata.confidence_logic = cl # ── 7. Determine provenance_confidence ──────────────────────────────── if metadata.source.data_layer_module and metadata.source.source_tables: metadata.provenance_confidence = "high" elif metadata.source.function or metadata.source.source_tables: metadata.provenance_confidence = "medium" else: metadata.provenance_confidence = "low" # ── 8. Determine contract_source ─────────────────────────────────────── if metadata.semantic_contract and len(metadata.semantic_contract) > 50: metadata.contract_source = "documented" elif metadata.description: metadata.contract_source = "inferred" else: metadata.contract_source = "unknown" # ── 9. Check for orphaned placeholders ──────────────────────────────── if not metadata.used_by.prompts and not metadata.used_by.pipelines and not metadata.used_by.charts: metadata.orphaned_placeholder = True # ── 10. Set unresolved fields ────────────────────────────────────────── metadata.unresolved_fields = unresolved # ── 11. Calculate completeness score ─────────────────────────────────── metadata.metadata_completeness_score = calculate_completeness_score(metadata.to_dict()) # ── 12. Set schema status ────────────────────────────────────────────── if metadata.metadata_completeness_score >= 80 and len(unresolved) == 0: metadata.schema_status = "validated" elif metadata.metadata_completeness_score >= 50: metadata.schema_status = "draft" else: metadata.schema_status = "incomplete" return registry def generate_qa_report(registry) -> str: """ Generate QA report with quality metrics. """ all_metadata = registry.get_all() total = len(all_metadata) # Collect metrics category_unknown = sum(1 for m in all_metadata.values() if m.category == "Unknown") no_description = sum(1 for m in all_metadata.values() if not m.description or "No description" in m.description) tw_unknown = sum(1 for m in all_metadata.values() if m.time_window == TimeWindow.UNKNOWN) no_quality_filter = sum(1 for m in all_metadata.values() if not m.quality_filter_policy and 'activity' in m.key.lower()) no_confidence = sum(1 for m in all_metadata.values() if not m.confidence_logic and m.source.data_layer_module) legacy_mismatch = sum(1 for m in all_metadata.values() if m.legacy_contract_mismatch) orphaned = sum(1 for m in all_metadata.values() if m.orphaned_placeholder) # Find problematic placeholders problematic = [] for key, m in all_metadata.items(): score = m.metadata_completeness_score unresolved_count = len(m.unresolved_fields) issues_count = len(m.known_issues) problem_score = (100 - score) + (unresolved_count * 10) + (issues_count * 5) if problem_score > 0: problematic.append((key, problem_score, score, unresolved_count, issues_count)) problematic.sort(key=lambda x: x[1], reverse=True) # Build report lines = [ "# Placeholder Metadata QA Report", "", f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", f"**Total Placeholders:** {total}", "", "## Quality Metrics", "", f"- **Category Unknown:** {category_unknown} ({category_unknown/total*100:.1f}%)", f"- **No Description:** {no_description} ({no_description/total*100:.1f}%)", f"- **Time Window Unknown:** {tw_unknown} ({tw_unknown/total*100:.1f}%)", f"- **Activity without Quality Filter:** {no_quality_filter}", f"- **Data Layer without Confidence Logic:** {no_confidence}", f"- **Legacy/Implementation Mismatch:** {legacy_mismatch}", f"- **Orphaned (unused):** {orphaned}", "", "## Completeness Distribution", "", ] # Completeness buckets buckets = { "90-100%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score >= 90), "70-89%": sum(1 for m in all_metadata.values() if 70 <= m.metadata_completeness_score < 90), "50-69%": sum(1 for m in all_metadata.values() if 50 <= m.metadata_completeness_score < 70), "0-49%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score < 50), } for bucket, count in buckets.items(): lines.append(f"- **{bucket}:** {count} placeholders ({count/total*100:.1f}%)") lines.append("") lines.append("## Top 20 Most Problematic Placeholders") lines.append("") lines.append("| Rank | Placeholder | Completeness | Unresolved | Issues |") lines.append("|------|-------------|--------------|------------|--------|") for i, (key, _, score, unresolved_count, issues_count) in enumerate(problematic[:20], 1): lines.append(f"| {i} | `{{{{{key}}}}}` | {score}% | {unresolved_count} | {issues_count} |") lines.append("") lines.append("## Schema Status Distribution") lines.append("") status_counts = {} for m in all_metadata.values(): status_counts[m.schema_status] = status_counts.get(m.schema_status, 0) + 1 for status, count in sorted(status_counts.items()): lines.append(f"- **{status}:** {count} ({count/total*100:.1f}%)") return "\n".join(lines) def generate_unresolved_report(registry) -> dict: """ Generate unresolved fields report as JSON. """ all_metadata = registry.get_all() unresolved_by_placeholder = {} unresolved_by_field = {} for key, m in all_metadata.items(): if m.unresolved_fields: unresolved_by_placeholder[key] = m.unresolved_fields for field in m.unresolved_fields: if field not in unresolved_by_field: unresolved_by_field[field] = [] unresolved_by_field[field].append(key) return { "generated_at": datetime.now().isoformat(), "total_placeholders_with_unresolved": len(unresolved_by_placeholder), "by_placeholder": unresolved_by_placeholder, "by_field": unresolved_by_field, "summary": { field: len(placeholders) for field, placeholders in unresolved_by_field.items() } } def main(): """Main execution.""" print("="*60) print("ENHANCED PLACEHOLDER METADATA GENERATION V2") print("="*60) print() try: # Build registry print("Building metadata registry...") registry = build_complete_metadata_registry() print(f"Loaded {registry.count()} placeholders") print() # Apply enhanced corrections print("Applying enhanced corrections...") registry = apply_enhanced_corrections(registry) print("Enhanced corrections applied") print() # Generate reports print("Generating QA report...") qa_report = generate_qa_report(registry) qa_path = Path(__file__).parent.parent / "docs" / "PLACEHOLDER_METADATA_QA_REPORT.md" with open(qa_path, 'w', encoding='utf-8') as f: f.write(qa_report) print(f"QA Report: {qa_path}") print("Generating unresolved report...") unresolved = generate_unresolved_report(registry) unresolved_path = Path(__file__).parent.parent / "docs" / "PLACEHOLDER_METADATA_UNRESOLVED.json" with open(unresolved_path, 'w', encoding='utf-8') as f: json.dump(unresolved, f, indent=2, ensure_ascii=False) print(f"Unresolved Report: {unresolved_path}") # Summary all_metadata = registry.get_all() avg_completeness = sum(m.metadata_completeness_score for m in all_metadata.values()) / len(all_metadata) validated_count = sum(1 for m in all_metadata.values() if m.schema_status == "validated") print() print("="*60) print("SUMMARY") print("="*60) print(f"Total Placeholders: {len(all_metadata)}") print(f"Average Completeness: {avg_completeness:.1f}%") print(f"Validated: {validated_count} ({validated_count/len(all_metadata)*100:.1f}%)") print(f"Time Window Unknown: {sum(1 for m in all_metadata.values() if m.time_window == TimeWindow.UNKNOWN)}") print(f"Orphaned: {sum(1 for m in all_metadata.values() if m.orphaned_placeholder)}") return 0 except Exception as e: print(f"\nERROR: {e}") import traceback traceback.print_exc() return 1 if __name__ == "__main__": sys.exit(main())