mitai-jinkendo/backend/generate_complete_metadata_v2.py
Lars 650313347f
All checks were successful
Deploy Development / deploy (push) Successful in 54s
Build Test / lint-backend (push) Successful in 1s
Build Test / build-frontend (push) Successful in 15s
feat: Placeholder Metadata V2 - Normative Implementation + ZIP Export Fix
MAJOR CHANGES:
- Enhanced metadata schema with 7 QA fields
- Deterministic derivation logic (no guessing)
- Conservative inference (prefer unknown over wrong)
- Real source tracking (skip safe wrappers)
- Legacy mismatch detection
- Activity quality filter policies
- Completeness scoring (0-100)
- Unresolved fields tracking
- Fixed ZIP/JSON export auth (query param support)

FILES CHANGED:
- backend/placeholder_metadata.py (schema extended)
- backend/placeholder_metadata_enhanced.py (NEW, 418 lines)
- backend/generate_complete_metadata_v2.py (NEW, 334 lines)
- backend/tests/test_placeholder_metadata_v2.py (NEW, 302 lines)
- backend/routers/prompts.py (V2 integration + auth fix)
- docs/PLACEHOLDER_METADATA_VALIDATION.md (NEW, 541 lines)

PROBLEMS FIXED:
✓ value_raw extraction (type-aware, JSON parsing)
✓ Units for dimensionless values (scores, correlations)
✓ Safe wrappers as sources (now skipped)
✓ Time window guessing (confidence flags)
✓ Legacy inconsistencies (marked with flag)
✓ Missing quality filters (activity placeholders)
✓ No completeness metric (0-100 score)
✓ Orphaned placeholders (tracked)
✓ Unresolved fields (explicit list)
✓ ZIP/JSON export auth (query token support for downloads)

AUTH FIX:
- export-catalog-zip now accepts token via query param (?token=xxx)
- export-values-extended now accepts token via query param
- Allows browser downloads without custom headers

Konzept: docs/PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 21:23:37 +02:00

334 lines
14 KiB
Python

"""
Complete Metadata Generation V2 - Quality Assured
This version applies strict quality controls and enhanced extraction logic.
"""
import sys
import json
from pathlib import Path
from datetime import datetime
sys.path.insert(0, str(Path(__file__).parent))
from placeholder_metadata import (
PlaceholderType,
TimeWindow,
OutputType,
SourceInfo,
QualityFilterPolicy,
ConfidenceLogic,
METADATA_REGISTRY
)
from placeholder_metadata_extractor import build_complete_metadata_registry
from placeholder_metadata_enhanced import (
extract_value_raw,
infer_unit_strict,
detect_time_window_precise,
resolve_real_source,
create_activity_quality_policy,
create_confidence_logic,
calculate_completeness_score
)
def apply_enhanced_corrections(registry):
    """
    Apply enhanced corrections with strict quality controls.
    This replaces heuristic guessing with deterministic derivation.

    Mutates every metadata entry in *registry* in place (12 ordered steps:
    value_raw, unit, time_window, source provenance, quality/confidence
    policies, provenance confidence, contract source, orphan detection,
    unresolved tracking, completeness score, schema status) and returns the
    same registry object for chaining.

    Args:
        registry: Placeholder metadata registry exposing ``get_all()``
            (mapping of placeholder key -> metadata object).

    Returns:
        The same registry, with all entries corrected.
    """
    all_metadata = registry.get_all()
    for key, metadata in all_metadata.items():
        # Fields that could not be derived deterministically for this entry.
        unresolved = []
        # ── 1. Fix value_raw ──────────────────────────────────────────────────
        # 'nicht verfügbar' is the sentinel display value for "not available"
        # (German; must stay byte-identical — it is matched at runtime).
        if metadata.value_display and metadata.value_display not in ['nicht verfügbar', '']:
            raw_val, success = extract_value_raw(
                metadata.value_display,
                metadata.output_type,
                metadata.type
            )
            if success:
                metadata.value_raw = raw_val
            else:
                # Conservative: prefer an explicit None over a wrong guess.
                metadata.value_raw = None
                unresolved.append('value_raw')
        # ── 2. Fix unit (strict) ──────────────────────────────────────────────
        strict_unit = infer_unit_strict(
            key,
            metadata.description,
            metadata.output_type,
            metadata.type
        )
        # Only overwrite if we have a confident answer or existing is clearly wrong
        if strict_unit is not None:
            metadata.unit = strict_unit
        elif metadata.output_type in [OutputType.JSON, OutputType.MARKDOWN, OutputType.ENUM]:
            metadata.unit = None  # These never have units
        elif 'score' in key.lower() or 'correlation' in key.lower():
            metadata.unit = None  # Dimensionless
        # ── 3. Fix time_window (precise detection) ────────────────────────────
        tw, is_certain, mismatch = detect_time_window_precise(
            key,
            metadata.description,
            metadata.source.resolver,
            metadata.semantic_contract
        )
        if is_certain:
            metadata.time_window = tw
            # A mismatch alongside a certain detection means the legacy
            # contract disagrees with the implementation — flag it.
            if mismatch:
                metadata.legacy_contract_mismatch = True
                if mismatch not in metadata.known_issues:
                    metadata.known_issues.append(mismatch)
        else:
            metadata.time_window = tw
            if tw == TimeWindow.UNKNOWN:
                unresolved.append('time_window')
            else:
                # Inferred but not certain
                if mismatch and mismatch not in metadata.notes:
                    metadata.notes.append(f"Time window inferred: {mismatch}")
        # ── 4. Fix source provenance ──────────────────────────────────────────
        # Resolve past safe wrappers to the real data-producing function.
        func, dl_module, tables, source_kind = resolve_real_source(metadata.source.resolver)
        if func:
            metadata.source.function = func
        if dl_module:
            metadata.source.data_layer_module = dl_module
        if tables:
            metadata.source.source_tables = tables
        metadata.source.source_kind = source_kind
        if source_kind == "wrapper" or source_kind == "unknown":
            unresolved.append('source')
        # ── 5. Add quality_filter_policy for activity placeholders ────────────
        # Only fill in a policy where none exists; never overwrite.
        if not metadata.quality_filter_policy:
            qfp = create_activity_quality_policy(key)
            if qfp:
                metadata.quality_filter_policy = qfp
        # ── 6. Add confidence_logic ────────────────────────────────────────────
        if not metadata.confidence_logic:
            cl = create_confidence_logic(key, metadata.source.data_layer_module)
            if cl:
                metadata.confidence_logic = cl
        # ── 7. Determine provenance_confidence ────────────────────────────────
        # high: module + tables known; medium: partial; low: nothing resolved.
        if metadata.source.data_layer_module and metadata.source.source_tables:
            metadata.provenance_confidence = "high"
        elif metadata.source.function or metadata.source.source_tables:
            metadata.provenance_confidence = "medium"
        else:
            metadata.provenance_confidence = "low"
        # ── 8. Determine contract_source ───────────────────────────────────────
        # >50 chars is the heuristic threshold for a "real" documented contract.
        if metadata.semantic_contract and len(metadata.semantic_contract) > 50:
            metadata.contract_source = "documented"
        elif metadata.description:
            metadata.contract_source = "inferred"
        else:
            metadata.contract_source = "unknown"
        # ── 9. Check for orphaned placeholders ────────────────────────────────
        if not metadata.used_by.prompts and not metadata.used_by.pipelines and not metadata.used_by.charts:
            metadata.orphaned_placeholder = True
        # ── 10. Set unresolved fields ──────────────────────────────────────────
        metadata.unresolved_fields = unresolved
        # ── 11. Calculate completeness score ───────────────────────────────────
        metadata.metadata_completeness_score = calculate_completeness_score(metadata.to_dict())
        # ── 12. Set schema status ──────────────────────────────────────────────
        # validated: high score AND nothing unresolved; draft: mid score;
        # incomplete: everything else.
        if metadata.metadata_completeness_score >= 80 and len(unresolved) == 0:
            metadata.schema_status = "validated"
        elif metadata.metadata_completeness_score >= 50:
            metadata.schema_status = "draft"
        else:
            metadata.schema_status = "incomplete"
    return registry
def generate_qa_report(registry) -> str:
    """
    Generate a Markdown QA report with quality metrics.

    Covers aggregate quality counts, a completeness-score distribution,
    the 20 most problematic placeholders, and the schema-status breakdown.

    Args:
        registry: Placeholder metadata registry exposing ``get_all()``.

    Returns:
        The full report as a Markdown string. An empty registry yields a
        report with zeroed metrics instead of raising ZeroDivisionError.
    """
    all_metadata = registry.get_all()
    total = len(all_metadata)

    def pct(count: int) -> float:
        # Share of `total` in percent; guards against an empty registry.
        return count / total * 100 if total else 0.0

    # Collect metrics
    category_unknown = sum(1 for m in all_metadata.values() if m.category == "Unknown")
    no_description = sum(1 for m in all_metadata.values() if not m.description or "No description" in m.description)
    tw_unknown = sum(1 for m in all_metadata.values() if m.time_window == TimeWindow.UNKNOWN)
    no_quality_filter = sum(1 for m in all_metadata.values() if not m.quality_filter_policy and 'activity' in m.key.lower())
    no_confidence = sum(1 for m in all_metadata.values() if not m.confidence_logic and m.source.data_layer_module)
    legacy_mismatch = sum(1 for m in all_metadata.values() if m.legacy_contract_mismatch)
    orphaned = sum(1 for m in all_metadata.values() if m.orphaned_placeholder)
    # Find problematic placeholders (weighted: missing completeness counts
    # once, each unresolved field 10x, each known issue 5x).
    problematic = []
    for key, m in all_metadata.items():
        score = m.metadata_completeness_score
        unresolved_count = len(m.unresolved_fields)
        issues_count = len(m.known_issues)
        problem_score = (100 - score) + (unresolved_count * 10) + (issues_count * 5)
        if problem_score > 0:
            problematic.append((key, problem_score, score, unresolved_count, issues_count))
    problematic.sort(key=lambda x: x[1], reverse=True)
    # Build report
    lines = [
        "# Placeholder Metadata QA Report",
        "",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"**Total Placeholders:** {total}",
        "",
        "## Quality Metrics",
        "",
        f"- **Category Unknown:** {category_unknown} ({pct(category_unknown):.1f}%)",
        f"- **No Description:** {no_description} ({pct(no_description):.1f}%)",
        f"- **Time Window Unknown:** {tw_unknown} ({pct(tw_unknown):.1f}%)",
        f"- **Activity without Quality Filter:** {no_quality_filter}",
        f"- **Data Layer without Confidence Logic:** {no_confidence}",
        f"- **Legacy/Implementation Mismatch:** {legacy_mismatch}",
        f"- **Orphaned (unused):** {orphaned}",
        "",
        "## Completeness Distribution",
        "",
    ]
    # Completeness buckets
    buckets = {
        "90-100%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score >= 90),
        "70-89%": sum(1 for m in all_metadata.values() if 70 <= m.metadata_completeness_score < 90),
        "50-69%": sum(1 for m in all_metadata.values() if 50 <= m.metadata_completeness_score < 70),
        "0-49%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score < 50),
    }
    for bucket, count in buckets.items():
        lines.append(f"- **{bucket}:** {count} placeholders ({pct(count):.1f}%)")
    lines.append("")
    lines.append("## Top 20 Most Problematic Placeholders")
    lines.append("")
    lines.append("| Rank | Placeholder | Completeness | Unresolved | Issues |")
    lines.append("|------|-------------|--------------|------------|--------|")
    for i, (key, _, score, unresolved_count, issues_count) in enumerate(problematic[:20], 1):
        # Quadruple braces render literal {{key}} in the Markdown table.
        lines.append(f"| {i} | `{{{{{key}}}}}` | {score}% | {unresolved_count} | {issues_count} |")
    lines.append("")
    lines.append("## Schema Status Distribution")
    lines.append("")
    status_counts = {}
    for m in all_metadata.values():
        status_counts[m.schema_status] = status_counts.get(m.schema_status, 0) + 1
    for status, count in sorted(status_counts.items()):
        lines.append(f"- **{status}:** {count} ({pct(count):.1f}%)")
    return "\n".join(lines)
def generate_unresolved_report(registry) -> dict:
    """
    Generate the unresolved-fields report as a JSON-serializable dict.

    Args:
        registry: Placeholder metadata registry exposing ``get_all()``.

    Returns:
        dict with keys:
            generated_at: ISO-8601 timestamp of report creation.
            total_placeholders_with_unresolved: count of placeholders that
                still have at least one unresolved field.
            by_placeholder: {placeholder key: [unresolved field names]}.
            by_field: inverse view, {field name: [placeholder keys]}.
            summary: {field name: number of affected placeholders}.
    """
    all_metadata = registry.get_all()
    unresolved_by_placeholder = {}
    unresolved_by_field = {}
    for key, m in all_metadata.items():
        if not m.unresolved_fields:
            continue
        unresolved_by_placeholder[key] = m.unresolved_fields
        for field in m.unresolved_fields:
            # setdefault replaces the manual "if field not in dict" dance.
            unresolved_by_field.setdefault(field, []).append(key)
    return {
        "generated_at": datetime.now().isoformat(),
        "total_placeholders_with_unresolved": len(unresolved_by_placeholder),
        "by_placeholder": unresolved_by_placeholder,
        "by_field": unresolved_by_field,
        "summary": {
            field: len(placeholders)
            for field, placeholders in unresolved_by_field.items()
        }
    }
def main():
    """
    Run the full V2 generation pipeline: build the registry, apply the
    enhanced corrections, and write the QA (Markdown) and unresolved-fields
    (JSON) reports into the repository's docs/ directory.

    Returns:
        0 on success, 1 on any failure (traceback printed to stdout).
    """
    banner = "=" * 60
    print(banner)
    print("ENHANCED PLACEHOLDER METADATA GENERATION V2")
    print(banner)
    print()
    try:
        # Step 1: load the raw registry.
        print("Building metadata registry...")
        registry = build_complete_metadata_registry()
        print(f"Loaded {registry.count()} placeholders")
        print()
        # Step 2: apply the deterministic V2 corrections in place.
        print("Applying enhanced corrections...")
        registry = apply_enhanced_corrections(registry)
        print("Enhanced corrections applied")
        print()
        # Step 3: write both reports next to the other docs.
        docs_dir = Path(__file__).parent.parent / "docs"
        print("Generating QA report...")
        qa_report = generate_qa_report(registry)
        qa_path = docs_dir / "PLACEHOLDER_METADATA_QA_REPORT.md"
        with open(qa_path, 'w', encoding='utf-8') as f:
            f.write(qa_report)
        print(f"QA Report: {qa_path}")
        print("Generating unresolved report...")
        unresolved = generate_unresolved_report(registry)
        unresolved_path = docs_dir / "PLACEHOLDER_METADATA_UNRESOLVED.json"
        with open(unresolved_path, 'w', encoding='utf-8') as f:
            json.dump(unresolved, f, indent=2, ensure_ascii=False)
        print(f"Unresolved Report: {unresolved_path}")
        # Step 4: console summary of the headline metrics.
        entries = registry.get_all()
        total = len(entries)
        avg_completeness = sum(m.metadata_completeness_score for m in entries.values()) / total
        validated_count = sum(1 for m in entries.values() if m.schema_status == "validated")
        tw_unknown_count = sum(1 for m in entries.values() if m.time_window == TimeWindow.UNKNOWN)
        orphaned_count = sum(1 for m in entries.values() if m.orphaned_placeholder)
        print()
        print(banner)
        print("SUMMARY")
        print(banner)
        print(f"Total Placeholders: {total}")
        print(f"Average Completeness: {avg_completeness:.1f}%")
        print(f"Validated: {validated_count} ({validated_count/total*100:.1f}%)")
        print(f"Time Window Unknown: {tw_unknown_count}")
        print(f"Orphaned: {orphaned_count}")
        return 0
    except Exception as e:
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
if __name__ == "__main__":
    # Propagate the generator's status code so CI can detect failures.
    sys.exit(main())