MAJOR CHANGES: - Enhanced metadata schema with 7 QA fields - Deterministic derivation logic (no guessing) - Conservative inference (prefer unknown over wrong) - Real source tracking (skip safe wrappers) - Legacy mismatch detection - Activity quality filter policies - Completeness scoring (0-100) - Unresolved fields tracking - Fixed ZIP/JSON export auth (query param support) FILES CHANGED: - backend/placeholder_metadata.py (schema extended) - backend/placeholder_metadata_enhanced.py (NEW, 418 lines) - backend/generate_complete_metadata_v2.py (NEW, 334 lines) - backend/tests/test_placeholder_metadata_v2.py (NEW, 302 lines) - backend/routers/prompts.py (V2 integration + auth fix) - docs/PLACEHOLDER_METADATA_VALIDATION.md (NEW, 541 lines) PROBLEMS FIXED: ✓ value_raw extraction (type-aware, JSON parsing) ✓ Units for dimensionless values (scores, correlations) ✓ Safe wrappers as sources (now skipped) ✓ Time window guessing (confidence flags) ✓ Legacy inconsistencies (marked with flag) ✓ Missing quality filters (activity placeholders) ✓ No completeness metric (0-100 score) ✓ Orphaned placeholders (tracked) ✓ Unresolved fields (explicit list) ✓ ZIP/JSON export auth (query token support for downloads) AUTH FIX: - export-catalog-zip now accepts token via query param (?token=xxx) - export-values-extended now accepts token via query param - Allows browser downloads without custom headers Konzept: docs/PLACEHOLDER_METADATA_REQUIREMENTS_V2_NORMATIVE.md Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
334 lines
14 KiB
Python
334 lines
14 KiB
Python
"""
|
|
Complete Metadata Generation V2 - Quality Assured
|
|
|
|
This version applies strict quality controls and enhanced extraction logic.
|
|
"""
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from placeholder_metadata import (
|
|
PlaceholderType,
|
|
TimeWindow,
|
|
OutputType,
|
|
SourceInfo,
|
|
QualityFilterPolicy,
|
|
ConfidenceLogic,
|
|
METADATA_REGISTRY
|
|
)
|
|
from placeholder_metadata_extractor import build_complete_metadata_registry
|
|
from placeholder_metadata_enhanced import (
|
|
extract_value_raw,
|
|
infer_unit_strict,
|
|
detect_time_window_precise,
|
|
resolve_real_source,
|
|
create_activity_quality_policy,
|
|
create_confidence_logic,
|
|
calculate_completeness_score
|
|
)
|
|
|
|
|
|
def _correct_value_raw(metadata) -> list:
    """Step 1: derive ``value_raw`` from ``value_display`` (type-aware).

    Returns the list of unresolved field names ([] or ['value_raw']).
    Skips placeholders whose display value is empty or 'nicht verfügbar'
    (runtime sentinel string — do not translate).
    """
    if not metadata.value_display or metadata.value_display in ('nicht verfügbar', ''):
        return []
    raw_val, success = extract_value_raw(
        metadata.value_display,
        metadata.output_type,
        metadata.type
    )
    if success:
        metadata.value_raw = raw_val
        return []
    metadata.value_raw = None
    return ['value_raw']


def _correct_unit(key, metadata):
    """Step 2: set ``unit`` strictly — confident answer, or None where a
    unit can never apply (JSON/Markdown/enum outputs, scores, correlations).
    Otherwise the existing unit is left untouched.
    """
    strict_unit = infer_unit_strict(
        key,
        metadata.description,
        metadata.output_type,
        metadata.type
    )
    if strict_unit is not None:
        metadata.unit = strict_unit
    elif metadata.output_type in (OutputType.JSON, OutputType.MARKDOWN, OutputType.ENUM):
        metadata.unit = None  # These never have units
    elif 'score' in key.lower() or 'correlation' in key.lower():
        metadata.unit = None  # Dimensionless


def _correct_time_window(key, metadata) -> list:
    """Step 3: set ``time_window`` via precise detection.

    Certain detections may also flag a legacy/implementation mismatch;
    uncertain ones either mark the field unresolved (UNKNOWN) or record
    an "inferred" note. Returns unresolved field names ([] or ['time_window']).
    """
    tw, is_certain, mismatch = detect_time_window_precise(
        key,
        metadata.description,
        metadata.source.resolver,
        metadata.semantic_contract
    )
    metadata.time_window = tw
    if is_certain:
        if mismatch:
            metadata.legacy_contract_mismatch = True
            if mismatch not in metadata.known_issues:
                metadata.known_issues.append(mismatch)
        return []
    if tw == TimeWindow.UNKNOWN:
        return ['time_window']
    # Inferred but not certain
    if mismatch and mismatch not in metadata.notes:
        metadata.notes.append(f"Time window inferred: {mismatch}")
    return []


def _correct_source(metadata) -> list:
    """Step 4: resolve real source provenance, skipping safe wrappers.

    ``source_kind`` is always recorded; function/module/tables only when
    resolved. Returns unresolved field names ([] or ['source']).
    """
    func, dl_module, tables, source_kind = resolve_real_source(metadata.source.resolver)
    if func:
        metadata.source.function = func
    if dl_module:
        metadata.source.data_layer_module = dl_module
    if tables:
        metadata.source.source_tables = tables
    metadata.source.source_kind = source_kind
    if source_kind in ("wrapper", "unknown"):
        return ['source']
    return []


def apply_enhanced_corrections(registry):
    """
    Apply enhanced corrections with strict quality controls.

    This replaces heuristic guessing with deterministic derivation.
    Each placeholder is mutated in place through 12 steps (value_raw,
    unit, time window, provenance, quality/confidence policies, status
    classification); the same registry object is returned for chaining.
    """
    all_metadata = registry.get_all()

    for key, metadata in all_metadata.items():
        # Accumulates field names that could not be derived; order matters
        # (value_raw, time_window, source) and is preserved in the output.
        unresolved = []

        # ── 1-4: deterministic field derivation (see helpers above) ───────
        unresolved += _correct_value_raw(metadata)
        _correct_unit(key, metadata)
        unresolved += _correct_time_window(key, metadata)
        unresolved += _correct_source(metadata)

        # ── 5. Add quality_filter_policy for activity placeholders ────────
        if not metadata.quality_filter_policy:
            qfp = create_activity_quality_policy(key)
            if qfp:
                metadata.quality_filter_policy = qfp

        # ── 6. Add confidence_logic ────────────────────────────────────────
        if not metadata.confidence_logic:
            cl = create_confidence_logic(key, metadata.source.data_layer_module)
            if cl:
                metadata.confidence_logic = cl

        # ── 7. Determine provenance_confidence ─────────────────────────────
        # high: both module and tables known; medium: partial; low: neither.
        if metadata.source.data_layer_module and metadata.source.source_tables:
            metadata.provenance_confidence = "high"
        elif metadata.source.function or metadata.source.source_tables:
            metadata.provenance_confidence = "medium"
        else:
            metadata.provenance_confidence = "low"

        # ── 8. Determine contract_source ───────────────────────────────────
        # >50 chars is the heuristic threshold for a "real" documented contract.
        if metadata.semantic_contract and len(metadata.semantic_contract) > 50:
            metadata.contract_source = "documented"
        elif metadata.description:
            metadata.contract_source = "inferred"
        else:
            metadata.contract_source = "unknown"

        # ── 9. Check for orphaned placeholders ─────────────────────────────
        if not metadata.used_by.prompts and not metadata.used_by.pipelines and not metadata.used_by.charts:
            metadata.orphaned_placeholder = True

        # ── 10. Set unresolved fields ──────────────────────────────────────
        metadata.unresolved_fields = unresolved

        # ── 11. Calculate completeness score ───────────────────────────────
        metadata.metadata_completeness_score = calculate_completeness_score(metadata.to_dict())

        # ── 12. Set schema status ──────────────────────────────────────────
        if metadata.metadata_completeness_score >= 80 and len(unresolved) == 0:
            metadata.schema_status = "validated"
        elif metadata.metadata_completeness_score >= 50:
            metadata.schema_status = "draft"
        else:
            metadata.schema_status = "incomplete"

    return registry
|
|
|
|
|
|
def generate_qa_report(registry) -> str:
    """
    Generate QA report with quality metrics.

    Returns a Markdown document: headline metrics, a completeness
    distribution, the top-20 most problematic placeholders, and the
    schema-status distribution.
    """
    all_metadata = registry.get_all()
    total = len(all_metadata)

    # Robustness fix: every metric below divides by `total`; an empty
    # registry previously raised ZeroDivisionError. Return a minimal
    # report instead.
    if total == 0:
        return "\n".join([
            "# Placeholder Metadata QA Report",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "**Total Placeholders:** 0",
            "",
            "_No placeholders in registry — nothing to report._",
        ])

    # Collect metrics
    category_unknown = sum(1 for m in all_metadata.values() if m.category == "Unknown")
    no_description = sum(1 for m in all_metadata.values() if not m.description or "No description" in m.description)
    tw_unknown = sum(1 for m in all_metadata.values() if m.time_window == TimeWindow.UNKNOWN)
    no_quality_filter = sum(1 for m in all_metadata.values() if not m.quality_filter_policy and 'activity' in m.key.lower())
    no_confidence = sum(1 for m in all_metadata.values() if not m.confidence_logic and m.source.data_layer_module)
    legacy_mismatch = sum(1 for m in all_metadata.values() if m.legacy_contract_mismatch)
    orphaned = sum(1 for m in all_metadata.values() if m.orphaned_placeholder)

    # Find problematic placeholders; the composite problem_score weights
    # missing completeness (1x), unresolved fields (10x) and known issues (5x).
    problematic = []
    for key, m in all_metadata.items():
        score = m.metadata_completeness_score
        unresolved_count = len(m.unresolved_fields)
        issues_count = len(m.known_issues)

        problem_score = (100 - score) + (unresolved_count * 10) + (issues_count * 5)
        if problem_score > 0:
            problematic.append((key, problem_score, score, unresolved_count, issues_count))

    problematic.sort(key=lambda x: x[1], reverse=True)

    # Build report
    lines = [
        "# Placeholder Metadata QA Report",
        "",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"**Total Placeholders:** {total}",
        "",
        "## Quality Metrics",
        "",
        f"- **Category Unknown:** {category_unknown} ({category_unknown/total*100:.1f}%)",
        f"- **No Description:** {no_description} ({no_description/total*100:.1f}%)",
        f"- **Time Window Unknown:** {tw_unknown} ({tw_unknown/total*100:.1f}%)",
        f"- **Activity without Quality Filter:** {no_quality_filter}",
        f"- **Data Layer without Confidence Logic:** {no_confidence}",
        f"- **Legacy/Implementation Mismatch:** {legacy_mismatch}",
        f"- **Orphaned (unused):** {orphaned}",
        "",
        "## Completeness Distribution",
        "",
    ]

    # Completeness buckets
    buckets = {
        "90-100%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score >= 90),
        "70-89%": sum(1 for m in all_metadata.values() if 70 <= m.metadata_completeness_score < 90),
        "50-69%": sum(1 for m in all_metadata.values() if 50 <= m.metadata_completeness_score < 70),
        "0-49%": sum(1 for m in all_metadata.values() if m.metadata_completeness_score < 50),
    }

    for bucket, count in buckets.items():
        lines.append(f"- **{bucket}:** {count} placeholders ({count/total*100:.1f}%)")

    lines.append("")
    lines.append("## Top 20 Most Problematic Placeholders")
    lines.append("")
    lines.append("| Rank | Placeholder | Completeness | Unresolved | Issues |")
    lines.append("|------|-------------|--------------|------------|--------|")

    # Quadruple braces render as literal {{key}} in the Markdown table.
    for i, (key, _, score, unresolved_count, issues_count) in enumerate(problematic[:20], 1):
        lines.append(f"| {i} | `{{{{{key}}}}}` | {score}% | {unresolved_count} | {issues_count} |")

    lines.append("")
    lines.append("## Schema Status Distribution")
    lines.append("")

    status_counts = {}
    for m in all_metadata.values():
        status_counts[m.schema_status] = status_counts.get(m.schema_status, 0) + 1

    for status, count in sorted(status_counts.items()):
        lines.append(f"- **{status}:** {count} ({count/total*100:.1f}%)")

    return "\n".join(lines)
|
|
|
|
|
|
def generate_unresolved_report(registry) -> dict:
    """
    Generate unresolved fields report as JSON.

    Produces two complementary views — unresolved fields grouped per
    placeholder, and placeholders grouped per unresolved field — plus a
    per-field count summary.
    """
    per_placeholder = {}
    per_field = {}

    for name, meta in registry.get_all().items():
        fields = meta.unresolved_fields
        if not fields:
            continue
        per_placeholder[name] = fields
        for field_name in fields:
            per_field.setdefault(field_name, []).append(name)

    return {
        "generated_at": datetime.now().isoformat(),
        "total_placeholders_with_unresolved": len(per_placeholder),
        "by_placeholder": per_placeholder,
        "by_field": per_field,
        "summary": {field_name: len(names) for field_name, names in per_field.items()},
    }
|
|
|
|
|
|
def main():
    """Main execution.

    Builds the metadata registry, applies the V2 corrections, writes the
    QA report (Markdown) and unresolved report (JSON) into ../docs, and
    prints a summary. Returns a process exit code (0 success, 1 failure).
    """
    print("="*60)
    print("ENHANCED PLACEHOLDER METADATA GENERATION V2")
    print("="*60)
    print()

    try:
        # Build registry
        print("Building metadata registry...")
        registry = build_complete_metadata_registry()
        print(f"Loaded {registry.count()} placeholders")
        print()

        # Apply enhanced corrections
        print("Applying enhanced corrections...")
        registry = apply_enhanced_corrections(registry)
        print("Enhanced corrections applied")
        print()

        # Robustness fix: ensure the docs output directory exists before
        # opening files in it (open() would otherwise raise FileNotFoundError).
        docs_dir = Path(__file__).parent.parent / "docs"
        docs_dir.mkdir(parents=True, exist_ok=True)

        # Generate reports
        print("Generating QA report...")
        qa_report = generate_qa_report(registry)
        qa_path = docs_dir / "PLACEHOLDER_METADATA_QA_REPORT.md"
        with open(qa_path, 'w', encoding='utf-8') as f:
            f.write(qa_report)
        print(f"QA Report: {qa_path}")

        print("Generating unresolved report...")
        unresolved = generate_unresolved_report(registry)
        unresolved_path = docs_dir / "PLACEHOLDER_METADATA_UNRESOLVED.json"
        with open(unresolved_path, 'w', encoding='utf-8') as f:
            json.dump(unresolved, f, indent=2, ensure_ascii=False)
        print(f"Unresolved Report: {unresolved_path}")

        # Summary
        all_metadata = registry.get_all()
        total = len(all_metadata)
        # Robustness fix: guard divisions so an empty registry prints
        # zeros instead of raising ZeroDivisionError.
        avg_completeness = (
            sum(m.metadata_completeness_score for m in all_metadata.values()) / total
            if total else 0.0
        )
        validated_count = sum(1 for m in all_metadata.values() if m.schema_status == "validated")
        validated_pct = (validated_count / total * 100) if total else 0.0

        print()
        print("="*60)
        print("SUMMARY")
        print("="*60)
        print(f"Total Placeholders: {total}")
        print(f"Average Completeness: {avg_completeness:.1f}%")
        print(f"Validated: {validated_count} ({validated_pct:.1f}%)")
        print(f"Time Window Unknown: {sum(1 for m in all_metadata.values() if m.time_window == TimeWindow.UNKNOWN)}")
        print(f"Orphaned: {sum(1 for m in all_metadata.values() if m.orphaned_placeholder)}")

        return 0

    except Exception as e:
        # Broad catch is deliberate at this top-level CLI boundary:
        # report the failure and signal it via the exit code.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
|
|
|
|
|
# Script entry point: propagate main()'s return value (0 success, 1 failure)
# as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|