Implement Quick Evaluation and Quality Scoring for Path QA
All checks were successful
Deploy Development / deploy (push) Successful in 40s
Test Suite / pytest-backend (push) Successful in 44s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 14s
Test Suite / k6 /health Baseline (push) Successful in 33s
Test Suite / playwright-tests (push) Successful in 1m11s
All checks were successful
Deploy Development / deploy (push) Successful in 40s
Test Suite / pytest-backend (push) Successful in 44s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 14s
Test Suite / k6 /health Baseline (push) Successful in 33s
Test Suite / playwright-tests (push) Successful in 1m11s
- Added the `_quick_evaluate_steps_qa` function to assess path quality without recursive API calls, improving performance for slot comparisons.
- Introduced `compute_deterministic_path_quality_score`, a heuristic quality score based on gaps and off-topic steps, so evaluations always yield a comparable score.
- Updated `_run_unified_slot_improvement_review` to use the new quick evaluation method and the deterministic quality score.
- Extended `build_path_qa_summary` to include the quality score in the summary it returns.
- Refactored related functions for clarity and efficiency in handling path quality assessments.
This commit is contained in:
parent
85fccdd093
commit
a1e4ad66df
|
|
@ -36,6 +36,7 @@ from planning_stage_context import build_contextualized_stage_goal, resolve_path
|
||||||
from planning_exercise_path_qa import (
|
from planning_exercise_path_qa import (
|
||||||
apply_llm_path_reorder,
|
apply_llm_path_reorder,
|
||||||
build_path_qa_summary,
|
build_path_qa_summary,
|
||||||
|
compute_deterministic_path_quality_score,
|
||||||
detect_off_topic_steps,
|
detect_off_topic_steps,
|
||||||
detect_path_gaps,
|
detect_path_gaps,
|
||||||
insert_bridge_exercises,
|
insert_bridge_exercises,
|
||||||
|
|
@ -2399,6 +2400,110 @@ def _evaluate_steps_for_compare_qa(
|
||||||
return suggest_progression_path(cur, tenant=tenant, body=eval_body)
|
return suggest_progression_path(cur, tenant=tenant, body=eval_body)
|
||||||
|
|
||||||
|
|
||||||
|
def _quick_evaluate_steps_qa(
    cur,
    *,
    goal_query: str,
    semantic_brief: PlanningSemanticBrief,
    steps: Sequence[Mapping[str, Any]],
    roadmap_ctx: Optional[ProgressionRoadmapContext],
) -> Dict[str, Any]:
    """Run a quick path QA pass without a recursive API run — for slot comparisons.

    Detects gaps and off-topic steps on a copy of ``steps``, feeds both into
    ``run_multistage_path_qa`` and ``build_path_qa_summary`` (with no bridge
    inserts, AI proposals, gap-fill offers or LLM QA), and returns the
    resulting summary dict.

    Args:
        cur: Database cursor handed through to the detection helpers.
        goal_query: Goal text used by the off-topic detection.
        semantic_brief: Semantic brief passed to both detection helpers.
        steps: Candidate path steps; ``None`` or empty is treated as ``[]``.
        roadmap_ctx: When not ``None``, gap detection runs roadmap-first and
            the summary is tagged with the ``"roadmap_first_lite"`` QA mode.

    Returns:
        The path QA summary, guaranteed to carry a ``"quality_score"`` entry.
    """
    has_roadmap = roadmap_ctx is not None
    candidate_steps = list(steps or [])

    detected_gaps = detect_path_gaps(
        cur,
        candidate_steps,
        brief=semantic_brief,
        roadmap_first=has_roadmap,
    )
    flagged_steps = detect_off_topic_steps(
        cur,
        candidate_steps,
        brief=semantic_brief,
        goal_query=goal_query,
    )
    stage_report = run_multistage_path_qa(
        off_topic_steps=flagged_steps,
        stripped_off_topic=[],
        gaps=detected_gaps,
        llm_qa=None,
        llm_applied=False,
    )

    if has_roadmap:
        qa_mode = "roadmap_first_lite"
    else:
        qa_mode = None

    summary = build_path_qa_summary(
        gaps=detected_gaps,
        bridge_inserts=[],
        ai_proposals=[],
        gap_fill_offers=[],
        off_topic_steps=flagged_steps,
        stripped_off_topic=[],
        llm_qa=None,
        llm_applied=False,
        roadmap_qa_mode=qa_mode,
        multistage_qa=stage_report,
    )

    # NOTE(review): this call does not hand the steps to build_path_qa_summary;
    # if that helper already fills "quality_score" on its own, the fallback
    # below never runs and the empty-slot penalty may be missing — confirm.
    if summary.get("quality_score") is not None:
        return summary

    summary["quality_score"] = compute_deterministic_path_quality_score(
        gaps=detected_gaps,
        off_topic_steps=flagged_steps,
        steps=candidate_steps,
        multistage_qa=stage_report,
    )
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def _off_topic_slot_indices(path_qa: Optional[Mapping[str, Any]]) -> Set[int]:
    """Return the slot keys that ``path_qa`` reports as off-topic.

    A missing ``path_qa`` or a missing/empty ``"off_topic_steps"`` entry is
    treated as an empty list before it is handed to
    ``_off_topic_reasons_by_slot``; the keys of that helper's result are
    returned as a set.
    """
    qa = path_qa or {}
    flagged_steps = qa.get("off_topic_steps") or []
    reasons_by_slot = _off_topic_reasons_by_slot(flagged_steps)
    return set(reasons_by_slot.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def _slot_suggestion_accepted(
    *,
    baseline_qa: Optional[Mapping[str, Any]],
    projected_qa: Optional[Mapping[str, Any]],
    baseline_score: Optional[float],
    projected_score: Optional[float],
    diff: Mapping[str, Any],
    off_topic: bool,
    major_idx: int,
) -> bool:
    """Decide whether a slot suggestion makes it into the suggestion list.

    Args:
        baseline_qa: Path QA of the current path (may be ``None``).
        projected_qa: Path QA of the path with the suggestion applied.
        baseline_score: Quality score of the current path, if known.
        projected_score: Quality score of the projected path, if known.
        diff: Slot diff; reads ``"baseline_exercise_id"``,
            ``"proposed_exercise_id"`` and ``"proposed_is_ai_proposal"``.
        off_topic: Whether the caller treats the slot as off-topic.
        major_idx: Roadmap major step index of the slot under review.

    Returns:
        ``True`` when the suggestion should be offered, ``False`` otherwise.
    """
    current_id = diff.get("baseline_exercise_id")
    candidate_id = diff.get("proposed_exercise_id")
    flagged_before = _off_topic_slot_indices(baseline_qa)
    flagged_after = _off_topic_slot_indices(projected_qa)

    has_current = current_id is not None
    has_candidate = candidate_id is not None
    slot_was_flagged = major_idx in flagged_before

    def _verdict(treat_as_off_topic: bool) -> bool:
        # The score delta is computed lazily, only on paths that need it.
        return _slot_diff_improves_path(
            diff,
            _quality_delta(baseline_score, projected_score),
            off_topic=treat_as_off_topic,
        )

    # An occupied slot that was flagged off-topic in the baseline.
    if off_topic and has_current and slot_was_flagged:
        if major_idx not in flagged_after:
            # The suggestion clears the off-topic flag: accept outright.
            return True
        if has_candidate:
            return _verdict(True)

    # A concrete exercise is proposed (fills an empty slot or replaces one).
    if has_candidate:
        if has_current and int(current_id) == int(candidate_id):
            # Proposing the exercise that is already in the slot is a no-op.
            return False
        return _verdict(False)

    # No exercise on either side: only an AI proposal can still qualify.
    if not has_current and diff.get("proposed_is_ai_proposal"):
        return _verdict(off_topic or slot_was_flagged)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _quality_delta(
|
||||||
|
baseline_score: Optional[float],
|
||||||
|
projected_score: Optional[float],
|
||||||
|
) -> Optional[float]:
|
||||||
|
if baseline_score is None or projected_score is None:
|
||||||
|
return None
|
||||||
|
return round(float(projected_score) - float(baseline_score), 4)
|
||||||
|
|
||||||
|
|
||||||
def _apply_slot_diff_to_steps(
|
def _apply_slot_diff_to_steps(
|
||||||
baseline_steps: Sequence[Mapping[str, Any]],
|
baseline_steps: Sequence[Mapping[str, Any]],
|
||||||
diff: Mapping[str, Any],
|
diff: Mapping[str, Any],
|
||||||
|
|
@ -2784,6 +2889,14 @@ def _run_unified_slot_improvement_review(
|
||||||
)
|
)
|
||||||
baseline_steps = list(qa_pack.get("steps") or baseline_steps)
|
baseline_steps = list(qa_pack.get("steps") or baseline_steps)
|
||||||
baseline_qa = qa_pack.get("path_qa") if isinstance(qa_pack.get("path_qa"), dict) else {}
|
baseline_qa = qa_pack.get("path_qa") if isinstance(qa_pack.get("path_qa"), dict) else {}
|
||||||
|
if baseline_qa.get("quality_score") is None:
|
||||||
|
baseline_qa = dict(baseline_qa)
|
||||||
|
baseline_qa["quality_score"] = compute_deterministic_path_quality_score(
|
||||||
|
gaps=baseline_qa.get("large_gaps") or [],
|
||||||
|
off_topic_steps=baseline_qa.get("off_topic_steps") or [],
|
||||||
|
steps=baseline_steps,
|
||||||
|
multistage_qa=baseline_qa,
|
||||||
|
)
|
||||||
baseline_score = _path_qa_quality_score(baseline_qa)
|
baseline_score = _path_qa_quality_score(baseline_qa)
|
||||||
gap_fill_offers = list(qa_pack.get("gap_fill_offers") or [])
|
gap_fill_offers = list(qa_pack.get("gap_fill_offers") or [])
|
||||||
off_topic_map = _off_topic_reasons_by_slot(baseline_qa.get("off_topic_steps") or [])
|
off_topic_map = _off_topic_reasons_by_slot(baseline_qa.get("off_topic_steps") or [])
|
||||||
|
|
@ -2794,14 +2907,6 @@ def _run_unified_slot_improvement_review(
|
||||||
|
|
||||||
suggestions: List[Dict[str, Any]] = []
|
suggestions: List[Dict[str, Any]] = []
|
||||||
rejected: List[Dict[str, Any]] = []
|
rejected: List[Dict[str, Any]] = []
|
||||||
scored_eval_body = body.model_copy(
|
|
||||||
update={
|
|
||||||
"include_llm_path_qa": False,
|
|
||||||
"include_ai_gap_fill": False,
|
|
||||||
"auto_rematch_after_qa": False,
|
|
||||||
"include_roadmap_preview": False,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
for step_index, stage_spec in enumerate(roadmap_ctx.stage_specs):
|
for step_index, stage_spec in enumerate(roadmap_ctx.stage_specs):
|
||||||
major_idx = int(stage_spec.major_step_index)
|
major_idx = int(stage_spec.major_step_index)
|
||||||
|
|
@ -2856,6 +2961,7 @@ def _run_unified_slot_improvement_review(
|
||||||
anchor_variant_id=anchor_variant_id,
|
anchor_variant_id=anchor_variant_id,
|
||||||
used=used_other,
|
used=used_other,
|
||||||
exclude_exercise_id=exclude_id if not off_topic else int(current_id) if current_id else None,
|
exclude_exercise_id=exclude_id if not off_topic else int(current_id) if current_id else None,
|
||||||
|
max_candidates=3,
|
||||||
)
|
)
|
||||||
|
|
||||||
accepted_for_slot = False
|
accepted_for_slot = False
|
||||||
|
|
@ -2882,22 +2988,25 @@ def _run_unified_slot_improvement_review(
|
||||||
if int(raw.get("roadmap_major_step_index", -1)) == major_idx:
|
if int(raw.get("roadmap_major_step_index", -1)) == major_idx:
|
||||||
merged_steps[i] = {**raw, **candidate, "roadmap_major_step_index": major_idx}
|
merged_steps[i] = {**raw, **candidate, "roadmap_major_step_index": major_idx}
|
||||||
break
|
break
|
||||||
eval_res = _evaluate_steps_for_compare_qa(
|
eval_res = _quick_evaluate_steps_qa(
|
||||||
cur,
|
cur,
|
||||||
tenant=tenant,
|
goal_query=goal_query,
|
||||||
body=scored_eval_body,
|
semantic_brief=semantic_brief,
|
||||||
steps=merged_steps,
|
steps=merged_steps,
|
||||||
|
roadmap_ctx=roadmap_ctx,
|
||||||
)
|
)
|
||||||
projected_qa = (
|
projected_qa = eval_res if isinstance(eval_res, dict) else None
|
||||||
eval_res.get("path_qa")
|
|
||||||
if isinstance(eval_res, dict) and isinstance(eval_res.get("path_qa"), dict)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
projected_score = _path_qa_quality_score(projected_qa)
|
projected_score = _path_qa_quality_score(projected_qa)
|
||||||
delta: Optional[float] = None
|
delta = _quality_delta(baseline_score, projected_score)
|
||||||
if baseline_score is not None and projected_score is not None:
|
improves = _slot_suggestion_accepted(
|
||||||
delta = round(projected_score - baseline_score, 4)
|
baseline_qa=baseline_qa,
|
||||||
improves = _slot_diff_improves_path(diff_stub, delta, off_topic=off_topic)
|
projected_qa=projected_qa,
|
||||||
|
baseline_score=baseline_score,
|
||||||
|
projected_score=projected_score,
|
||||||
|
diff=diff_stub,
|
||||||
|
off_topic=off_topic,
|
||||||
|
major_idx=major_idx,
|
||||||
|
)
|
||||||
suggestion_type = (
|
suggestion_type = (
|
||||||
"remove_and_replace"
|
"remove_and_replace"
|
||||||
if off_topic and current_id is not None
|
if off_topic and current_id is not None
|
||||||
|
|
@ -2990,24 +3099,25 @@ def _run_unified_slot_improvement_review(
|
||||||
"proposed_title": ai_step.get("title"),
|
"proposed_title": ai_step.get("title"),
|
||||||
}
|
}
|
||||||
merged_steps = _apply_slot_diff_to_steps(baseline_steps, diff_stub, [ai_step])
|
merged_steps = _apply_slot_diff_to_steps(baseline_steps, diff_stub, [ai_step])
|
||||||
eval_res = _evaluate_steps_for_compare_qa(
|
eval_res = _quick_evaluate_steps_qa(
|
||||||
cur,
|
cur,
|
||||||
tenant=tenant,
|
goal_query=goal_query,
|
||||||
body=scored_eval_body,
|
semantic_brief=semantic_brief,
|
||||||
steps=merged_steps,
|
steps=merged_steps,
|
||||||
|
roadmap_ctx=roadmap_ctx,
|
||||||
)
|
)
|
||||||
projected_qa = (
|
projected_qa = eval_res if isinstance(eval_res, dict) else None
|
||||||
eval_res.get("path_qa")
|
|
||||||
if isinstance(eval_res, dict) and isinstance(eval_res.get("path_qa"), dict)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
projected_score = _path_qa_quality_score(projected_qa)
|
projected_score = _path_qa_quality_score(projected_qa)
|
||||||
delta = (
|
delta = _quality_delta(baseline_score, projected_score)
|
||||||
round(projected_score - baseline_score, 4)
|
improves = _slot_suggestion_accepted(
|
||||||
if baseline_score is not None and projected_score is not None
|
baseline_qa=baseline_qa,
|
||||||
else None
|
projected_qa=projected_qa,
|
||||||
|
baseline_score=baseline_score,
|
||||||
|
projected_score=projected_score,
|
||||||
|
diff=diff_stub,
|
||||||
|
off_topic=off_topic or major_idx in _off_topic_slot_indices(baseline_qa),
|
||||||
|
major_idx=major_idx,
|
||||||
)
|
)
|
||||||
improves = _slot_diff_improves_path(diff_stub, delta, off_topic=off_topic or current_id is None)
|
|
||||||
entry = {
|
entry = {
|
||||||
**diff_stub,
|
**diff_stub,
|
||||||
"baseline_slot_status": current.get("slot_status"),
|
"baseline_slot_status": current.get("slot_status"),
|
||||||
|
|
|
||||||
|
|
@ -745,12 +745,44 @@ def build_path_qa_summary(
|
||||||
f"Schritt „{o.get('title')}“ passt nicht zum Pfad-Thema"
|
f"Schritt „{o.get('title')}“ passt nicht zum Pfad-Thema"
|
||||||
for o in off_topic
|
for o in off_topic
|
||||||
]
|
]
|
||||||
|
summary["quality_score"] = compute_deterministic_path_quality_score(
|
||||||
|
gaps=gaps,
|
||||||
|
off_topic_steps=off_topic,
|
||||||
|
steps=steps,
|
||||||
|
multistage_qa=multistage_qa,
|
||||||
|
)
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
|
def compute_deterministic_path_quality_score(
    *,
    gaps: Sequence[Mapping[str, Any]],
    off_topic_steps: Sequence[Mapping[str, Any]],
    steps: Optional[Sequence[Mapping[str, Any]]] = None,
    multistage_qa: Optional[Mapping[str, Any]] = None,
) -> float:
    """Heuristic path quality score without an LLM — basis for slot comparisons.

    Starts at 0.92 and subtracts fixed penalties:

    * 0.08 per off-topic step,
    * 0.05 per gap,
    * 0.06 per empty slot (a step mapping whose ``"exercise_id"`` is ``None``
      and that is not flagged ``"is_ai_proposal"``),
    * 0.02 per optimization hint, capped at 0.14 in total.

    Args:
        gaps: Detected gaps; ``None`` or empty counts as zero.
        off_topic_steps: Off-topic steps; ``None`` or empty counts as zero.
        steps: Optional path steps, used only for the empty-slot penalty.
        multistage_qa: Optional mapping; only ``"optimization_hint_count"``
            is read from it.

    Returns:
        The score rounded to four decimals and clamped to ``[0.35, 0.98]``.
    """
    score = 0.92
    score -= 0.08 * len(off_topic_steps or [])
    score -= 0.05 * len(gaps or [])
    if steps:
        # Accept any Mapping (as annotated), not only plain dicts, so that
        # read-only or custom mappings are not silently skipped.
        empty_slots = sum(
            1
            for step in steps
            if isinstance(step, Mapping)
            and step.get("exercise_id") is None
            and not step.get("is_ai_proposal")
        )
        score -= 0.06 * empty_slots
    hint_count = int((multistage_qa or {}).get("optimization_hint_count") or 0)
    # A negative count must never turn the penalty into a bonus.
    hint_count = max(0, hint_count)
    score -= min(0.14, 0.02 * hint_count)
    return max(0.35, min(0.98, round(score, 4)))
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"apply_llm_path_reorder",
|
"apply_llm_path_reorder",
|
||||||
"build_path_qa_summary",
|
"build_path_qa_summary",
|
||||||
|
"compute_deterministic_path_quality_score",
|
||||||
"detect_off_topic_steps",
|
"detect_off_topic_steps",
|
||||||
"detect_path_gaps",
|
"detect_path_gaps",
|
||||||
"is_roadmap_planned_neighbor_pair",
|
"is_roadmap_planned_neighbor_pair",
|
||||||
|
|
|
||||||
21
backend/tests/test_planning_deterministic_quality_score.py
Normal file
21
backend/tests/test_planning_deterministic_quality_score.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
"""Deterministische Pfad-QS ohne LLM."""
|
||||||
|
from planning_exercise_path_qa import compute_deterministic_path_quality_score
|
||||||
|
|
||||||
|
|
||||||
|
def test_deterministic_quality_score_penalizes_off_topic():
    """A path with one off-topic step must score below a clean path."""
    flagged = [{"roadmap_major_step_index": 1}]
    clean_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
    )
    flagged_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=flagged,
    )
    assert flagged_score < clean_score
|
||||||
|
|
||||||
|
|
||||||
|
def test_deterministic_quality_score_penalizes_empty_slots():
    """A step without an exercise must lower the score versus no steps at all."""
    partially_filled = [{"exercise_id": None}, {"exercise_id": 1}]
    score_without_steps = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
        steps=[],
    )
    score_with_empty_slot = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
        steps=partially_filled,
    )
    assert score_with_empty_slot < score_without_steps
|
||||||
|
|
@ -500,10 +500,11 @@ export default function ProgressionGraphEditor({ graphId, embedded = false, onSa
|
||||||
unified_slot_review: true,
|
unified_slot_review: true,
|
||||||
baseline_evaluate_steps: slotsToEvaluateSteps(synced),
|
baseline_evaluate_steps: slotsToEvaluateSteps(synced),
|
||||||
include_llm_intent: false,
|
include_llm_intent: false,
|
||||||
include_llm_path_qa: false,
|
|
||||||
auto_rematch_after_qa: false,
|
auto_rematch_after_qa: false,
|
||||||
})
|
})
|
||||||
setPathQa(reviewRes?.path_qa || null)
|
const qa = reviewRes?.path_qa || null
|
||||||
|
setPathQa(qa)
|
||||||
|
setDraft((prev) => (prev ? { ...prev, lastFindings: qa } : prev))
|
||||||
|
|
||||||
const compareRes = buildProgressionComparePayload(null, reviewRes)
|
const compareRes = buildProgressionComparePayload(null, reviewRes)
|
||||||
setGapFillOffers(mergeGapOffersForDraft(synced, reviewRes, reviewRes))
|
setGapFillOffers(mergeGapOffersForDraft(synced, reviewRes, reviewRes))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user