Implement Quick Evaluation and Quality Scoring for Path QA
All checks were successful
Deploy Development / deploy (push) Successful in 40s
Test Suite / pytest-backend (push) Successful in 44s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 14s
Test Suite / k6 /health Baseline (push) Successful in 33s
Test Suite / playwright-tests (push) Successful in 1m11s

- Added `_quick_evaluate_steps_qa` function to streamline path quality assessment without recursive API calls, enhancing performance for slot comparisons.
- Introduced `compute_deterministic_path_quality_score` to provide a heuristic quality score based on gaps and off-topic steps, improving evaluation accuracy.
- Updated `_run_unified_slot_improvement_review` to utilize the new quick evaluation method, optimizing the review process and integrating quality scoring.
- Enhanced `build_path_qa_summary` to include quality score calculations, ensuring comprehensive feedback on path evaluations.
- Refactored related functions for improved clarity and efficiency in handling path quality assessments.
This commit is contained in:
Lars 2026-06-13 10:27:07 +02:00
parent 85fccdd093
commit a1e4ad66df
4 changed files with 199 additions and 35 deletions

View File

@ -36,6 +36,7 @@ from planning_stage_context import build_contextualized_stage_goal, resolve_path
from planning_exercise_path_qa import ( from planning_exercise_path_qa import (
apply_llm_path_reorder, apply_llm_path_reorder,
build_path_qa_summary, build_path_qa_summary,
compute_deterministic_path_quality_score,
detect_off_topic_steps, detect_off_topic_steps,
detect_path_gaps, detect_path_gaps,
insert_bridge_exercises, insert_bridge_exercises,
@ -2399,6 +2400,110 @@ def _evaluate_steps_for_compare_qa(
return suggest_progression_path(cur, tenant=tenant, body=eval_body) return suggest_progression_path(cur, tenant=tenant, body=eval_body)
def _quick_evaluate_steps_qa(
    cur,
    *,
    goal_query: str,
    semantic_brief: PlanningSemanticBrief,
    steps: Sequence[Mapping[str, Any]],
    roadmap_ctx: Optional[ProgressionRoadmapContext],
) -> Dict[str, Any]:
    """Run a fast path QA without the recursive API run, for slot comparisons.

    Gap and off-topic detection are called directly on ``steps``; no LLM QA
    result is fed in (``llm_qa=None``, ``llm_applied=False``) and no bridge
    inserts, AI proposals or gap-fill offers are passed to the summary.

    Args:
        cur: Cursor handed through to the gap and off-topic detectors.
        goal_query: Goal text forwarded to the off-topic detector.
        semantic_brief: Brief forwarded to both detectors.
        steps: Candidate path steps; ``None`` or empty is treated as ``[]``.
        roadmap_ctx: When not ``None`` the run is treated as roadmap-first
            and the summary is tagged with mode ``"roadmap_first_lite"``.

    Returns:
        The path QA summary dict. If the summary carries no
        ``quality_score`` (value ``None``), the deterministic heuristic
        score is filled in before returning.
    """
    is_roadmap_first = roadmap_ctx is not None
    candidate_steps = list(steps or [])

    # Detector order is kept as-is: gaps first, then off-topic steps.
    detected_gaps = detect_path_gaps(
        cur,
        candidate_steps,
        brief=semantic_brief,
        roadmap_first=is_roadmap_first,
    )
    flagged_steps = detect_off_topic_steps(
        cur,
        candidate_steps,
        brief=semantic_brief,
        goal_query=goal_query,
    )

    staged_qa = run_multistage_path_qa(
        off_topic_steps=flagged_steps,
        stripped_off_topic=[],
        gaps=detected_gaps,
        llm_qa=None,
        llm_applied=False,
    )

    if is_roadmap_first:
        qa_mode = "roadmap_first_lite"
    else:
        qa_mode = None

    summary = build_path_qa_summary(
        gaps=detected_gaps,
        bridge_inserts=[],
        ai_proposals=[],
        gap_fill_offers=[],
        off_topic_steps=flagged_steps,
        stripped_off_topic=[],
        llm_qa=None,
        llm_applied=False,
        roadmap_qa_mode=qa_mode,
        multistage_qa=staged_qa,
    )

    if summary.get("quality_score") is not None:
        return summary

    # Fallback: the summary did not provide a score, so compute it here.
    summary["quality_score"] = compute_deterministic_path_quality_score(
        gaps=detected_gaps,
        off_topic_steps=flagged_steps,
        steps=candidate_steps,
        multistage_qa=staged_qa,
    )
    return summary
def _off_topic_slot_indices(path_qa: Optional[Mapping[str, Any]]) -> Set[int]:
    """Return the slot keys that ``path_qa`` reports as off-topic.

    A missing or empty ``path_qa`` (or a missing/empty ``off_topic_steps``
    entry) is handled by passing an empty list to the grouping helper.
    """
    flagged_steps = path_qa.get("off_topic_steps") if path_qa else None
    reasons_by_slot = _off_topic_reasons_by_slot(flagged_steps or [])
    return set(reasons_by_slot.keys())
def _slot_suggestion_accepted(
    *,
    baseline_qa: Optional[Mapping[str, Any]],
    projected_qa: Optional[Mapping[str, Any]],
    baseline_score: Optional[float],
    projected_score: Optional[float],
    diff: Mapping[str, Any],
    off_topic: bool,
    major_idx: int,
) -> bool:
    """Decide whether a slot suggestion is added to the suggestion list.

    Decision order:

    1. Off-topic slot that has a baseline exercise and is flagged in the
       baseline QA: accepted outright when the projected QA no longer flags
       the slot; otherwise, if a replacement exercise is proposed, the
       improvement check runs with ``off_topic=True``.
    2. A proposed exercise exists: rejected when it equals the baseline
       exercise (compared as ``int``), otherwise the improvement check runs
       with ``off_topic=False``.
    3. Neither baseline nor proposed exercise, but the diff is marked
       ``proposed_is_ai_proposal``: the improvement check runs, treating the
       slot as off-topic if the caller said so or the baseline QA flags it.
    4. Anything else is rejected.
    """
    baseline_id = diff.get("baseline_exercise_id")
    proposed_id = diff.get("proposed_exercise_id")
    flagged_before = _off_topic_slot_indices(baseline_qa)
    flagged_after = _off_topic_slot_indices(projected_qa)

    def _improves(treat_as_off_topic):
        # Evaluated lazily so the score delta is only computed on the
        # branches that actually need the improvement check.
        return _slot_diff_improves_path(
            diff,
            _quality_delta(baseline_score, projected_score),
            off_topic=treat_as_off_topic,
        )

    has_baseline = baseline_id is not None
    has_proposal = proposed_id is not None
    slot_was_flagged = major_idx in flagged_before

    if off_topic and has_baseline and slot_was_flagged:
        if major_idx not in flagged_after:
            return True
        if has_proposal:
            return _improves(True)

    if has_proposal:
        if has_baseline and int(baseline_id) == int(proposed_id):
            return False
        return _improves(False)

    # From here on no exercise is proposed; only a pure AI proposal for an
    # otherwise empty slot can still qualify.
    if not has_baseline and diff.get("proposed_is_ai_proposal"):
        return _improves(off_topic or slot_was_flagged)

    return False
def _quality_delta(
baseline_score: Optional[float],
projected_score: Optional[float],
) -> Optional[float]:
if baseline_score is None or projected_score is None:
return None
return round(float(projected_score) - float(baseline_score), 4)
def _apply_slot_diff_to_steps( def _apply_slot_diff_to_steps(
baseline_steps: Sequence[Mapping[str, Any]], baseline_steps: Sequence[Mapping[str, Any]],
diff: Mapping[str, Any], diff: Mapping[str, Any],
@ -2784,6 +2889,14 @@ def _run_unified_slot_improvement_review(
) )
baseline_steps = list(qa_pack.get("steps") or baseline_steps) baseline_steps = list(qa_pack.get("steps") or baseline_steps)
baseline_qa = qa_pack.get("path_qa") if isinstance(qa_pack.get("path_qa"), dict) else {} baseline_qa = qa_pack.get("path_qa") if isinstance(qa_pack.get("path_qa"), dict) else {}
if baseline_qa.get("quality_score") is None:
baseline_qa = dict(baseline_qa)
baseline_qa["quality_score"] = compute_deterministic_path_quality_score(
gaps=baseline_qa.get("large_gaps") or [],
off_topic_steps=baseline_qa.get("off_topic_steps") or [],
steps=baseline_steps,
multistage_qa=baseline_qa,
)
baseline_score = _path_qa_quality_score(baseline_qa) baseline_score = _path_qa_quality_score(baseline_qa)
gap_fill_offers = list(qa_pack.get("gap_fill_offers") or []) gap_fill_offers = list(qa_pack.get("gap_fill_offers") or [])
off_topic_map = _off_topic_reasons_by_slot(baseline_qa.get("off_topic_steps") or []) off_topic_map = _off_topic_reasons_by_slot(baseline_qa.get("off_topic_steps") or [])
@ -2794,14 +2907,6 @@ def _run_unified_slot_improvement_review(
suggestions: List[Dict[str, Any]] = [] suggestions: List[Dict[str, Any]] = []
rejected: List[Dict[str, Any]] = [] rejected: List[Dict[str, Any]] = []
scored_eval_body = body.model_copy(
update={
"include_llm_path_qa": False,
"include_ai_gap_fill": False,
"auto_rematch_after_qa": False,
"include_roadmap_preview": False,
}
)
for step_index, stage_spec in enumerate(roadmap_ctx.stage_specs): for step_index, stage_spec in enumerate(roadmap_ctx.stage_specs):
major_idx = int(stage_spec.major_step_index) major_idx = int(stage_spec.major_step_index)
@ -2856,6 +2961,7 @@ def _run_unified_slot_improvement_review(
anchor_variant_id=anchor_variant_id, anchor_variant_id=anchor_variant_id,
used=used_other, used=used_other,
exclude_exercise_id=exclude_id if not off_topic else int(current_id) if current_id else None, exclude_exercise_id=exclude_id if not off_topic else int(current_id) if current_id else None,
max_candidates=3,
) )
accepted_for_slot = False accepted_for_slot = False
@ -2882,22 +2988,25 @@ def _run_unified_slot_improvement_review(
if int(raw.get("roadmap_major_step_index", -1)) == major_idx: if int(raw.get("roadmap_major_step_index", -1)) == major_idx:
merged_steps[i] = {**raw, **candidate, "roadmap_major_step_index": major_idx} merged_steps[i] = {**raw, **candidate, "roadmap_major_step_index": major_idx}
break break
eval_res = _evaluate_steps_for_compare_qa( eval_res = _quick_evaluate_steps_qa(
cur, cur,
tenant=tenant, goal_query=goal_query,
body=scored_eval_body, semantic_brief=semantic_brief,
steps=merged_steps, steps=merged_steps,
roadmap_ctx=roadmap_ctx,
) )
projected_qa = ( projected_qa = eval_res if isinstance(eval_res, dict) else None
eval_res.get("path_qa")
if isinstance(eval_res, dict) and isinstance(eval_res.get("path_qa"), dict)
else None
)
projected_score = _path_qa_quality_score(projected_qa) projected_score = _path_qa_quality_score(projected_qa)
delta: Optional[float] = None delta = _quality_delta(baseline_score, projected_score)
if baseline_score is not None and projected_score is not None: improves = _slot_suggestion_accepted(
delta = round(projected_score - baseline_score, 4) baseline_qa=baseline_qa,
improves = _slot_diff_improves_path(diff_stub, delta, off_topic=off_topic) projected_qa=projected_qa,
baseline_score=baseline_score,
projected_score=projected_score,
diff=diff_stub,
off_topic=off_topic,
major_idx=major_idx,
)
suggestion_type = ( suggestion_type = (
"remove_and_replace" "remove_and_replace"
if off_topic and current_id is not None if off_topic and current_id is not None
@ -2990,24 +3099,25 @@ def _run_unified_slot_improvement_review(
"proposed_title": ai_step.get("title"), "proposed_title": ai_step.get("title"),
} }
merged_steps = _apply_slot_diff_to_steps(baseline_steps, diff_stub, [ai_step]) merged_steps = _apply_slot_diff_to_steps(baseline_steps, diff_stub, [ai_step])
eval_res = _evaluate_steps_for_compare_qa( eval_res = _quick_evaluate_steps_qa(
cur, cur,
tenant=tenant, goal_query=goal_query,
body=scored_eval_body, semantic_brief=semantic_brief,
steps=merged_steps, steps=merged_steps,
roadmap_ctx=roadmap_ctx,
) )
projected_qa = ( projected_qa = eval_res if isinstance(eval_res, dict) else None
eval_res.get("path_qa")
if isinstance(eval_res, dict) and isinstance(eval_res.get("path_qa"), dict)
else None
)
projected_score = _path_qa_quality_score(projected_qa) projected_score = _path_qa_quality_score(projected_qa)
delta = ( delta = _quality_delta(baseline_score, projected_score)
round(projected_score - baseline_score, 4) improves = _slot_suggestion_accepted(
if baseline_score is not None and projected_score is not None baseline_qa=baseline_qa,
else None projected_qa=projected_qa,
baseline_score=baseline_score,
projected_score=projected_score,
diff=diff_stub,
off_topic=off_topic or major_idx in _off_topic_slot_indices(baseline_qa),
major_idx=major_idx,
) )
improves = _slot_diff_improves_path(diff_stub, delta, off_topic=off_topic or current_id is None)
entry = { entry = {
**diff_stub, **diff_stub,
"baseline_slot_status": current.get("slot_status"), "baseline_slot_status": current.get("slot_status"),

View File

@ -745,12 +745,44 @@ def build_path_qa_summary(
f"Schritt „{o.get('title')}“ passt nicht zum Pfad-Thema" f"Schritt „{o.get('title')}“ passt nicht zum Pfad-Thema"
for o in off_topic for o in off_topic
] ]
summary["quality_score"] = compute_deterministic_path_quality_score(
gaps=gaps,
off_topic_steps=off_topic,
steps=steps,
multistage_qa=multistage_qa,
)
return summary return summary
def compute_deterministic_path_quality_score(
    *,
    gaps: Sequence[Mapping[str, Any]],
    off_topic_steps: Sequence[Mapping[str, Any]],
    steps: Optional[Sequence[Mapping[str, Any]]] = None,
    multistage_qa: Optional[Mapping[str, Any]] = None,
) -> float:
    """Heuristic path quality score without an LLM; basis for slot comparisons.

    Starts at 0.92 and subtracts:

    * 0.08 per off-topic step,
    * 0.05 per gap,
    * 0.06 per step that has no ``exercise_id`` and is not an AI proposal,
    * 0.02 per ``optimization_hint_count`` in ``multistage_qa``, capped at
      0.14 in total.

    Args:
        gaps: Detected gaps; ``None`` or empty counts as zero.
        off_topic_steps: Off-topic findings; ``None`` or empty counts as zero.
        steps: Optional path steps used for the empty-slot penalty. Any
            mapping is inspected; non-mapping entries are ignored.
        multistage_qa: Optional mapping read for ``optimization_hint_count``.

    Returns:
        The score rounded to 4 decimals and clamped to ``[0.35, 0.98]``.
    """
    score = 0.92
    score -= 0.08 * len(off_topic_steps or [])
    score -= 0.05 * len(gaps or [])

    # The signature promises Mapping, so accept any mapping here — checking
    # for ``dict`` only would silently skip other mapping implementations.
    unfilled_slots = sum(
        1
        for step in steps or ()
        if isinstance(step, Mapping)
        and step.get("exercise_id") is None
        and not step.get("is_ai_proposal")
    )
    score -= 0.06 * unfilled_slots

    # A negative hint count must never act as a bonus on top of the base.
    hint_count = max(
        0, int((multistage_qa or {}).get("optimization_hint_count") or 0)
    )
    score -= min(0.14, 0.02 * hint_count)

    return max(0.35, min(0.98, round(score, 4)))
__all__ = [ __all__ = [
"apply_llm_path_reorder", "apply_llm_path_reorder",
"build_path_qa_summary", "build_path_qa_summary",
"compute_deterministic_path_quality_score",
"detect_off_topic_steps", "detect_off_topic_steps",
"detect_path_gaps", "detect_path_gaps",
"is_roadmap_planned_neighbor_pair", "is_roadmap_planned_neighbor_pair",

View File

@ -0,0 +1,21 @@
"""Deterministische Pfad-QS ohne LLM."""
from planning_exercise_path_qa import compute_deterministic_path_quality_score
def test_deterministic_quality_score_penalizes_off_topic():
    """A path with one off-topic step scores strictly below a clean path."""
    clean_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
    )
    flagged = [{"roadmap_major_step_index": 1}]
    flagged_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=flagged,
    )
    assert flagged_score < clean_score
def test_deterministic_quality_score_penalizes_empty_slots():
    """A step without an exercise lowers the score versus an empty step list."""
    no_steps_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
        steps=[],
    )
    one_unfilled = [{"exercise_id": None}, {"exercise_id": 1}]
    unfilled_score = compute_deterministic_path_quality_score(
        gaps=[],
        off_topic_steps=[],
        steps=one_unfilled,
    )
    assert unfilled_score < no_steps_score

View File

@ -500,10 +500,11 @@ export default function ProgressionGraphEditor({ graphId, embedded = false, onSa
unified_slot_review: true, unified_slot_review: true,
baseline_evaluate_steps: slotsToEvaluateSteps(synced), baseline_evaluate_steps: slotsToEvaluateSteps(synced),
include_llm_intent: false, include_llm_intent: false,
include_llm_path_qa: false,
auto_rematch_after_qa: false, auto_rematch_after_qa: false,
}) })
setPathQa(reviewRes?.path_qa || null) const qa = reviewRes?.path_qa || null
setPathQa(qa)
setDraft((prev) => (prev ? { ...prev, lastFindings: qa } : prev))
const compareRes = buildProgressionComparePayload(null, reviewRes) const compareRes = buildProgressionComparePayload(null, reviewRes)
setGapFillOffers(mergeGapOffersForDraft(synced, reviewRes, reviewRes)) setGapFillOffers(mergeGapOffersForDraft(synced, reviewRes, reviewRes))