# shinkan-jinkendo/backend/planning_exercise_text_signals.py

"""
Phase B: Deterministische Text→Katalog-Signale für PlanningTargetProfile.

Mappt Abschnitts-guidance, Rahmen-Ziele/-Notizen und Programmbeschreibung
auf Skill-/Katalog-Gewichte (ohne LLM).
"""
from __future__ import annotations

import re
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple

# Skill names shorter than this (after stripping) are skipped when loading
# and when matching skills against text.
_MIN_SKILL_NAME_LEN = 3
# Default cap on the number of skill hits collected by one matching call.
_MAX_SKILL_MATCHES = 12
# Default cap on the number of catalog hits collected by one matching call.
_MAX_CATALOG_MATCHES = 6


def _normalize_text_blob(*parts: Optional[str]) -> str:
    """Combine text fragments into a single lowercase blob.

    Every fragment is stripped of surrounding whitespace; ``None`` and
    fragments that are empty after stripping are dropped. The survivors are
    joined with newlines and the whole result is lowercased.
    """
    stripped = ((fragment or "").strip() for fragment in parts)
    return "\n".join(piece for piece in stripped if piece).lower()


def _load_skills_for_text_match(cur) -> List[Tuple[int, str, int]]:
    """Fetch skills usable for substring matching.

    Runs a query on ``cur`` for skills whose status is NULL or ``'active'``
    and whose name is non-blank, ordered by name length (longest first) and
    then by name.

    Returns:
        A list of ``(skill_id, lowercased_name, name_length)`` tuples, where
        the name is stripped and ``name_length`` is the length of the stripped
        name. Names shorter than ``_MIN_SKILL_NAME_LEN`` are left out.
    """
    cur.execute(
        """
        SELECT id, name FROM skills
        WHERE (status IS NULL OR status = 'active')
          AND name IS NOT NULL AND TRIM(name) <> ''
        ORDER BY LENGTH(name) DESC, name ASC
        """
    )
    # Rows are accessed by key, so the cursor must yield mapping-like rows.
    labelled = (
        (record, str(record.get("name") or "").strip())
        for record in cur.fetchall()
    )
    return [
        (int(record["id"]), label.lower(), len(label))
        for record, label in labelled
        if len(label) >= _MIN_SKILL_NAME_LEN
    ]


def _load_catalog_names(cur, table: str, id_col: str = "id", name_col: str = "name") -> List[Tuple[int, str, int]]:
    """Fetch ``(id, name)`` pairs from a catalog table for text matching.

    Args:
        cur: Cursor whose rows support key access (``row["id"]``, ``row.get``).
        table: Table to read from.
        id_col: Column holding the identifier.
        name_col: Column holding the display name.

    Returns:
        A list of ``(id, lowercased_name, name_length)`` tuples in the order
        the query returns them (longest name first, then by name). Names
        shorter than two characters after stripping are omitted.

    Note:
        ``table``, ``id_col`` and ``name_col`` are interpolated directly into
        the SQL text, so they must only ever come from trusted code.
    """
    cur.execute(
        f"""
        SELECT {id_col} AS id, {name_col} AS name
        FROM {table}
        WHERE {name_col} IS NOT NULL AND TRIM({name_col}) <> ''
        ORDER BY LENGTH({name_col}) DESC, {name_col} ASC
        """
    )
    labelled = (
        (record, str(record.get("name") or "").strip())
        for record in cur.fetchall()
    )
    return [
        (int(record["id"]), label.lower(), len(label))
        for record, label in labelled
        if len(label) >= 2
    ]


def _match_catalog_names_in_text(
    text: str,
    catalog_rows: Sequence[Tuple[int, str, int]],
    *,
    weight: float = 0.85,
    limit: int = _MAX_CATALOG_MATCHES,
) -> Dict[int, float]:
    """Find catalog names that occur as substrings of ``text``.

    Args:
        text: Text to search; names are compared as given, so callers are
            expected to pass it already lowercased.
        catalog_rows: ``(id, lowercased_name, name_length)`` tuples, scanned
            in the given order.
        weight: Weight assigned to every hit.
        limit: Scanning stops once this many distinct ids have been collected.

    Returns:
        Mapping of catalog id to weight; empty when ``text`` or
        ``catalog_rows`` is empty.
    """
    hits: Dict[int, float] = {}
    if not (text and catalog_rows):
        return hits
    for entry_id, needle, _length in catalog_rows:
        if len(hits) >= limit:
            break
        # Plain substring containment; one-character names are ignored.
        if len(needle) >= 2 and needle in text:
            previous = hits.get(entry_id, 0.0)
            hits[entry_id] = max(previous, weight)
    return hits


def _match_skills_in_text(
    text: str,
    skill_rows: Sequence[Tuple[int, str, int]],
    *,
    limit: int = _MAX_SKILL_MATCHES,
) -> Dict[int, float]:
    """Find skill names that occur as substrings of ``text`` and weight them.

    Args:
        text: Text to search; names are compared as given, so callers are
            expected to pass it already lowercased.
        skill_rows: ``(skill_id, lowercased_name, name_length)`` tuples,
            scanned in the given order.
        limit: Scanning stops once this many distinct ids have been collected.

    Returns:
        Mapping of skill id to weight. The weight grows with the name length
        (capped at 20 characters) starting from 0.72 and never exceeds 1.0.
        Empty when ``text`` or ``skill_rows`` is empty.
    """
    hits: Dict[int, float] = {}
    if not (text and skill_rows):
        return hits
    for skill_id, needle, needle_len in skill_rows:
        if len(hits) >= limit:
            break
        if needle_len < _MIN_SKILL_NAME_LEN or needle not in text:
            continue
        # Longer names are more specific, so they earn a higher weight.
        capped_len = min(needle_len, 20)
        score = min(1.0, 0.72 + capped_len * 0.012)
        hits[skill_id] = max(hits.get(skill_id, 0.0), score)
    return hits


def load_framework_planning_text_parts(
    cur,
    framework_program_id: int,
    *,
    slot_id: Optional[int] = None,
) -> List[str]:
    """Collect the framework texts used for text-signal matching.

    Gathers, in this order: the program description, then title and notes of
    every goal of the program (ordered by ``sort_order``), and finally, if a
    truthy ``slot_id`` is given, title and notes of that slot. Values are
    stripped; blank or missing ones are skipped.

    Args:
        cur: Cursor using ``%s`` placeholders whose rows support ``.get``.
        framework_program_id: Id of the framework program.
        slot_id: Optional id of a framework slot to include.

    Returns:
        The non-empty text fragments as a list of strings.
    """

    def _stripped(record, key: str) -> str:
        return (record.get(key) or "").strip()

    program_key = (int(framework_program_id),)
    collected: List[str] = []

    cur.execute(
        "SELECT description FROM training_framework_programs WHERE id = %s",
        program_key,
    )
    program = cur.fetchone()
    if program and _stripped(program, "description"):
        collected.append(str(program["description"]).strip())

    cur.execute(
        """
        SELECT title, notes FROM training_framework_goals
        WHERE framework_program_id = %s
        ORDER BY sort_order ASC
        """,
        program_key,
    )
    for goal in cur.fetchall():
        goal_texts = (_stripped(goal, "title"), _stripped(goal, "notes"))
        collected.extend(fragment for fragment in goal_texts if fragment)

    if not slot_id:
        return collected

    cur.execute(
        "SELECT title, notes FROM training_framework_slots WHERE id = %s",
        (int(slot_id),),
    )
    slot = cur.fetchone()
    if slot:
        slot_texts = (_stripped(slot, "title"), _stripped(slot, "notes"))
        collected.extend(fragment for fragment in slot_texts if fragment)

    return collected


def resolve_planning_text_to_catalog_weights(
    cur,
    text_blob: str,
) -> Tuple[Dict[int, float], Dict[int, float], Dict[int, float], Dict[int, float], Dict[int, float]]:
    """Translate free text into weighted catalog and skill ids.

    The text is normalized (stripped, lowercased) and searched for the names
    of skills, focus areas, style directions, training types and target
    groups loaded through ``cur``.

    Args:
        cur: Database cursor used to load the skill and catalog names.
        text_blob: Free text to analyse.

    Returns:
        A 5-tuple of ``{id: weight}`` maps in the order
        ``(focus, style, training_type, target_group, skill)``. All five are
        empty when the normalized text has fewer than three characters; in
        that case no query is issued.
    """
    text = _normalize_text_blob(text_blob)
    if len(text) < 3:
        return {}, {}, {}, {}, {}

    skill_rows = _load_skills_for_text_match(cur)
    focus_rows = _load_catalog_names(cur, "focus_areas")
    style_rows = _load_catalog_names(cur, "style_directions")
    type_rows = _load_catalog_names(cur, "training_types")
    group_rows = _load_catalog_names(cur, "target_groups")

    skill_weights = _match_skills_in_text(text, skill_rows)
    focus_weights = _match_catalog_names_in_text(text, focus_rows, weight=0.88)
    style_weights = _match_catalog_names_in_text(text, style_rows, weight=0.82)
    type_weights = _match_catalog_names_in_text(text, type_rows, weight=0.82)
    group_weights = _match_catalog_names_in_text(text, group_rows, weight=0.8)

    # Partner-work wording boosts the first target group whose name mentions
    # "partner" or "paar", even if that group name is not in the text itself.
    if re.search(r"\bpartner\b|\bpaar\b|\bpaarweise\b|\bzu zweit\b", text):
        partner_group = next(
            (
                group_id
                for group_id, group_name, _length in group_rows
                if "partner" in group_name or "paar" in group_name
            ),
            None,
        )
        if partner_group is not None:
            group_weights[partner_group] = max(
                group_weights.get(partner_group, 0.0), 0.9
            )

    return focus_weights, style_weights, type_weights, group_weights, skill_weights


def merge_text_signal_summary(
    summary: Mapping[str, Any],
    *,
    text_sources: Sequence[str],
    matched_skills: Sequence[Mapping[str, Any]],
) -> Dict[str, Any]:
    """Return a copy of ``summary`` enriched with text-signal details.

    Args:
        summary: Existing summary; it is not modified.
        text_sources: Stored under ``"text_signal_sources"`` when non-empty.
        matched_skills: The first eight entries are stored under
            ``"text_signal_skills"`` when non-empty.

    Returns:
        A new dict with the entries of ``summary`` plus the optional keys.
    """
    extras: Dict[str, Any] = {}
    if text_sources:
        extras["text_signal_sources"] = [*text_sources]
    if matched_skills:
        # Only the first eight matched skills are kept in the summary.
        extras["text_signal_skills"] = [*matched_skills][:8]
    return {**summary, **extras}


# Names exported by ``from <module> import *``; the underscore-prefixed
# helpers above are not listed here.
__all__ = [
    "load_framework_planning_text_parts",
    "merge_text_signal_summary",
    "resolve_planning_text_to_catalog_weights",
]