diff --git a/scripts/wiki_importer.py b/scripts/wiki_importer.py index 1e5eb7e..3203a03 100644 --- a/scripts/wiki_importer.py +++ b/scripts/wiki_importer.py @@ -10,22 +10,18 @@ Beschreibung: - Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern - Smoke-Test (--smoke-test): 3 Läufe (create → skip → update) -v2.3.3 – Änderungen ggü. 2.3.2: -- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung): - • Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich - • summary/execution/notes: Whitespace kollabieren - • keywords: dedupliziert (case-insensitiv) & sortiert - • duration_minutes: sicher als int -- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload - (verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint) -- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update -- Kein API-/CLI-Bruch +v2.3.4 – Änderungen ggü. 2.3.3: +- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen + (z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox"). +- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über + mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf). +- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates. 
""" import os import sys import argparse -from typing import Dict, Any, Tuple, Optional +from typing import Dict, Any, Tuple, Optional, List from collections.abc import Mapping import requests import mwparserfromhell @@ -33,6 +29,7 @@ from dotenv import load_dotenv import hashlib import json import time +import unicodedata # ----- Konfiguration / Defaults ----- load_dotenv() # .env laden, falls vorhanden @@ -43,7 +40,41 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen") DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60")) -# ---- Hilfsfunktionen für Wiki-Router ---- +# ---- Unicode-/Key-Normalisierung ---- + +def _norm_unicode(s: str) -> str: + return unicodedata.normalize("NFKC", s) + +def _strip_diacritics(s: str) -> str: + return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch)) + +def _norm_key(s: str) -> str: + s = _norm_unicode(s or "") + s = _strip_diacritics(s) + s = s.strip().casefold() + return s + +# Template-Aliasse (normalisierte Namen) +TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"} +TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"} +TPL_SKILLDEV = {"skilldevelopment"} + +# Synonyme für Parameter (normalisierte Keys) +KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"] +KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"] +KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"] +KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "keywords", "tags"] +KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "material"] +KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline"] +KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"] +KEYS_AGE_GROUP = ["altersgruppe"] +KEYS_TARGET_GROUP = ["zielgruppe", "target_group"] +KEYS_PURPOSE = ["ziel", "zweck", "purpose"] +KEYS_PREPARATION = ["refmethode", "vorbereitung", 
"preparation"] +KEYS_METHOD = ["method", "methode"] +KEYS_NOTES = ["hinweise", "notes"] + +# ---- Hilfsfunktionen ---- def wiki_health() -> None: r = requests.get(f"{API_BASE_URL}/health", timeout=15) @@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: wikicode = mwparserfromhell.parse(wikitext) raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} + + # Templates sammeln (robust gegen Varianten) for tpl in wikicode.filter_templates(): - name = str(tpl.name).strip() - if name == "ÜbungInfoBox": + name_raw = str(tpl.name) + name_norm = _norm_key(name_raw) + if name_norm in TPL_UEBUNG_INFOBOX: for p in tpl.params: raw[str(p.name).strip()] = str(p.value).strip() - elif name == "Übungsbeschreibung": + elif name_norm in TPL_UEBUNGSBESCHREIBUNG: for p in tpl.params: raw[str(p.name).strip()] = str(p.value).strip() - elif name == "SkillDevelopment": + elif name_norm in TPL_SKILLDEV: raw.setdefault("capabilities", []) + # Standard-Keys (engl. 
Template) + def _getp(t, k): + try: + return str(t.get(k).value).strip() + except Exception: + return "" + cap = _getp(tpl, "PrimaryCapability") + lvl = _getp(tpl, "CapabilityLevel") try: - cap = str(tpl.get("PrimaryCapability").value).strip() + lvl_i = int(lvl) except Exception: - cap = "" - try: - lvl = int(str(tpl.get("CapabilityLevel").value).strip()) - except Exception: - lvl = 0 + lvl_i = 0 if cap: - raw["capabilities"].append({"capability": cap, "level": lvl}) + raw["capabilities"].append({"capability": cap, "level": lvl_i}) raw["wikitext"] = wikitext return raw - # ---- Fingerprint-Unterstützung (stabil) ---- def _normalize(v: Any) -> str: @@ -138,22 +175,17 @@ def _norm_text(s: str) -> str: def _canon_title(t: str) -> str: t = (t or "").strip().replace("_", " ") - # Gedankenstriche vereinheitlichen return t.replace("–", "-").replace("—", "-") def compute_fingerprint(payload: Dict[str, Any]) -> str: - # keywords stabilisieren: trim, dedupe (case-insensitiv), sort kws = payload.get("keywords") or [] kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) - - # dauer als int dur = payload.get("duration_minutes") or 0 try: dur = int(round(float(dur))) except Exception: dur = 0 - fields = [ _canon_title(payload.get("title", "")), _norm_text(payload.get("summary", "")), @@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str: base = "|".join(_normalize(f) for f in fields) return hashlib.sha256(base.encode("utf-8")).hexdigest() +# ---- Field resolution (synonyms) ---- + +def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]: + return { _norm_key(k): v for k, v in d.items() if isinstance(k, str) } + + +def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any: + m = _norm_keymap(d) + for c in candidates: + v = m.get(_norm_key(c)) # candidate must pass through _norm_key too: the map keys are diacritic-stripped and casefolded + if v not in (None, ""): + return v + return None + def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]: # Exercise.capabilities 
erwartet Dict[str,int] caps_list = raw.get("capabilities", []) - capabilities = {} + capabilities: Dict[str, int] = {} for c in caps_list: cap = c.get("capability") lvl = c.get("level") @@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b except Exception: pass - # Defaults/Fallbacks - try: - duration = float(raw.get("Dauer", 0) or 0) - except Exception: - duration = 0.0 + # Defaults/Fallbacks via Synonyme + # summary / execution + summary = _get_first(raw, KEYS_SUMMARY) or "" + execution = _get_first(raw, KEYS_EXECUTION) or "" - keywords = [] - kw_raw = raw.get("Schlüsselworte", "") + # duration + duration = _get_first(raw, KEYS_DURATION) + try: + duration_f = float(duration or 0) + except Exception: + duration_f = 0.0 + + # keywords + kw_raw = _get_first(raw, KEYS_KEYWORDS) + keywords: List[str] = [] if isinstance(kw_raw, str): keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] - equipment = [] - eq_raw = raw.get("equipment", []) + # equipment + eq_raw = _get_first(raw, KEYS_EQUIPMENT) + equipment: List[str] = [] if isinstance(eq_raw, str): equipment = [e.strip() for e in eq_raw.split(",") if e.strip()] elif isinstance(eq_raw, list): equipment = [str(e).strip() for e in eq_raw if str(e).strip()] - notes = raw.get("Hinweise", "") or "" + notes = _get_first(raw, KEYS_NOTES) or "" if mutate: - notes = (notes + " [auto-update]").strip() + notes = (str(notes) + " [auto-update]").strip() + + discipline = _get_first(raw, KEYS_DISCIPLINE) or "" + group = _get_first(raw, KEYS_GROUP) or None + age_group = _get_first(raw, KEYS_AGE_GROUP) or "" + target_group = _get_first(raw, KEYS_TARGET_GROUP) or "" + purpose = _get_first(raw, KEYS_PURPOSE) or "" + preparation = _get_first(raw, KEYS_PREPARATION) or "" + method = _get_first(raw, KEYS_METHOD) or "" payload: Dict[str, Any] = { "title": raw.get("title") or "", - "summary": raw.get("Summary", "") or "", - "short_description": raw.get("Summary", "") or "", + 
"summary": str(summary) or "", + "short_description": str(summary) or "", "keywords": keywords, "link": fullurl or "", - "discipline": raw.get("Übungstyp", "") or "", - "group": raw.get("Gruppengröße", "") or None, - "age_group": raw.get("Altersgruppe", "") or "", - "target_group": raw.get("Zielgruppe", "") or "", + "discipline": str(discipline) or "", + "group": str(group) if group else None, + "age_group": str(age_group) or "", + "target_group": str(target_group) or "", "min_participants": 1, - "duration_minutes": int(round(duration)), + "duration_minutes": int(round(duration_f)), "capabilities": capabilities, "category": category or "", - "purpose": raw.get("Ziel", "") or "", - "execution": raw.get("Durchführung", "") or "", - "notes": notes, - "preparation": raw.get("RefMethode", "") or "", - "method": raw.get("method", "") or "", + "purpose": str(purpose) or "", + "execution": str(execution) or "", + "notes": str(notes) or "", + "preparation": str(preparation) or "", + "method": str(method) or "", "equipment": equipment, "fullurl": fullurl or "", "external_id": f"mw:{raw.get('pageid')}", @@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b payload["fingerprint"] = compute_fingerprint(payload) return payload +# ---- Lookup/Upsert ---- def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: url = f"{EXERCISE_API}/by-external-id" @@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]: } +def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None: + keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"] + b = {k: before.get(k) for k in keys} + a = {k: after.get(k) for k in keys} + def _kws(x): + return sorted({(k or "").strip() for k in (x or [])}, key=str.casefold) + b_norm = { + "title": _canon_title(b.get("title")), + "summary": _norm_text(b.get("summary")), + "execution": 
_norm_text(b.get("execution")), + "notes": _norm_text(b.get("notes")), + "duration_minutes": b.get("duration_minutes"), + "capabilities": b.get("capabilities"), + "keywords": _kws(b.get("keywords")), + } + a_norm = { + "title": _canon_title(a.get("title")), + "summary": _norm_text(a.get("summary")), + "execution": _norm_text(a.get("execution")), + "notes": _norm_text(a.get("notes")), + "duration_minutes": a.get("duration_minutes"), + "capabilities": a.get("capabilities"), + "keywords": _kws(a.get("keywords")), + } + diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)} + if diff: + print("[Diff] changes:", json.dumps(diff, ensure_ascii=False)) + else: + print("[Diff] (none in hash fields)") + + def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: title = payload.get("title", "") ext_id = payload.get("external_id") @@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: action, reason = "create", "unexpected lookup type" if dry_run: - print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}") + print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}") if action == "update": _print_diff(found_payload, payload) return action @@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: pass else: resp.raise_for_status() - print(f"[Update] '{title}' – {reason} -> OK") + print(f"[Update] '{title}' - {reason} -> OK") _print_diff(found_payload, payload) else: - print(f"[Skip] '{title}' – {reason}") + print(f"[Skip] '{title}' - {reason}") return action - -def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None: - """Kleines Feld-Diff für die Hash-Felder (Diagnose).""" - keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"] - b = {k: before.get(k) for k in keys} - a = {k: after.get(k) for k in keys} - # für bessere Lesbarkeit normalisieren wir die Textfelder - 
b_norm = { - "title": _canon_title(b.get("title")), - "summary": _norm_text(b.get("summary")), - "execution": _norm_text(b.get("execution")), - "notes": _norm_text(b.get("notes")), - "duration_minutes": b.get("duration_minutes"), - "capabilities": b.get("capabilities"), - "keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold), - } - a_norm = { - "title": _canon_title(a.get("title")), - "summary": _norm_text(a.get("summary")), - "execution": _norm_text(a.get("execution")), - "notes": _norm_text(a.get("notes")), - "duration_minutes": a.get("duration_minutes"), - "capabilities": a.get("capabilities"), - "keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold), - } - diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)} - if diff: - print("[Diff] changes:", json.dumps(diff, ensure_ascii=False)) - else: - print("[Diff] (none in hash fields)") - - # ----- Orchestrierung ----- def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: