scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 14:08:39 +02:00
parent 2567d8c786
commit 6bab3cdf04

View File

@ -10,22 +10,18 @@ Beschreibung:
- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
v2.3.3 Änderungen ggü. 2.3.2:
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung):
Titel: _ → Leerzeichen, Gedankenstriche → Bindestrich
summary/execution/notes: Whitespace kollabieren
keywords: dedupliziert (case-insensitiv) & sortiert
duration_minutes: sicher als int
- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
(verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
- Kein API-/CLI-Bruch
v2.3.4 Änderungen ggü. 2.3.3:
- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen
(z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox").
- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über
mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf).
- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates.
"""
import os
import sys
import argparse
from typing import Dict, Any, Tuple, Optional
from typing import Dict, Any, Tuple, Optional, List
from collections.abc import Mapping
import requests
import mwparserfromhell
@ -33,6 +29,7 @@ from dotenv import load_dotenv
import hashlib
import json
import time
import unicodedata
# ----- Konfiguration / Defaults -----
load_dotenv() # .env laden, falls vorhanden
@ -43,7 +40,41 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
# ---- Hilfsfunktionen für Wiki-Router ----
# ---- Unicode-/Key-Normalisierung ----
def _norm_unicode(s: str) -> str:
return unicodedata.normalize("NFKC", s)
def _strip_diacritics(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
def _norm_key(s: str) -> str:
s = _norm_unicode(s or "")
s = _strip_diacritics(s)
s = s.strip().casefold()
return s
# Template-Aliasse (normalisierte Namen)
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"}
TPL_SKILLDEV = {"skilldevelopment"}
# Synonyme für Parameter (normalisierte Keys)
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "keywords", "tags"]
KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "material"]
KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline"]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
# ---- Hilfsfunktionen ----
def wiki_health() -> None:
r = requests.get(f"{API_BASE_URL}/health", timeout=15)
@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
wikicode = mwparserfromhell.parse(wikitext)
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
# Templates sammeln (robust gegen Varianten)
for tpl in wikicode.filter_templates():
name = str(tpl.name).strip()
if name == "ÜbungInfoBox":
name_raw = str(tpl.name)
name_norm = _norm_key(name_raw)
if name_norm in TPL_UEBUNG_INFOBOX:
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
elif name == "Übungsbeschreibung":
elif name_norm in TPL_UEBUNGSBESCHREIBUNG:
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
elif name == "SkillDevelopment":
elif name_norm in TPL_SKILLDEV:
raw.setdefault("capabilities", [])
# Standard-Keys (engl. Template)
def _getp(t, k):
try:
return str(t.get(k).value).strip()
except Exception:
return ""
cap = _getp(tpl, "PrimaryCapability")
lvl = _getp(tpl, "CapabilityLevel")
try:
cap = str(tpl.get("PrimaryCapability").value).strip()
lvl_i = int(lvl)
except Exception:
cap = ""
try:
lvl = int(str(tpl.get("CapabilityLevel").value).strip())
except Exception:
lvl = 0
lvl_i = 0
if cap:
raw["capabilities"].append({"capability": cap, "level": lvl})
raw["capabilities"].append({"capability": cap, "level": lvl_i})
raw["wikitext"] = wikitext
return raw
# ---- Fingerprint-Unterstützung (stabil) ----
def _normalize(v: Any) -> str:
@ -138,22 +175,17 @@ def _norm_text(s: str) -> str:
def _canon_title(t: str) -> str:
t = (t or "").strip().replace("_", " ")
# Gedankenstriche vereinheitlichen
return t.replace("", "-").replace("", "-")
def compute_fingerprint(payload: Dict[str, Any]) -> str:
# keywords stabilisieren: trim, dedupe (case-insensitiv), sort
kws = payload.get("keywords") or []
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
# dauer als int
dur = payload.get("duration_minutes") or 0
try:
dur = int(round(float(dur)))
except Exception:
dur = 0
fields = [
_canon_title(payload.get("title", "")),
_norm_text(payload.get("summary", "")),
@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
base = "|".join(_normalize(f) for f in fields)
return hashlib.sha256(base.encode("utf-8")).hexdigest()
# ---- Feldauflösung (Synonyme) ----
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
    """Re-key *d* with _norm_key-normalized keys; non-string keys are dropped.

    On normalized-key collisions the later entry wins (dict iteration order).
    """
    out: Dict[str, Any] = {}
    for key, value in d.items():
        if isinstance(key, str):
            out[_norm_key(key)] = value
    return out
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
    """Return the first non-empty value for any candidate key, else None.

    *candidates* must already be normalized keys (see the KEYS_* lists);
    *d* is normalized on the fly via _norm_keymap.
    """
    normalized = _norm_keymap(d)
    for key in candidates:
        value = normalized.get(key)
        if value not in (None, ""):
            return value
    return None
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
# Exercise.capabilities erwartet Dict[str,int]
caps_list = raw.get("capabilities", [])
capabilities = {}
capabilities: Dict[str, int] = {}
for c in caps_list:
cap = c.get("capability")
lvl = c.get("level")
@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
except Exception:
pass
# Defaults/Fallbacks
try:
duration = float(raw.get("Dauer", 0) or 0)
except Exception:
duration = 0.0
# Defaults/Fallbacks via Synonyme
# summary / execution
summary = _get_first(raw, KEYS_SUMMARY) or ""
execution = _get_first(raw, KEYS_EXECUTION) or ""
keywords = []
kw_raw = raw.get("Schlüsselworte", "")
# duration
duration = _get_first(raw, KEYS_DURATION)
try:
duration_f = float(duration or 0)
except Exception:
duration_f = 0.0
# keywords
kw_raw = _get_first(raw, KEYS_KEYWORDS)
keywords: List[str] = []
if isinstance(kw_raw, str):
keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]
equipment = []
eq_raw = raw.get("equipment", [])
# equipment
eq_raw = _get_first(raw, KEYS_EQUIPMENT)
equipment: List[str] = []
if isinstance(eq_raw, str):
equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
elif isinstance(eq_raw, list):
equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
notes = raw.get("Hinweise", "") or ""
notes = _get_first(raw, KEYS_NOTES) or ""
if mutate:
notes = (notes + " [auto-update]").strip()
notes = (str(notes) + " [auto-update]").strip()
discipline = _get_first(raw, KEYS_DISCIPLINE) or ""
group = _get_first(raw, KEYS_GROUP) or None
age_group = _get_first(raw, KEYS_AGE_GROUP) or ""
target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
purpose = _get_first(raw, KEYS_PURPOSE) or ""
preparation = _get_first(raw, KEYS_PREPARATION) or ""
method = _get_first(raw, KEYS_METHOD) or ""
payload: Dict[str, Any] = {
"title": raw.get("title") or "",
"summary": raw.get("Summary", "") or "",
"short_description": raw.get("Summary", "") or "",
"summary": str(summary) or "",
"short_description": str(summary) or "",
"keywords": keywords,
"link": fullurl or "",
"discipline": raw.get("Übungstyp", "") or "",
"group": raw.get("Gruppengröße", "") or None,
"age_group": raw.get("Altersgruppe", "") or "",
"target_group": raw.get("Zielgruppe", "") or "",
"discipline": str(discipline) or "",
"group": str(group) if group else None,
"age_group": str(age_group) or "",
"target_group": str(target_group) or "",
"min_participants": 1,
"duration_minutes": int(round(duration)),
"duration_minutes": int(round(duration_f)),
"capabilities": capabilities,
"category": category or "",
"purpose": raw.get("Ziel", "") or "",
"execution": raw.get("Durchführung", "") or "",
"notes": notes,
"preparation": raw.get("RefMethode", "") or "",
"method": raw.get("method", "") or "",
"purpose": str(purpose) or "",
"execution": str(execution) or "",
"notes": str(notes) or "",
"preparation": str(preparation) or "",
"method": str(method) or "",
"equipment": equipment,
"fullurl": fullurl or "",
"external_id": f"mw:{raw.get('pageid')}",
@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
payload["fingerprint"] = compute_fingerprint(payload)
return payload
# ---- Lookup/Upsert ----
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
url = f"{EXERCISE_API}/by-external-id"
@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
}
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Print a field-level diff of the fingerprint-relevant fields.

    Text fields are normalized the same way the fingerprint normalizes
    them, so only differences that would actually change the hash are
    reported.
    """
    keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]

    def _sorted_keywords(values):
        # Dedupe exactly, then sort case-insensitively for a stable compare.
        return sorted({(kw or "").strip() for kw in (values or [])}, key=str.casefold)

    def _normalized_view(payload):
        # One side of the comparison, restricted to the hash fields.
        return {
            "title": _canon_title(payload.get("title")),
            "summary": _norm_text(payload.get("summary")),
            "execution": _norm_text(payload.get("execution")),
            "notes": _norm_text(payload.get("notes")),
            "duration_minutes": payload.get("duration_minutes"),
            "capabilities": payload.get("capabilities"),
            "keywords": _sorted_keywords(payload.get("keywords")),
        }

    b_norm = _normalized_view(before)
    a_norm = _normalized_view(after)
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
title = payload.get("title", "<ohne Titel>")
ext_id = payload.get("external_id")
@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
action, reason = "create", "unexpected lookup type"
if dry_run:
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}")
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}")
if action == "update":
_print_diff(found_payload, payload)
return action
@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
pass
else:
resp.raise_for_status()
print(f"[Update] '{title}' {reason} -> OK")
print(f"[Update] '{title}' - {reason} -> OK")
_print_diff(found_payload, payload)
else:
print(f"[Skip] '{title}' {reason}")
print(f"[Skip] '{title}' - {reason}")
return action
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Small field diff over the hash-relevant fields (diagnostics).

    NOTE(review): a second ``_print_diff`` with the same signature exists
    earlier in this file; since this one is defined later it shadows the
    earlier definition at import time — confirm which version is intended
    and remove the duplicate.
    """
    keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]
    b = {k: before.get(k) for k in keys}
    a = {k: after.get(k) for k in keys}
    # Normalize the text fields (same normalization the fingerprint uses)
    # so the printed diff only shows hash-relevant differences.
    b_norm = {
        "title": _canon_title(b.get("title")),
        "summary": _norm_text(b.get("summary")),
        "execution": _norm_text(b.get("execution")),
        "notes": _norm_text(b.get("notes")),
        "duration_minutes": b.get("duration_minutes"),
        "capabilities": b.get("capabilities"),
        "keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold),
    }
    a_norm = {
        "title": _canon_title(a.get("title")),
        "summary": _norm_text(a.get("summary")),
        "execution": _norm_text(a.get("execution")),
        "notes": _norm_text(a.get("notes")),
        "duration_minutes": a.get("duration_minutes"),
        "capabilities": a.get("capabilities"),
        "keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold),
    }
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
# ----- Orchestrierung -----
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: