scripts/wiki_importer.py aktualisiert

2025-08-11 14:08:39 +02:00 · 2025-08-11 14:08:39 +02:00 · 6bab3cdf04
commit 6bab3cdf04
parent 2567d8c786
1 changed file with 150 additions and 88 deletions
--- a/scripts/wiki_importer.py
+++ b/scripts/wiki_importer.py
@ -10,22 +10,18 @@ Beschreibung:
 - Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
 - Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
-v2.3.3 – Änderungen ggü. 2.3.2:
+v2.3.4 – Änderungen ggü. 2.3.3:
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung):
+- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen
-  • Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich
+  (z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox").
-  • summary/execution/notes: Whitespace kollabieren
+- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über
-  • keywords: dedupliziert (case-insensitiv) & sortiert
+  mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf).
-  • duration_minutes: sicher als int
+- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates.
 - Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
  (verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
 - Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
 - Kein API-/CLI-Bruch
 """
 import os
 import sys
 import argparse
-from typing import Dict, Any, Tuple, Optional
+from typing import Dict, Any, Tuple, Optional, List
 from collections.abc import Mapping
 import requests
 import mwparserfromhell
@ -33,6 +29,7 @@ from dotenv import load_dotenv
 import hashlib
 import json
 import time
 import unicodedata
 # ----- Konfiguration / Defaults -----
 load_dotenv()  # .env laden, falls vorhanden
@ -43,7 +40,41 @@ DEFAULT_CAT    = os.getenv("WIKI_CATEGORY", "Übungen")
 DEFAULT_TITLE  = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
 REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
-# ---- Hilfsfunktionen für Wiki-Router ----
+# ---- Unicode-/Key-Normalisierung ----
 def _norm_unicode(s: str) -> str:
    return unicodedata.normalize("NFKC", s)
 def _strip_diacritics(s: str) -> str:
    return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
 def _norm_key(s: str) -> str:
    s = _norm_unicode(s or "")
    s = _strip_diacritics(s)
    s = s.strip().casefold()
    return s
 # Template aliases (normalized names).
 # These sets are compared against `_norm_key(template_name)`, so every entry
 # must already be in `_norm_key` form (lower-case, diacritics stripped):
 # "ÜbungInfoBox" -> "ubunginfobox"; the "ue" spelling is listed separately.
 TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"}
 TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"}
 TPL_SKILLDEV = {"skilldevelopment"}
 # Synonyms for template parameters (normalized keys).
 # Each list is ordered by priority: the first candidate that yields a value wins.
 # NOTE(review): entries containing umlauts or "ß" (e.g. "durchführung",
 # "schlüsselworte", "gruppengröße") are NOT in `_norm_key` form, because
 # `_norm_key` strips diacritics and casefolds "ß" to "ss". They can only match
 # if the lookup normalizes the candidate as well - confirm against `_get_first`.
 KEYS_SUMMARY       = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
 KEYS_EXECUTION     = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"]
 KEYS_DURATION      = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
 KEYS_KEYWORDS      = ["schlüsselworte", "schluesselworte", "keywords", "tags"]
 KEYS_EQUIPMENT     = ["equipment", "geräte", "geraete", "material"]
 KEYS_DISCIPLINE    = ["übungstyp", "uebungstyp", "discipline"]
 KEYS_GROUP         = ["gruppengröße", "gruppengroesse", "group"]
 KEYS_AGE_GROUP     = ["altersgruppe"]
 KEYS_TARGET_GROUP  = ["zielgruppe", "target_group"]
 KEYS_PURPOSE       = ["ziel", "zweck", "purpose"]
 KEYS_PREPARATION   = ["refmethode", "vorbereitung", "preparation"]
 KEYS_METHOD        = ["method", "methode"]
 KEYS_NOTES         = ["hinweise", "notes"]
 # ---- Hilfsfunktionen ----
 def wiki_health() -> None:
    r = requests.get(f"{API_BASE_URL}/health", timeout=15)
@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    wikicode = mwparserfromhell.parse(wikitext)
    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    # Templates sammeln (robust gegen Varianten)
    for tpl in wikicode.filter_templates():
-        name = str(tpl.name).strip()
+        name_raw = str(tpl.name)
-        if name == "ÜbungInfoBox":
+        name_norm = _norm_key(name_raw)
        if name_norm in TPL_UEBUNG_INFOBOX:
            for p in tpl.params:
                raw[str(p.name).strip()] = str(p.value).strip()
-        elif name == "Übungsbeschreibung":
+        elif name_norm in TPL_UEBUNGSBESCHREIBUNG:
            for p in tpl.params:
                raw[str(p.name).strip()] = str(p.value).strip()
-        elif name == "SkillDevelopment":
+        elif name_norm in TPL_SKILLDEV:
            raw.setdefault("capabilities", [])
            # Standard-Keys (engl. Template)
            def _getp(t, k):
                try:
                    return str(t.get(k).value).strip()
                except Exception:
                    return ""
            cap = _getp(tpl, "PrimaryCapability")
            lvl = _getp(tpl, "CapabilityLevel")
            try:
-                cap = str(tpl.get("PrimaryCapability").value).strip()
+                lvl_i = int(lvl)
            except Exception:
-                cap = ""
+                lvl_i = 0
            try:
                lvl = int(str(tpl.get("CapabilityLevel").value).strip())
            except Exception:
                lvl = 0
            if cap:
-                raw["capabilities"].append({"capability": cap, "level": lvl})
+                raw["capabilities"].append({"capability": cap, "level": lvl_i})
    raw["wikitext"] = wikitext
    return raw
 # ---- Fingerprint-Unterstützung (stabil) ----
 def _normalize(v: Any) -> str:
@ -138,22 +175,17 @@ def _norm_text(s: str) -> str:
 def _canon_title(t: str) -> str:
    t = (t or "").strip().replace("_", " ")
    # Gedankenstriche vereinheitlichen
    return t.replace("–", "-").replace("—", "-")
 def compute_fingerprint(payload: Dict[str, Any]) -> str:
    # keywords stabilisieren: trim, dedupe (case-insensitiv), sort
    kws = payload.get("keywords") or []
    kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
    # dauer als int
    dur = payload.get("duration_minutes") or 0
    try:
        dur = int(round(float(dur)))
    except Exception:
        dur = 0
    fields = [
        _canon_title(payload.get("title", "")),
        _norm_text(payload.get("summary", "")),
@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
    base = "|".join(_normalize(f) for f in fields)
    return hashlib.sha256(base.encode("utf-8")).hexdigest()
 # ---- Feldauflösung (Synonyme) ----
 def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
     """Return a copy of *d* whose string keys are replaced by their `_norm_key` form.

     Non-string keys are dropped. If two keys normalize to the same name, the
     one that comes later in *d* wins.
     """
     normalized: Dict[str, Any] = {}
     for key, value in d.items():
         if not isinstance(key, str):
             continue
         normalized[_norm_key(key)] = value
     return normalized
 def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
     """Return the first usable value stored in *d* under any of *candidates*.

     Both the keys of *d* and the candidate names are passed through
     `_norm_key` before they are compared, so the match is case- and
     diacritic-insensitive ("Durchführung" matches the candidate "durchführung").

     Args:
         d: Raw parameter mapping, e.g. the parsed template parameters.
         candidates: Accepted parameter names in priority order.

     Returns:
         The value of the first candidate whose entry is neither ``None`` nor
         the empty string, or ``None`` if no candidate yields one.
     """
     m = _norm_keymap(d)
     for c in candidates:
         # The map keys have diacritics stripped, so a candidate written with
         # umlauts or "ß" would never match when looked up verbatim. Normalizing
         # is a no-op for candidates that are already in normalized form.
         v = m.get(_norm_key(c))
         if v not in (None, ""):
             return v
     return None
 def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
    # Exercise.capabilities erwartet Dict[str,int]
    caps_list = raw.get("capabilities", [])
-    capabilities = {}
+    capabilities: Dict[str, int] = {}
    for c in caps_list:
        cap = c.get("capability")
        lvl = c.get("level")
@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
            except Exception:
                pass
-    # Defaults/Fallbacks
+    # Defaults/Fallbacks via Synonyme
-    try:
+    # summary / execution
-        duration = float(raw.get("Dauer", 0) or 0)
+    summary = _get_first(raw, KEYS_SUMMARY) or ""
-    except Exception:
+    execution = _get_first(raw, KEYS_EXECUTION) or ""
        duration = 0.0
-    keywords = []
+    # duration
-    kw_raw = raw.get("Schlüsselworte", "")
+    duration = _get_first(raw, KEYS_DURATION)
    try:
        duration_f = float(duration or 0)
    except Exception:
        duration_f = 0.0
    # keywords
    kw_raw = _get_first(raw, KEYS_KEYWORDS)
    keywords: List[str] = []
    if isinstance(kw_raw, str):
        keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]
-    equipment = []
+    # equipment
-    eq_raw = raw.get("equipment", [])
+    eq_raw = _get_first(raw, KEYS_EQUIPMENT)
    equipment: List[str] = []
    if isinstance(eq_raw, str):
        equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
    elif isinstance(eq_raw, list):
        equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
-    notes = raw.get("Hinweise", "") or ""
+    notes = _get_first(raw, KEYS_NOTES) or ""
    if mutate:
-        notes = (notes + " [auto-update]").strip()
+        notes = (str(notes) + " [auto-update]").strip()
    discipline   = _get_first(raw, KEYS_DISCIPLINE) or ""
    group        = _get_first(raw, KEYS_GROUP) or None
    age_group    = _get_first(raw, KEYS_AGE_GROUP) or ""
    target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
    purpose      = _get_first(raw, KEYS_PURPOSE) or ""
    preparation  = _get_first(raw, KEYS_PREPARATION) or ""
    method       = _get_first(raw, KEYS_METHOD) or ""
    payload: Dict[str, Any] = {
        "title": raw.get("title") or "",
-        "summary": raw.get("Summary", "") or "",
+        "summary": str(summary) or "",
-        "short_description": raw.get("Summary", "") or "",
+        "short_description": str(summary) or "",
        "keywords": keywords,
        "link": fullurl or "",
-        "discipline": raw.get("Übungstyp", "") or "",
+        "discipline": str(discipline) or "",
-        "group": raw.get("Gruppengröße", "") or None,
+        "group": str(group) if group else None,
-        "age_group": raw.get("Altersgruppe", "") or "",
+        "age_group": str(age_group) or "",
-        "target_group": raw.get("Zielgruppe", "") or "",
+        "target_group": str(target_group) or "",
        "min_participants": 1,
-        "duration_minutes": int(round(duration)),
+        "duration_minutes": int(round(duration_f)),
        "capabilities": capabilities,
        "category": category or "",
-        "purpose": raw.get("Ziel", "") or "",
+        "purpose": str(purpose) or "",
-        "execution": raw.get("Durchführung", "") or "",
+        "execution": str(execution) or "",
-        "notes": notes,
+        "notes": str(notes) or "",
-        "preparation": raw.get("RefMethode", "") or "",
+        "preparation": str(preparation) or "",
-        "method": raw.get("method", "") or "",
+        "method": str(method) or "",
        "equipment": equipment,
        "fullurl": fullurl or "",
        "external_id": f"mw:{raw.get('pageid')}",
@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
    payload["fingerprint"] = compute_fingerprint(payload)
    return payload
 # ---- Lookup/Upsert ----
 def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
    url = f"{EXERCISE_API}/by-external-id"
@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
    }
 def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
     """Print which of the hash-relevant fields differ between two payloads.

     Text fields are canonicalized and keywords are trimmed, de-duplicated and
     sorted case-insensitively before the comparison. Changed fields are
     printed as JSON ``[before, after]`` pairs.
     """
     keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]

     def _normalized(picked: Dict[str, Any]) -> Dict[str, Any]:
         # Bring the picked fields into a comparable, readable form.
         keyword_set = {(k or "").strip() for k in (picked.get("keywords") or [])}
         return {
             "title": _canon_title(picked.get("title")),
             "summary": _norm_text(picked.get("summary")),
             "execution": _norm_text(picked.get("execution")),
             "notes": _norm_text(picked.get("notes")),
             "duration_minutes": picked.get("duration_minutes"),
             "capabilities": picked.get("capabilities"),
             "keywords": sorted(keyword_set, key=str.casefold),
         }

     # Read both payloads before normalizing either one.
     picked_before = {k: before.get(k) for k in keys}
     picked_after = {k: after.get(k) for k in keys}
     old = _normalized(picked_before)
     new = _normalized(picked_after)

     diff = {k: (old[k], new[k]) for k in keys if old[k] != new[k]}
     if not diff:
         print("[Diff] (none in hash fields)")
         return
     print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
 def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
    title = payload.get("title", "<ohne Titel>")
    ext_id = payload.get("external_id")
@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
            action, reason = "create", "unexpected lookup type"
    if dry_run:
-        print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}")
+        print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}")
        if action == "update":
            _print_diff(found_payload, payload)
        return action
@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
                pass
        else:
            resp.raise_for_status()
-            print(f"[Update] '{title}' – {reason} -> OK")
+            print(f"[Update] '{title}' - {reason} -> OK")
            _print_diff(found_payload, payload)
    else:
-        print(f"[Skip]   '{title}' – {reason}")
+        print(f"[Skip]   '{title}' - {reason}")
    return action
 def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
     """Small field diff over the hash fields (diagnostics).

     Prints the fields from `keys` whose normalized values differ between
     *before* and *after* as JSON, or a note that none of them differ.
     """
     keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]
     b = {k: before.get(k) for k in keys}
     a = {k: after.get(k)  for k in keys}
     # For better readability the text fields are normalized before comparing.
     b_norm = {
         "title": _canon_title(b.get("title")),
         "summary": _norm_text(b.get("summary")),
         "execution": _norm_text(b.get("execution")),
         "notes": _norm_text(b.get("notes")),
         "duration_minutes": b.get("duration_minutes"),
         "capabilities": b.get("capabilities"),
         # keywords: trimmed, de-duplicated, sorted case-insensitively
         "keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold),
     }
     a_norm = {
         "title": _canon_title(a.get("title")),
         "summary": _norm_text(a.get("summary")),
         "execution": _norm_text(a.get("execution")),
         "notes": _norm_text(a.get("notes")),
         "duration_minutes": a.get("duration_minutes"),
         "capabilities": a.get("capabilities"),
         "keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold),
     }
     # Keep only the fields whose normalized before/after values differ.
     diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
     if diff:
         print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
     else:
         print("[Diff] (none in hash fields)")
 # ----- Orchestrierung -----
 def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: