scripts/wiki_importer.py aktualisiert

2025-08-11 15:40:41 +02:00 · 2025-08-11 15:40:41 +02:00 · 7b383f0778
commit 7b383f0778
parent 34320b46d9
1 changed file with 141 additions and 218 deletions
--- a/scripts/wiki_importer.py
+++ b/scripts/wiki_importer.py
@@ -1,30 +1,36 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-wiki_importer.py – v2.3.7
+wiki_importer.py – v2.3.8

-Fixes ggü. v2.3.6:
- **Keywords/EQUIPMENT/DISCIPLINE u.a. wurden teils nicht erkannt**: Bugfix in `_get_first()` –
-  Kandidatenschlüssel werden jetzt ebenfalls normalisiert (`_norm_key(c)`), damit
-  `Schlüsselworte` (aus dem Wiki) zuverlässig matcht.
- `_get_first_fuzzy()` normalisiert die Such-Tokens.
- Kleine Bugfixes/Polish: `action.upper()` im Dry-Run, sanftere Keywords-Splittung.
+Ziel dieses Patches: Die Felder `discipline`, `execution`, `keywords`, `equipment`, `duration_minutes` usw.
+kommen bei dir teilweise leer an. Ursache sind zu aggressive Normalisierungen/Matcher.

-Hinweis: Keine API-/CLI-Änderungen. Parser unterstützt weiterhin `{{Hilfsmittel}}`.
+Fix (konservativ & robust):
+- Parser liest jetzt **gezielt** die bekannten Templates **ohne** Over-Normalisierung:
+  • `{{ÜbungInfoBox}}` / `{{UebungInfoBox}}`
+  • `{{Übungsbeschreibung}}` / `{{Uebungsbeschreibung}}`
+  • `{{Hilfsmittel}}`
+  • `{{SkillDevelopment}}`
+- Feld-Extraktion nutzt **zuerst die exakten Wiki-Parameternamen** (deutsch/mit Umlauten),
+  erst danach schmale Synonym-Fallbacks. Das stellt sicher, dass z. B. `Schlüsselworte=`
+  wirklich in `keywords` landet.
+- `imported_at` wird bei **Create und Update** gesetzt.
+- Optionales Debugging: `--debug-raw` druckt die gefundenen Raw-Keys (einfach, nachvollziehbar).
+
+Bestehende API-Endpunkte bleiben unverändert.
 """

 import os
 import sys
 import argparse
 from typing import Dict, Any, Tuple, Optional, List
-from collections.abc import Mapping
 import requests
 import mwparserfromhell
 from dotenv import load_dotenv
 import hashlib
 import json
 import time
-import unicodedata

 # ----- Konfiguration / Defaults -----
 load_dotenv()
@@ -34,52 +40,14 @@ DEFAULT_CAT     = os.getenv("WIKI_CATEGORY", "Übungen")
 DEFAULT_TITLE   = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
 REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))

-# ---- Unicode-/Key-Normalisierung ----
-
-def _norm_unicode(s: str) -> str:
-    return unicodedata.normalize("NFKC", s)
-
-def _strip_diacritics(s: str) -> str:
-    return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
-
-def _norm_key(s: str) -> str:
-    s = _norm_unicode(s or "")
-    s = _strip_diacritics(s)
-    s = s.strip().casefold()
-    return s
-
-def _norm_tpl(s: str) -> str:
-    s = _norm_key(s)
-    return "".join(ch for ch in s if ch.isalnum())
-
-# Template-Aliasse (normalisierte Namen, _norm_tpl!)
-TPL_UEBUNG_INFOBOX      = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"}
-TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"}
-TPL_SKILLDEV            = {"skilldevelopment"}
-TPL_HILFSMITTEL         = {"hilfsmittel"}
-
-# Synonyme (werden im Code nochmals normalisiert)
-KEYS_SUMMARY       = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
-KEYS_EXECUTION     = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"]
-KEYS_DURATION      = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
-KEYS_KEYWORDS      = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags", "schluesselwort", "schlüsselwort"]
-KEYS_EQUIPMENT     = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"]
-KEYS_DISCIPLINE    = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"]
-KEYS_GROUP         = ["gruppengröße", "gruppengroesse", "group"]
-KEYS_AGE_GROUP     = ["altersgruppe"]
-KEYS_TARGET_GROUP  = ["zielgruppe", "target_group"]
-KEYS_PURPOSE       = ["ziel", "zweck", "purpose"]
-KEYS_PREPARATION   = ["refmethode", "vorbereitung", "preparation"]
-KEYS_METHOD        = ["method", "methode"]
-KEYS_NOTES         = ["hinweise", "notes"]
-
-# ---- Wiki-Router Helpers ----
+# ----- Helpers für Wiki-Router -----

 def wiki_health() -> None:
    r = requests.get(f"{API_BASE_URL}/health", timeout=15)
    r.raise_for_status()
    print("[Sanity] Wiki health OK")

+
 def wiki_login(username: str, password: str) -> None:
    payload = {"username": username, "password": password}
    r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
@@ -94,38 +62,48 @@ def wiki_login(username: str, password: str) -> None:
        raise RuntimeError(f"[Login] {msg}")
    print("[Login] success")

+
 def fetch_all_pages(category: str) -> Dict[str, Any]:
    resp = requests.get(f"{API_BASE_URL}/semantic/pages", params={"category": category}, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.json()

+
 def fetch_page_info(title: str) -> Dict[str, Any]:
    r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    r.raise_for_status()
    info = r.json()
    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}

-# ---- Parser ----
+# ----- Parser (konservativ) -----
+
+T_INFOS    = {"ÜbungInfoBox", "UebungInfoBox"}
+T_BESCHR   = {"Übungsbeschreibung", "Uebungsbeschreibung"}
+T_HILFS    = {"Hilfsmittel"}
+T_SKILL    = {"SkillDevelopment"}
+

 def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    print(f"[Parse] Lade '{title}' (ID={pageid})")
    resp = requests.get(
        f"{API_BASE_URL}/parsepage",
        params={"pageid": pageid, "title": title},
-        timeout=REQUEST_TIMEOUT
+        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)

-    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
+    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid, "wikitext": wikitext}

    for tpl in wikicode.filter_templates():
-        name_norm = _norm_tpl(str(tpl.name))
-        if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG:
+        name = str(tpl.name).strip()
+        if name in T_INFOS or name in T_BESCHR or name in T_HILFS:
            for p in tpl.params:
-                raw[str(p.name).strip()] = str(p.value).strip()
-        elif name_norm in TPL_SKILLDEV:
+                key = str(p.name).strip()
+                val = str(p.value).strip()
+                raw[key] = val
+        elif name in T_SKILL:
            raw.setdefault("capabilities", [])
            def _getp(t, k):
                try:
@@ -140,14 +118,10 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
                lvl_i = 0
            if cap:
                raw["capabilities"].append({"capability": cap, "level": lvl_i})
-        elif name_norm in TPL_HILFSMITTEL:
-            for p in tpl.params:
-                raw[str(p.name).strip()] = str(p.value).strip()

-    raw["wikitext"] = wikitext
    return raw

-# ---- Fingerprint (stabil) ----
+# ----- Fingerprint (stabil, wie zuvor) -----

 def _normalize(v: Any) -> str:
    if v is None:
@@ -158,21 +132,22 @@ def _normalize(v: Any) -> str:
        return json.dumps(v, sort_keys=True, ensure_ascii=False)
    return str(v).strip()

+
 def _norm_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s).replace("\u00a0", " ")
-    s = s.strip()
    s = " ".join(s.split())
-    return s
+    return s.strip()
+

 def _canon_title(t: str) -> str:
    t = (t or "").strip().replace("_", " ")
    return t.replace("–", "-").replace("—", "-")

+
 def compute_fingerprint(payload: Dict[str, Any]) -> str:
    kws = payload.get("keywords") or []
-    # Strichvarianten normalisieren
    kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
    kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
    dur = payload.get("duration_minutes") or 0
@@ -192,111 +167,96 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
    base = "|".join(_normalize(f) for f in fields)
    return hashlib.sha256(base.encode("utf-8")).hexdigest()

-# ---- Feldauflösung (Synonyme + Fuzzy) ----
+# ----- Payload (exakte DE-Keys zuerst, dann schmale Fallbacks) -----

-def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
-    return {_norm_key(k): v for k, v in d.items() if isinstance(k, str)}
+EXACT_KEYS = {
+    "summary": ["Summary", "Kurzbeschreibung"],
+    "execution": ["Durchführung", "Durchfuehrung", "Ablauf"],
+    "duration": ["Dauer", "Zeit"],
+    "keywords": ["Schlüsselworte", "Schlüsselwörter", "Schluesselworte", "Schluesselwoerter", "Keywords", "Tags"],
+    "equipment_prim": ["Hilfsmittel"],
+    "equipment_alt": ["Geräte", "Geraete", "Gerät", "Geraet", "Material"],
+    "discipline": ["Übungstyp", "Uebungstyp", "Disziplin"],
+    "group": ["Gruppengröße", "Gruppengroesse", "Group"],
+    "age_group": ["Altersgruppe"],
+    "target_group": ["Zielgruppe"],
+    "purpose": ["Ziel", "Zweck"],
+    "notes": ["Hinweise", "Notes"],
+    "preparation": ["Vorbereitung", "RefMethode"],
+    "method": ["Methode", "Method"],
+}

-def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
-    m = _norm_keymap(d)
-    for c in candidates:
-        v = m.get(_norm_key(c))  # << Bugfix: Kandidaten ebenfalls normalisieren
-        if v not in (None, ""):
-            return v
+
+def _first_any(raw: Dict[str, Any], keys: List[str]) -> Optional[str]:
+    for k in keys:
+        v = raw.get(k)
+        if isinstance(v, str) and v.strip():
+            return v.strip()
    return None

-def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any:
-    m = _norm_keymap(d)
-    toks = [_norm_key(t) for t in tokens]
-    for k, v in m.items():
-        if v in (None, ""):
-            continue
-        if all(t in k for t in toks):
-            return v
-    return None
-
-# ---- Payload ----

 def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
    # Capabilities -> Dict[str,int]
-    caps_list = raw.get("capabilities", [])
    capabilities: Dict[str, int] = {}
-    for c in caps_list:
-        cap = c.get("capability")
-        lvl = c.get("level")
+    for c in raw.get("capabilities", []) or []:
+        cap = c.get("capability"); lvl = c.get("level")
        if isinstance(cap, str) and cap:
            try:
                capabilities[cap] = int(lvl)
            except Exception:
                pass

-    # summary / execution
-    summary = _get_first(raw, KEYS_SUMMARY) or ""
-    execution = _get_first(raw, KEYS_EXECUTION)
-    if execution in (None, ""):
-        execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf"]) or ""
+    # Exakte Schlüssel zuerst
+    summary   = _first_any(raw, EXACT_KEYS["summary"])   or ""
+    execution = _first_any(raw, EXACT_KEYS["execution"]) or ""
+    duration  = _first_any(raw, EXACT_KEYS["duration"])  or "0"
+
+    kw_raw = _first_any(raw, EXACT_KEYS["keywords"]) or ""
+    if kw_raw:
+        parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
+        keywords = [p for p in parts if p]
+    else:
+        keywords = []
+
+    eq_raw = _first_any(raw, EXACT_KEYS["equipment_prim"]) or _first_any(raw, EXACT_KEYS["equipment_alt"]) or ""
+    if eq_raw:
+        equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
+    else:
+        equipment = []
+
+    notes        = _first_any(raw, EXACT_KEYS["notes"])        or ""
+    discipline   = _first_any(raw, EXACT_KEYS["discipline"])   or ""
+    group        = _first_any(raw, EXACT_KEYS["group"])        or None
+    age_group    = _first_any(raw, EXACT_KEYS["age_group"])    or ""
+    target_group = _first_any(raw, EXACT_KEYS["target_group"]) or ""
+    purpose      = _first_any(raw, EXACT_KEYS["purpose"])      or ""
+    preparation  = _first_any(raw, EXACT_KEYS["preparation"])  or ""
+    method       = _first_any(raw, EXACT_KEYS["method"])       or ""

-    # duration
-    duration = _get_first(raw, KEYS_DURATION)
    try:
        duration_f = float(duration or 0)
    except Exception:
        duration_f = 0.0

-    # keywords
-    kw_raw = _get_first(raw, KEYS_KEYWORDS)
-    if kw_raw in (None, ""):
-        kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or ""
-    keywords: List[str] = []
-    if isinstance(kw_raw, str):
-        # robuste Auftrennung; ignoriert doppelte Kommas/Zeilenumbrüche
-        parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
-        keywords = [p for p in parts if p]
-
-    # equipment
-    eq_raw = _get_first(raw, KEYS_EQUIPMENT)
-    if eq_raw in (None, ""):
-        eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or ""
-    equipment: List[str] = []
-    if isinstance(eq_raw, str):
-        equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
-    elif isinstance(eq_raw, list):
-        equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
-
-    notes = _get_first(raw, KEYS_NOTES) or ""
-    if mutate:
-        notes = (str(notes) + " [auto-update]").strip()
-
-    discipline   = _get_first(raw, KEYS_DISCIPLINE) or ""
-    if discipline in (None, ""):
-        discipline = _get_first_fuzzy(raw, ["ubung", "typ"]) or _get_first_fuzzy(raw, ["schwerpunkt"]) or ""
-
-    group        = _get_first(raw, KEYS_GROUP) or None
-    age_group    = _get_first(raw, KEYS_AGE_GROUP) or ""
-    target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
-    purpose      = _get_first(raw, KEYS_PURPOSE) or ""
-    preparation  = _get_first(raw, KEYS_PREPARATION) or ""
-    method       = _get_first(raw, KEYS_METHOD) or ""
-
    payload: Dict[str, Any] = {
        "title": raw.get("title") or "",
-        "summary": str(summary) or "",
-        "short_description": str(summary) or "",
+        "summary": summary,
+        "short_description": summary,
        "keywords": keywords,
        "link": fullurl or "",
-        "discipline": str(discipline) or "",
-        "group": str(group) if group else None,
-        "age_group": str(age_group) or "",
-        "target_group": str(target_group) or "",
+        "discipline": discipline,
+        "group": group,
+        "age_group": age_group,
+        "target_group": target_group,
        "min_participants": 1,
        "duration_minutes": int(round(duration_f)),
        "capabilities": capabilities,
        "category": category or "",
-        "purpose": str(purpose) or "",
-        "execution": str(execution) or "",
-        "notes": str(notes) or "",
-        "preparation": str(preparation) or "",
-        "method": str(method) or "",
+        "purpose": purpose,
+        "execution": execution,
+        "notes": (notes + (" [auto-update]" if mutate else "")).strip(),
+        "preparation": preparation,
+        "method": method,
        "equipment": equipment,
        "fullurl": fullurl or "",
        "external_id": f"mw:{raw.get('pageid')}",
@@ -305,7 +265,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
    payload["fingerprint"] = compute_fingerprint(payload)
    return payload

-# ---- Lookup/Upsert ----
+# ----- Lookup/Upsert -----

 def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
    url = f"{EXERCISE_API}/by-external-id"
@@ -375,9 +335,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:

    found, status = lookup_by_external_id(ext_id)

-    action = "create"
-    reason = "not found (lookup 404)"
-    found_payload = {}
+    action = "create"; reason = "not found (lookup 404)"; found_payload = {}

    if not (status == 404 or found is None):
        if isinstance(found, dict):
@@ -398,112 +356,78 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
        return action

    if action == "create":
-        payload2 = dict(payload)
-        payload2["imported_at"] = _now_iso()
-        resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
+        body = dict(payload); body["imported_at"] = _now_iso()
+        resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 422:
            print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
-            try:
-                resp.raise_for_status()
-            except Exception:
-                pass
+            try: resp.raise_for_status()
+            except Exception: pass
        else:
-            resp.raise_for_status()
-            print(f"[Create] '{title}' – {reason} -> OK")
+            resp.raise_for_status(); print(f"[Create] '{title}' – {reason} -> OK")
    elif action == "update":
-        payload2 = dict(payload)
-        payload2["imported_at"] = _now_iso()
-        resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
+        body = dict(payload); body["imported_at"] = _now_iso()
+        resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 422:
            print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")
-            try:
-                resp.raise_for_status()
-            except Exception:
-                pass
+            try: resp.raise_for_status()
+            except Exception: pass
        else:
-            resp.raise_for_status()
-            print(f"[Update] '{title}' – {reason} -> OK")
-            _print_diff(found_payload, payload)
+            resp.raise_for_status(); print(f"[Update] '{title}' – {reason} -> OK"); _print_diff(found_payload, payload)
    else:
        print(f"[Skip]   '{title}' – {reason}")
    return action

 # ----- Orchestrierung -----

-def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str:
+def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False, debug_raw: bool = False) -> str:
    info = fetch_page_info(title)
-    pid = info.get("pageid")
-    fullurl = info.get("fullurl") or ""
+    pid = info.get("pageid"); fullurl = info.get("fullurl") or ""
    if not pid:
-        print(f"[Error] pageid für '{title}' nicht gefunden.", file=sys.stderr)
-        return "failed"
+        print(f"[Error] pageid für '{title}' nicht gefunden.", file=sys.stderr); return "failed"
    raw = parse_exercise(title, pid)
+    if debug_raw:
+        print("[Debug] Raw-Keys:", sorted([k for k in raw.keys() if k not in {"wikitext"}]))
    payload = build_payload(raw, fullurl, category, mutate=mutate)
    return upsert_exercise(payload, dry_run=dry_run)


-def process_all(category: str, *, dry_run: bool = False) -> Dict[str, int]:
+def process_all(category: str, *, dry_run: bool = False, debug_raw: bool = False) -> Dict[str, int]:
    stats = {"created": 0, "updated": 0, "skipped": 0, "failed": 0}
    print(f"[Main] Lade Liste der Übungen aus Kategorie '{category}'…")
    pages = fetch_all_pages(category)
    print(f"[Main] {len(pages)} Seiten gefunden.")

-    for title, entry in pages.items():
+    for idx, (title, entry) in enumerate(pages.items(), 1):
        try:
            getter = getattr(entry, "get", None)
-            if callable(getter):
-                pid = getter("pageid")
-                fullurl = getter("fullurl")
-            else:
-                pid = None
-                fullurl = None
-
+            pid = getter("pageid") if callable(getter) else None
+            fullurl = getter("fullurl") if callable(getter) else None
            if not pid:
-                info = fetch_page_info(title)
-                pid = info.get("pageid")
-                fullurl = fullurl or info.get("fullurl")
+                info = fetch_page_info(title); pid = info.get("pageid"); fullurl = fullurl or info.get("fullurl")
            if not pid:
-                print(f"[Skip] '{title}' hat keine pageid")
-                stats["failed"] += 1
-                continue
+                print(f"[Skip] '{title}' hat keine pageid"); stats["failed"] += 1; continue
            raw = parse_exercise(title, pid)
+            if debug_raw and idx <= 5:
+                print(f"[Debug] #{idx} '{title}' Raw-Keys:", sorted([k for k in raw.keys() if k not in {"wikitext"}]))
            payload = build_payload(raw, fullurl or "", category)
            act = upsert_exercise(payload, dry_run=dry_run)
-            if act == "create":
-                stats["created"] += 1
-            elif act == "update":
-                stats["updated"] += 1
-            elif act == "skip":
-                stats["skipped"] += 1
+            stats["created" if act=="create" else "updated" if act=="update" else "skipped"] += 1
        except requests.HTTPError as e:
            code = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
            if code == 404:
-                print(f"[Skip] '{title}': page not found (404)")
-                stats["failed"] += 1
+                print(f"[Skip] '{title}': page not found (404)"); stats["failed"] += 1
            else:
-                print(f"[Error] '{title}': {e}")
-                stats["failed"] += 1
+                print(f"[Error] '{title}': {e}"); stats["failed"] += 1
        except Exception as e:
-            print(f"[Error] '{title}': {e}")
-            stats["failed"] += 1
+            print(f"[Error] '{title}': {e}"); stats["failed"] += 1
    return stats


-def run_smoke_test(title: str, category: str) -> None:
-    print("\n[SmokeTest] Lauf 1/3: CREATE (Erstimport)")
-    act1 = process_one(title, category, mutate=False)
-    print("[SmokeTest] Aktion:", act1)
-
-    print("\n[SmokeTest] Lauf 2/3: SKIP (Wiederholung, unverändert)")
-    act2 = process_one(title, category, mutate=False)
-    print("[SmokeTest] Aktion:", act2)
-
-    print("\n[SmokeTest] Lauf 3/3: UPDATE (simulierte Wiki-Änderung an 'notes')")
-    act3 = process_one(title, category, mutate=True)
-    print("[SmokeTest] Aktion:", act3)
-
-    print("\n[SmokeTest] Zusammenfassung:")
-    print(json.dumps({"run1": act1, "run2": act2, "run3": act3}, ensure_ascii=False, indent=2))
+def run_smoke_test(title: str, category: str, *, debug_raw: bool = False) -> None:
+    print("\n[SmokeTest] Lauf 1/3: CREATE (Erstimport)"); act1 = process_one(title, category, mutate=False, debug_raw=debug_raw); print("[SmokeTest] Aktion:", act1)
+    print("\n[SmokeTest] Lauf 2/3: SKIP (Wiederholung, unverändert)"); act2 = process_one(title, category, mutate=False, debug_raw=debug_raw); print("[SmokeTest] Aktion:", act2)
+    print("\n[SmokeTest] Lauf 3/3: UPDATE (simulierte Wiki-Änderung an 'notes')"); act3 = process_one(title, category, mutate=True, debug_raw=debug_raw); print("[SmokeTest] Aktion:", act3)
+    print("\n[SmokeTest] Zusammenfassung:"); print(json.dumps({"run1": act1, "run2": act2, "run3": act3}, ensure_ascii=False, indent=2))

 # ----- Main -----

@@ -517,6 +441,7 @@ def main() -> None:
    parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
    parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen")
    parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title")
+    parser.add_argument("--debug-raw", action="store_true", help="Zeigt die aus dem Wiki gelesenen Roh-Keys je Seite")
    args = parser.parse_args()

    wiki_health()
@@ -528,19 +453,17 @@ def main() -> None:
        try:
            wiki_login(args.username, args.password)
        except Exception as e:
-            print(str(e), file=sys.stderr)
-            sys.exit(1)
+            print(str(e), file=sys.stderr); sys.exit(1)

    if args.smoke_test:
-        run_smoke_test(args.title, args.category)
-        return
+        run_smoke_test(args.title, args.category, debug_raw=args.debug_raw); return

    if args.all:
-        stats = process_all(args.category, dry_run=args.dry_run)
+        stats = process_all(args.category, dry_run=args.dry_run, debug_raw=args.debug_raw)
        print("\n[Stats] created={created} updated={updated} skipped={skipped} failed={failed}".format(**stats))
    else:
        print(f"[Main] Import single exercise: {args.title}")
-        result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run)
+        result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run, debug_raw=args.debug_raw)
        print(f"[Result] {result}")

 if __name__ == "__main__":