diff --git a/scripts/wiki_importer.py b/scripts/wiki_importer.py index 3203a03..a235a92 100644 --- a/scripts/wiki_importer.py +++ b/scripts/wiki_importer.py @@ -1,21 +1,20 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -Module: wiki_importer.py -Beschreibung: -- Importiert Übungen aus dem MediaWiki via FastAPI wiki_router -- Login gegen /import/wiki/login (abschaltbar via --skip-login) -- Titel-Liste via /semantic/pages, Parsing via /parsepage, Info via /info (nur wenn nötig) -- Idempotentes Upsert: external_id="mw:{pageid}", Fingerprint (sha256) über Kernfelder -- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern -- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update) +wiki_importer.py – v2.3.5 -v2.3.4 – Änderungen ggü. 2.3.3: -- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen - (z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox"). -- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über - mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf). -- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates. +Fix: Einige Felder (discipline, execution, keywords, equipment) wurden in einzelnen Seiten +nicht mehr gefüllt. Ursache: Template-/Key-Varianten (z. B. "Übung Infobox" mit Leerzeichen, +"Geräte/Material", "Schlüsselwörter", "Hilfsmittel", "Ablauf" usw.) wurden vom Matcher +nicht immer erkannt. + +Änderungen ggü. v2.3.4: +- Separater Normalizer für Template-Namen (entfernt Nicht‑Alphanumerika), dadurch matchen auch + Varianten wie "Übung Infobox", "Uebung-Infobox" etc. +- Erweitertes Synonym-Set für Felder: execution/keywords/equipment/discipline u. a. +- Fuzzy‑Fallback: Wenn _get_first() nichts findet, suche Keys, die die Token enthalten + (z. B. "gerate/material" ⇒ equipment). +- Keine API-/CLI-Änderungen. """ import os @@ -32,12 +31,11 @@ import time import unicodedata # ----- Konfiguration / Defaults ----- -load_dotenv() # .env laden, falls vorhanden - -API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") # FastAPI-Wiki-Proxy -EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint -DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen") -DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") +load_dotenv() +API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") +EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") +DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen") +DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60")) # ---- Unicode-/Key-Normalisierung ---- @@ -49,23 +47,30 @@ def _strip_diacritics(s: str) -> str: return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch)) def _norm_key(s: str) -> str: + # Für Parameter-Namen: diakritikfrei + getrimmt + casefold; Sonderzeichen bleiben erhalten, + # damit Kombinationen wie "gerate/material" unterscheidbar sind s = _norm_unicode(s or "") s = _strip_diacritics(s) s = s.strip().casefold() return s -# Template-Aliasse (normalisierte Namen) -TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"} -TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"} -TPL_SKILLDEV = {"skilldevelopment"} +def _norm_tpl(s: str) -> str: + # Für Template-Namen: zusätzlich alle Nicht‑Alphanumerika entfernen + s = _norm_key(s) + return "".join(ch for ch in s if ch.isalnum()) -# Synonyme für Parameter (normalisierte Keys) +# Template-Aliasse (normalisierte Namen, _norm_tpl!) +TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"} +TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"} +TPL_SKILLDEV = {"skilldevelopment"} + +# Synonyme für Parameter (normalisierte Keys via _norm_key) KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"] -KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"] +KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"] KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"] -KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "keywords", "tags"] -KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "material"] -KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline"] +KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags"] +KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"] +KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"] KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"] KEYS_AGE_GROUP = ["altersgruppe"] KEYS_TARGET_GROUP = ["zielgruppe", "target_group"] @@ -74,14 +79,13 @@ KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"] KEYS_METHOD = ["method", "methode"] KEYS_NOTES = ["hinweise", "notes"] -# ---- Hilfsfunktionen ---- +# ---- Wiki-Router Helpers ---- def wiki_health() -> None: r = requests.get(f"{API_BASE_URL}/health", timeout=15) r.raise_for_status() print("[Sanity] Wiki health OK") - def wiki_login(username: str, password: str) -> None: payload = {"username": username, "password": password} r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30) @@ -96,19 +100,18 @@ def wiki_login(username: str, password: str) -> None: raise RuntimeError(f"[Login] {msg}") print("[Login] success") - def fetch_all_pages(category: str) -> Dict[str, Any]: resp = requests.get(f"{API_BASE_URL}/semantic/pages", params={"category": category}, timeout=REQUEST_TIMEOUT) resp.raise_for_status() return resp.json() - def fetch_page_info(title: str) -> Dict[str, Any]: r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30) r.raise_for_status() info = r.json() return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")} +# ---- Parser ---- def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: print(f"[Parse] Lade '{title}' (ID={pageid})") @@ -123,19 +126,14 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} - # Templates sammeln (robust gegen Varianten) for tpl in wikicode.filter_templates(): - name_raw = str(tpl.name) - name_norm = _norm_key(name_raw) - if name_norm in TPL_UEBUNG_INFOBOX: - for p in tpl.params: - raw[str(p.name).strip()] = str(p.value).strip() - elif name_norm in TPL_UEBUNGSBESCHREIBUNG: + name_norm = _norm_tpl(str(tpl.name)) + if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG: for p in tpl.params: + # Original-Parameternamen übernehmen; Normalisierung passiert später raw[str(p.name).strip()] = str(p.value).strip() elif name_norm in TPL_SKILLDEV: raw.setdefault("capabilities", []) - # Standard-Keys (engl. Template) def _getp(t, k): try: return str(t.get(k).value).strip() @@ -149,10 +147,11 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: lvl_i = 0 if cap: raw["capabilities"].append({"capability": cap, "level": lvl_i}) + raw["wikitext"] = wikitext return raw -# ---- Fingerprint-Unterstützung (stabil) ---- +# ---- Fingerprint (stabil) ---- def _normalize(v: Any) -> str: if v is None: @@ -163,21 +162,18 @@ def _normalize(v: Any) -> str: return json.dumps(v, sort_keys=True, ensure_ascii=False) return str(v).strip() - def _norm_text(s: str) -> str: if s is None: return "" - s = str(s).replace("\u00a0", " ") # NBSP → Space + s = str(s).replace("\u00a0", " ") s = s.strip() - s = " ".join(s.split()) # Collapse whitespace + s = " ".join(s.split()) return s - def _canon_title(t: str) -> str: t = (t or "").strip().replace("_", " ") return t.replace("–", "-").replace("—", "-") - def compute_fingerprint(payload: Dict[str, Any]) -> str: kws = payload.get("keywords") or [] kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) @@ -198,11 +194,10 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str: base = "|".join(_normalize(f) for f in fields) return hashlib.sha256(base.encode("utf-8")).hexdigest() -# ---- Feldauflösung (Synonyme) ---- +# ---- Feldauflösung (Synonyme + Fuzzy) ---- def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]: - return { _norm_key(k): v for k, v in d.items() if isinstance(k, str) } - + return {_norm_key(k): v for k, v in d.items() if isinstance(k, str)} def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any: m = _norm_keymap(d) @@ -212,9 +207,20 @@ def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any: return v return None +def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any: + # Finde einen Key, der alle Tokens (als Teilstring) enthält + m = _norm_keymap(d) + for k, v in m.items(): + if v in (None, ""): + continue + if all(t in k for t in tokens): + return v + return None + +# ---- Payload ---- def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]: - # Exercise.capabilities erwartet Dict[str,int] + # Capabilities -> Dict[str,int] caps_list = raw.get("capabilities", []) capabilities: Dict[str, int] = {} for c in caps_list: @@ -226,10 +232,12 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b except Exception: pass - # Defaults/Fallbacks via Synonyme # summary / execution summary = _get_first(raw, KEYS_SUMMARY) or "" - execution = _get_first(raw, KEYS_EXECUTION) or "" + execution = _get_first(raw, KEYS_EXECUTION) + if execution in (None, ""): + # Fuzzy: Key enthält z. B. "ablauf" oder "durchfuehrung" + execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf",]) or "" # duration duration = _get_first(raw, KEYS_DURATION) @@ -240,12 +248,16 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b # keywords kw_raw = _get_first(raw, KEYS_KEYWORDS) + if kw_raw in (None, ""): + kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or "" keywords: List[str] = [] if isinstance(kw_raw, str): keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] # equipment eq_raw = _get_first(raw, KEYS_EQUIPMENT) + if eq_raw in (None, ""): + eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or "" equipment: List[str] = [] if isinstance(eq_raw, str): equipment = [e.strip() for e in eq_raw.split(",") if e.strip()] @@ -257,6 +269,9 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b notes = (str(notes) + " [auto-update]").strip() discipline = _get_first(raw, KEYS_DISCIPLINE) or "" + if discipline in (None, ""): + discipline = _get_first_fuzzy(raw, ["ubung", "typ"]) or _get_first_fuzzy(raw, ["schwerpunkt"]) or "" + group = _get_first(raw, KEYS_GROUP) or None age_group = _get_first(raw, KEYS_AGE_GROUP) or "" target_group = _get_first(raw, KEYS_TARGET_GROUP) or "" @@ -291,7 +306,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b payload["fingerprint"] = compute_fingerprint(payload) return payload -# ---- Lookup/Upsert ---- +# ---- Lookup/Upsert (wie v2.3.4) ---- def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: url = f"{EXERCISE_API}/by-external-id" @@ -374,7 +389,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: action, reason = "create", "unexpected lookup type" if dry_run: - print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}") + print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}") if action == "update": _print_diff(found_payload, payload) return action @@ -402,10 +417,10 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: pass else: resp.raise_for_status() - print(f"[Update] '{title}' - {reason} -> OK") + print(f"[Update] '{title}' – {reason} -> OK") _print_diff(found_payload, payload) else: - print(f"[Skip] '{title}' - {reason}") + print(f"[Skip] '{title}' – {reason}") return action # ----- Orchestrierung ----- @@ -485,7 +500,6 @@ def run_smoke_test(title: str, category: str) -> None: print("\n[SmokeTest] Zusammenfassung:") print(json.dumps({"run1": act1, "run2": act2, "run3": act3}, ensure_ascii=False, indent=2)) - # ----- Main ----- def main() -> None: @@ -500,10 +514,8 @@ def main() -> None: parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title") args = parser.parse_args() - # Sanity wiki_health() - # Login (sofern nicht explizit übersprungen) if not args.skip_login: if not args.username or not args.password: print("[Login] Fehler: fehlende Credentials. Setze .env (WIKI_BOT_USER/WIKI_BOT_PASSWORD) oder CLI --username/--password.", file=sys.stderr) @@ -526,6 +538,5 @@ def main() -> None: result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run) print(f"[Result] {result}") - if __name__ == "__main__": main()