From e12fd8f96adbda47c4dac32c7ad7117344d9647a Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 11 Aug 2025 15:21:55 +0200 Subject: [PATCH] scripts/wiki_importer.py aktualisiert --- scripts/wiki_importer.py | 54 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/scripts/wiki_importer.py b/scripts/wiki_importer.py index a235a92..d541fd4 100644 --- a/scripts/wiki_importer.py +++ b/scripts/wiki_importer.py @@ -1,20 +1,14 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -wiki_importer.py – v2.3.5 +wiki_importer.py – v2.3.6 -Fix: Einige Felder (discipline, execution, keywords, equipment) wurden in einzelnen Seiten -nicht mehr gefüllt. Ursache: Template-/Key-Varianten (z. B. "Übung Infobox" mit Leerzeichen, -"Geräte/Material", "Schlüsselwörter", "Hilfsmittel", "Ablauf" usw.) wurden vom Matcher -nicht immer erkannt. +Fixes ggü. v2.3.5: +- **Equipment**: eigenes Template `{{Hilfsmittel}}` wird jetzt ausgewertet → `equipment` wird gesetzt. +- **Keywords**: Synonyme bleiben; zusätzlich kleine Robustheit für ungewöhnliche Schreibungen. +- **imported_at**: wird nun **sowohl bei UPDATE als auch bei CREATE** gesetzt, damit das Feld immer gefüllt ist. -Änderungen ggü. v2.3.4: -- Separater Normalizer für Template-Namen (entfernt Nicht‑Alphanumerika), dadurch matchen auch - Varianten wie "Übung Infobox", "Uebung-Infobox" etc. -- Erweitertes Synonym-Set für Felder: execution/keywords/equipment/discipline u. a. -- Fuzzy‑Fallback: Wenn _get_first() nichts findet, suche Keys, die die Token enthalten - (z. B. "gerate/material" ⇒ equipment). -- Keine API-/CLI-Änderungen. +Keine API-/CLI-Änderungen. 
""" import os @@ -47,15 +41,12 @@ def _strip_diacritics(s: str) -> str: return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch)) def _norm_key(s: str) -> str: - # Für Parameter-Namen: diakritikfrei + getrimmt + casefold; Sonderzeichen bleiben erhalten, - # damit Kombinationen wie "gerate/material" unterscheidbar sind s = _norm_unicode(s or "") s = _strip_diacritics(s) s = s.strip().casefold() return s def _norm_tpl(s: str) -> str: - # Für Template-Namen: zusätzlich alle Nicht‑Alphanumerika entfernen s = _norm_key(s) return "".join(ch for ch in s if ch.isalnum()) @@ -63,12 +54,13 @@ def _norm_tpl(s: str) -> str: TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"} TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"} TPL_SKILLDEV = {"skilldevelopment"} +TPL_HILFSMITTEL = {"hilfsmittel"} # << NEU # Synonyme für Parameter (normalisierte Keys via _norm_key) KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"] KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"] KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"] -KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags"] +KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags", "schluesselwort", "schlüsselwort"] KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"] KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"] KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"] @@ -130,7 +122,6 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: name_norm = _norm_tpl(str(tpl.name)) if 
name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG: for p in tpl.params: - # Original-Parameternamen übernehmen; Normalisierung passiert später raw[str(p.name).strip()] = str(p.value).strip() elif name_norm in TPL_SKILLDEV: raw.setdefault("capabilities", []) @@ -147,6 +138,9 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: lvl_i = 0 if cap: raw["capabilities"].append({"capability": cap, "level": lvl_i}) + elif name_norm in TPL_HILFSMITTEL: # << NEU: Template für Ausrüstung + for p in tpl.params: + raw[str(p.name).strip()] = str(p.value).strip() raw["wikitext"] = wikitext return raw @@ -176,6 +170,8 @@ def _canon_title(t: str) -> str: def compute_fingerprint(payload: Dict[str, Any]) -> str: kws = payload.get("keywords") or [] + # robust gegen Strichvarianten und Doppelkommas + kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws] kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) dur = payload.get("duration_minutes") or 0 try: @@ -208,7 +204,6 @@ def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any: return None def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any: - # Finde einen Key, der alle Tokens (als Teilstring) enthält m = _norm_keymap(d) for k, v in m.items(): if v in (None, ""): @@ -236,8 +231,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b summary = _get_first(raw, KEYS_SUMMARY) or "" execution = _get_first(raw, KEYS_EXECUTION) if execution in (None, ""): - # Fuzzy: Key enthält z. B. 
"ablauf" oder "durchfuehrung" - execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf",]) or "" + execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf"]) or "" # duration duration = _get_first(raw, KEYS_DURATION) @@ -252,7 +246,9 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or "" keywords: List[str] = [] if isinstance(kw_raw, str): - keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] + # robuste Auftrennung; ignoriert doppelte Kommas u. ä. + parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")] + keywords = [p for p in parts if p] # equipment eq_raw = _get_first(raw, KEYS_EQUIPMENT) @@ -260,7 +256,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or "" equipment: List[str] = [] if isinstance(eq_raw, str): - equipment = [e.strip() for e in eq_raw.split(",") if e.strip()] + equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()] elif isinstance(eq_raw, list): equipment = [str(e).strip() for e in eq_raw if str(e).strip()] @@ -306,7 +302,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b payload["fingerprint"] = compute_fingerprint(payload) return payload -# ---- Lookup/Upsert (wie v2.3.4) ---- +# ---- Lookup/Upsert ---- def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: url = f"{EXERCISE_API}/by-external-id" @@ -365,6 +361,10 @@ def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None: print("[Diff] (none in hash fields)") +def _now_iso() -> str: + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + + def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: title = 
payload.get("title", "") ext_id = payload.get("external_id") @@ -389,13 +389,15 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: action, reason = "create", "unexpected lookup type" if dry_run: - print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}") + print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}") if action == "update": _print_diff(found_payload, payload) return action if action == "create": - resp = requests.post(EXERCISE_API, json=payload, timeout=REQUEST_TIMEOUT) + payload2 = dict(payload) + payload2["imported_at"] = _now_iso() # << immer setzen + resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT) if resp.status_code == 422: print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}") try: @@ -407,7 +409,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: print(f"[Create] '{title}' – {reason} -> OK") elif action == "update": payload2 = dict(payload) - payload2["imported_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + payload2["imported_at"] = _now_iso() # << setzen bei Update resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT) if resp.status_code == 422: print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")