scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 15:33:15 +02:00
parent e12fd8f96a
commit 34320b46d9

View File

@@ -1,14 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
wiki_importer.py v2.3.6 wiki_importer.py v2.3.7
Fixes ggü. v2.3.5: Fixes ggü. v2.3.6:
- **Equipment**: eigenes Template `{{Hilfsmittel}}` wird jetzt ausgewertet `equipment` wird gesetzt. - **Keywords/EQUIPMENT/DISCIPLINE u.a. wurden teils nicht erkannt**: Bugfix in `_get_first()`
- **Keywords**: Synonyme bleiben; zusätzlich kleine Robustheit für ungewöhnliche Schreibungen. Kandidatenschlüssel werden jetzt ebenfalls normalisiert (`_norm_key(c)`), damit
- **imported_at**: wird nun **sowohl bei UPDATE als auch bei CREATE** gesetzt, damit das Feld immer gefüllt ist. `Schlüsselworte` (aus dem Wiki) zuverlässig matcht.
- `_get_first_fuzzy()` normalisiert die Such-Tokens.
- Kleine Bugfixes/Polish: `action.upper()` im Dry-Run, sanftere Keywords-Splittung.
Keine API-/CLI-Änderungen. Hinweis: Keine API-/CLI-Änderungen. Parser unterstützt weiterhin `{{Hilfsmittel}}`.
""" """
import os import os
@@ -54,9 +56,9 @@ def _norm_tpl(s: str) -> str:
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"} TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"} TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"}
TPL_SKILLDEV = {"skilldevelopment"} TPL_SKILLDEV = {"skilldevelopment"}
TPL_HILFSMITTEL = {"hilfsmittel"} # << NEU TPL_HILFSMITTEL = {"hilfsmittel"}
# Synonyme für Parameter (normalisierte Keys via _norm_key) # Synonyme (werden im Code nochmals normalisiert)
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"] KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"] KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"] KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
@@ -138,7 +140,7 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
lvl_i = 0 lvl_i = 0
if cap: if cap:
raw["capabilities"].append({"capability": cap, "level": lvl_i}) raw["capabilities"].append({"capability": cap, "level": lvl_i})
elif name_norm in TPL_HILFSMITTEL: # << NEU: Template für Ausrüstung elif name_norm in TPL_HILFSMITTEL:
for p in tpl.params: for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip() raw[str(p.name).strip()] = str(p.value).strip()
@@ -170,7 +172,7 @@ def _canon_title(t: str) -> str:
def compute_fingerprint(payload: Dict[str, Any]) -> str: def compute_fingerprint(payload: Dict[str, Any]) -> str:
kws = payload.get("keywords") or [] kws = payload.get("keywords") or []
# robust gegen Strichvarianten und Doppelkommas # Strichvarianten normalisieren
kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws] kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
dur = payload.get("duration_minutes") or 0 dur = payload.get("duration_minutes") or 0
@@ -198,17 +200,18 @@ def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any: def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
m = _norm_keymap(d) m = _norm_keymap(d)
for c in candidates: for c in candidates:
v = m.get(c) v = m.get(_norm_key(c)) # << Bugfix: Kandidaten ebenfalls normalisieren
if v not in (None, ""): if v not in (None, ""):
return v return v
return None return None
def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any: def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any:
m = _norm_keymap(d) m = _norm_keymap(d)
toks = [_norm_key(t) for t in tokens]
for k, v in m.items(): for k, v in m.items():
if v in (None, ""): if v in (None, ""):
continue continue
if all(t in k for t in tokens): if all(t in k for t in toks):
return v return v
return None return None
@@ -246,7 +249,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or "" kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or ""
keywords: List[str] = [] keywords: List[str] = []
if isinstance(kw_raw, str): if isinstance(kw_raw, str):
# robuste Auftrennung; ignoriert doppelte Kommas u. ä. # robuste Auftrennung; ignoriert doppelte Kommas/Zeilenumbrüche
parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")] parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
keywords = [p for p in parts if p] keywords = [p for p in parts if p]
@@ -389,14 +392,14 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
action, reason = "create", "unexpected lookup type" action, reason = "create", "unexpected lookup type"
if dry_run: if dry_run:
print(f"[DryRun] {action.UPPER():6} '{title}' ({ext_id}) {reason}") print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}")
if action == "update": if action == "update":
_print_diff(found_payload, payload) _print_diff(found_payload, payload)
return action return action
if action == "create": if action == "create":
payload2 = dict(payload) payload2 = dict(payload)
payload2["imported_at"] = _now_iso() # << immer setzen payload2["imported_at"] = _now_iso()
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT) resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
@@ -409,7 +412,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
print(f"[Create] '{title}' {reason} -> OK") print(f"[Create] '{title}' {reason} -> OK")
elif action == "update": elif action == "update":
payload2 = dict(payload) payload2 = dict(payload)
payload2["imported_at"] = _now_iso() # << setzen bei Update payload2["imported_at"] = _now_iso()
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT) resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")