scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 15:21:55 +02:00
parent cf085f8ef0
commit e12fd8f96a

View File

@ -1,20 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
wiki_importer.py v2.3.5 wiki_importer.py v2.3.6
Fix: Einige Felder (discipline, execution, keywords, equipment) wurden in einzelnen Seiten Fixes ggü. v2.3.5:
nicht mehr gefüllt. Ursache: Template-/Key-Varianten (z. B. "Übung Infobox" mit Leerzeichen, - **Equipment**: eigenes Template `{{Hilfsmittel}}` wird jetzt ausgewertet → `equipment` wird gesetzt.
"Geräte/Material", "Schlüsselwörter", "Hilfsmittel", "Ablauf" usw.) wurden vom Matcher - **Keywords**: Synonyme bleiben; zusätzlich kleine Robustheit für ungewöhnliche Schreibungen.
nicht immer erkannt. - **imported_at**: wird nun **sowohl bei UPDATE als auch bei CREATE** gesetzt, damit das Feld immer gefüllt ist.
Änderungen ggü. v2.3.4: Keine API-/CLI-Änderungen.
- Separater Normalizer für Template-Namen (entfernt Nicht-Alphanumerika), dadurch matchen auch
Varianten wie "Übung Infobox", "Uebung-Infobox" etc.
- Erweitertes Synonym-Set für Felder: execution/keywords/equipment/discipline u. a.
- Fuzzy-Fallback: Wenn _get_first() nichts findet, suche Keys, die die Token enthalten
(z. B. "gerate/material" → equipment).
- Keine API-/CLI-Änderungen.
""" """
import os import os
@ -47,15 +41,12 @@ def _strip_diacritics(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch)) return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
def _norm_key(s: str) -> str:
    """Normalize a template parameter name so it can be used as a lookup key.

    The value is passed through ``_norm_unicode`` and ``_strip_diacritics``,
    then trimmed and case-folded. Punctuation is deliberately kept so that
    combined keys such as ``"gerate/material"`` stay distinguishable from
    their individual parts.

    Args:
        s: Raw parameter name; a falsy value (``None`` or ``""``) is treated
            as the empty string.

    Returns:
        The normalized key.
    """
    s = _norm_unicode(s or "")
    s = _strip_diacritics(s)
    return s.strip().casefold()
def _norm_tpl(s: str) -> str:
    """Normalize a template name for matching against the ``TPL_*`` sets.

    Applies ``_norm_key`` and additionally removes every character that is
    not alphanumeric, so spelling variants that differ only in spaces or
    hyphens collapse to the same token.

    Args:
        s: Raw template name as it appears in the wikitext.

    Returns:
        The normalized name containing only alphanumeric characters.
    """
    return "".join(ch for ch in _norm_key(s) if ch.isalnum())
@ -63,12 +54,13 @@ def _norm_tpl(s: str) -> str:
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"} TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"} TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"}
TPL_SKILLDEV = {"skilldevelopment"} TPL_SKILLDEV = {"skilldevelopment"}
TPL_HILFSMITTEL = {"hilfsmittel"} # << NEU
# Synonyme für Parameter (normalisierte Keys via _norm_key) # Synonyme für Parameter (normalisierte Keys via _norm_key)
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"] KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"] KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"] KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags"] KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags", "schluesselwort", "schlüsselwort"]
KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"] KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"]
KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"] KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"] KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
@ -130,7 +122,6 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
name_norm = _norm_tpl(str(tpl.name)) name_norm = _norm_tpl(str(tpl.name))
if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG: if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG:
for p in tpl.params: for p in tpl.params:
# Original-Parameternamen übernehmen; Normalisierung passiert später
raw[str(p.name).strip()] = str(p.value).strip() raw[str(p.name).strip()] = str(p.value).strip()
elif name_norm in TPL_SKILLDEV: elif name_norm in TPL_SKILLDEV:
raw.setdefault("capabilities", []) raw.setdefault("capabilities", [])
@ -147,6 +138,9 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
lvl_i = 0 lvl_i = 0
if cap: if cap:
raw["capabilities"].append({"capability": cap, "level": lvl_i}) raw["capabilities"].append({"capability": cap, "level": lvl_i})
elif name_norm in TPL_HILFSMITTEL: # << NEU: Template für Ausrüstung
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
raw["wikitext"] = wikitext raw["wikitext"] = wikitext
return raw return raw
@ -176,6 +170,8 @@ def _canon_title(t: str) -> str:
def compute_fingerprint(payload: Dict[str, Any]) -> str: def compute_fingerprint(payload: Dict[str, Any]) -> str:
kws = payload.get("keywords") or [] kws = payload.get("keywords") or []
# robust gegen Strichvarianten und Doppelkommas
kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
dur = payload.get("duration_minutes") or 0 dur = payload.get("duration_minutes") or 0
try: try:
@ -208,7 +204,6 @@ def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
return None return None
def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any: def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any:
# Finde einen Key, der alle Tokens (als Teilstring) enthält
m = _norm_keymap(d) m = _norm_keymap(d)
for k, v in m.items(): for k, v in m.items():
if v in (None, ""): if v in (None, ""):
@ -236,8 +231,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
summary = _get_first(raw, KEYS_SUMMARY) or "" summary = _get_first(raw, KEYS_SUMMARY) or ""
execution = _get_first(raw, KEYS_EXECUTION) execution = _get_first(raw, KEYS_EXECUTION)
if execution in (None, ""): if execution in (None, ""):
# Fuzzy: Key enthält z. B. "ablauf" oder "durchfuehrung" execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf"]) or ""
execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf",]) or ""
# duration # duration
duration = _get_first(raw, KEYS_DURATION) duration = _get_first(raw, KEYS_DURATION)
@ -252,7 +246,9 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or "" kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or ""
keywords: List[str] = [] keywords: List[str] = []
if isinstance(kw_raw, str): if isinstance(kw_raw, str):
keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] # robuste Auftrennung; ignoriert doppelte Kommas u. ä.
parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
keywords = [p for p in parts if p]
# equipment # equipment
eq_raw = _get_first(raw, KEYS_EQUIPMENT) eq_raw = _get_first(raw, KEYS_EQUIPMENT)
@ -260,7 +256,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or "" eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or ""
equipment: List[str] = [] equipment: List[str] = []
if isinstance(eq_raw, str): if isinstance(eq_raw, str):
equipment = [e.strip() for e in eq_raw.split(",") if e.strip()] equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
elif isinstance(eq_raw, list): elif isinstance(eq_raw, list):
equipment = [str(e).strip() for e in eq_raw if str(e).strip()] equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
@ -306,7 +302,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
payload["fingerprint"] = compute_fingerprint(payload) payload["fingerprint"] = compute_fingerprint(payload)
return payload return payload
# ---- Lookup/Upsert (wie v2.3.4) ---- # ---- Lookup/Upsert ----
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
url = f"{EXERCISE_API}/by-external-id" url = f"{EXERCISE_API}/by-external-id"
@ -365,6 +361,10 @@ def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
print("[Diff] (none in hash fields)") print("[Diff] (none in hash fields)")
def _now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # gmtime() yields UTC, which is what the literal trailing "Z" denotes.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
title = payload.get("title", "<ohne Titel>") title = payload.get("title", "<ohne Titel>")
ext_id = payload.get("external_id") ext_id = payload.get("external_id")
@ -389,13 +389,15 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
action, reason = "create", "unexpected lookup type" action, reason = "create", "unexpected lookup type"
if dry_run: if dry_run:
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}") print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}")
if action == "update": if action == "update":
_print_diff(found_payload, payload) _print_diff(found_payload, payload)
return action return action
if action == "create": if action == "create":
resp = requests.post(EXERCISE_API, json=payload, timeout=REQUEST_TIMEOUT) payload2 = dict(payload)
payload2["imported_at"] = _now_iso() # << immer setzen
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
try: try:
@ -407,7 +409,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
print(f"[Create] '{title}' {reason} -> OK") print(f"[Create] '{title}' {reason} -> OK")
elif action == "update": elif action == "update":
payload2 = dict(payload) payload2 = dict(payload)
payload2["imported_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) payload2["imported_at"] = _now_iso() # << setzen bei Update
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT) resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")