scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 14:08:39 +02:00
parent 2567d8c786
commit 6bab3cdf04

View File

@ -10,22 +10,18 @@ Beschreibung:
- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
v2.3.3 Änderungen ggü. 2.3.2:
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung):
Titel: _ → Leerzeichen, Gedankenstriche → Bindestrich
summary/execution/notes: Whitespace kollabieren
keywords: dedupliziert (case-insensitiv) & sortiert
duration_minutes: sicher als int
- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
(verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
- Kein API-/CLI-Bruch
v2.3.4 Änderungen ggü. 2.3.3:
- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen
(z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox").
- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über
mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf).
- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates.
"""
import os
import sys
import argparse
from typing import Dict, Any, Tuple, Optional
from typing import Dict, Any, Tuple, Optional, List
from collections.abc import Mapping
import requests
import mwparserfromhell
@ -33,6 +29,7 @@ from dotenv import load_dotenv
import hashlib
import json
import time
import unicodedata
# ----- Konfiguration / Defaults -----
load_dotenv() # .env laden, falls vorhanden
@ -43,7 +40,41 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
# ---- Hilfsfunktionen für Wiki-Router ----
# ---- Unicode-/Key-Normalisierung ----
def _norm_unicode(s: str) -> str:
return unicodedata.normalize("NFKC", s)
def _strip_diacritics(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
def _norm_key(s: str) -> str:
s = _norm_unicode(s or "")
s = _strip_diacritics(s)
s = s.strip().casefold()
return s
# Template-Aliasse (normalisierte Namen)
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"}
TPL_SKILLDEV = {"skilldevelopment"}
# Synonyme für Parameter (normalisierte Keys)
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "keywords", "tags"]
KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "material"]
KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline"]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
# ---- Hilfsfunktionen ----
def wiki_health() -> None:
r = requests.get(f"{API_BASE_URL}/health", timeout=15)
@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
wikicode = mwparserfromhell.parse(wikitext)
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
# Templates sammeln (robust gegen Varianten)
for tpl in wikicode.filter_templates():
name = str(tpl.name).strip()
if name == "ÜbungInfoBox":
name_raw = str(tpl.name)
name_norm = _norm_key(name_raw)
if name_norm in TPL_UEBUNG_INFOBOX:
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
elif name == "Übungsbeschreibung":
elif name_norm in TPL_UEBUNGSBESCHREIBUNG:
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
elif name == "SkillDevelopment":
elif name_norm in TPL_SKILLDEV:
raw.setdefault("capabilities", [])
# Standard-Keys (engl. Template)
def _getp(t, k):
try:
return str(t.get(k).value).strip()
except Exception:
return ""
cap = _getp(tpl, "PrimaryCapability")
lvl = _getp(tpl, "CapabilityLevel")
try:
cap = str(tpl.get("PrimaryCapability").value).strip()
lvl_i = int(lvl)
except Exception:
cap = ""
try:
lvl = int(str(tpl.get("CapabilityLevel").value).strip())
except Exception:
lvl = 0
lvl_i = 0
if cap:
raw["capabilities"].append({"capability": cap, "level": lvl})
raw["capabilities"].append({"capability": cap, "level": lvl_i})
raw["wikitext"] = wikitext
return raw
# ---- Fingerprint-Unterstützung (stabil) ----
def _normalize(v: Any) -> str:
@ -138,22 +175,17 @@ def _norm_text(s: str) -> str:
def _canon_title(t: str) -> str:
t = (t or "").strip().replace("_", " ")
# Gedankenstriche vereinheitlichen
return t.replace("", "-").replace("", "-")
def compute_fingerprint(payload: Dict[str, Any]) -> str:
# keywords stabilisieren: trim, dedupe (case-insensitiv), sort
kws = payload.get("keywords") or []
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
# dauer als int
dur = payload.get("duration_minutes") or 0
try:
dur = int(round(float(dur)))
except Exception:
dur = 0
fields = [
_canon_title(payload.get("title", "")),
_norm_text(payload.get("summary", "")),
@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
base = "|".join(_normalize(f) for f in fields)
return hashlib.sha256(base.encode("utf-8")).hexdigest()
# ---- Feldauflösung (Synonyme) ----
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
    """Re-key *d* with _norm_key-normalized keys; non-string keys are dropped.

    On normalized-key collisions the later entry wins (dict iteration order).
    """
    out: Dict[str, Any] = {}
    for key, value in d.items():
        if isinstance(key, str):
            out[_norm_key(key)] = value
    return out
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
    """Return the first non-empty value for any candidate key, else None.

    *candidates* must already be normalized keys (see the KEYS_* lists);
    *d* is normalized on the fly via _norm_keymap.
    """
    normalized = _norm_keymap(d)
    for key in candidates:
        value = normalized.get(key)
        if value not in (None, ""):
            return value
    return None
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
# Exercise.capabilities erwartet Dict[str,int]
caps_list = raw.get("capabilities", [])
capabilities = {}
capabilities: Dict[str, int] = {}
for c in caps_list:
cap = c.get("capability")
lvl = c.get("level")
@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
except Exception:
pass
# Defaults/Fallbacks
try:
duration = float(raw.get("Dauer", 0) or 0)
except Exception:
duration = 0.0
# Defaults/Fallbacks via Synonyme
# summary / execution
summary = _get_first(raw, KEYS_SUMMARY) or ""
execution = _get_first(raw, KEYS_EXECUTION) or ""
keywords = []
kw_raw = raw.get("Schlüsselworte", "")
# duration
duration = _get_first(raw, KEYS_DURATION)
try:
duration_f = float(duration or 0)
except Exception:
duration_f = 0.0
# keywords
kw_raw = _get_first(raw, KEYS_KEYWORDS)
keywords: List[str] = []
if isinstance(kw_raw, str):
keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]
equipment = []
eq_raw = raw.get("equipment", [])
# equipment
eq_raw = _get_first(raw, KEYS_EQUIPMENT)
equipment: List[str] = []
if isinstance(eq_raw, str):
equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
elif isinstance(eq_raw, list):
equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
notes = raw.get("Hinweise", "") or ""
notes = _get_first(raw, KEYS_NOTES) or ""
if mutate:
notes = (notes + " [auto-update]").strip()
notes = (str(notes) + " [auto-update]").strip()
discipline = _get_first(raw, KEYS_DISCIPLINE) or ""
group = _get_first(raw, KEYS_GROUP) or None
age_group = _get_first(raw, KEYS_AGE_GROUP) or ""
target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
purpose = _get_first(raw, KEYS_PURPOSE) or ""
preparation = _get_first(raw, KEYS_PREPARATION) or ""
method = _get_first(raw, KEYS_METHOD) or ""
payload: Dict[str, Any] = {
"title": raw.get("title") or "",
"summary": raw.get("Summary", "") or "",
"short_description": raw.get("Summary", "") or "",
"summary": str(summary) or "",
"short_description": str(summary) or "",
"keywords": keywords,
"link": fullurl or "",
"discipline": raw.get("Übungstyp", "") or "",
"group": raw.get("Gruppengröße", "") or None,
"age_group": raw.get("Altersgruppe", "") or "",
"target_group": raw.get("Zielgruppe", "") or "",
"discipline": str(discipline) or "",
"group": str(group) if group else None,
"age_group": str(age_group) or "",
"target_group": str(target_group) or "",
"min_participants": 1,
"duration_minutes": int(round(duration)),
"duration_minutes": int(round(duration_f)),
"capabilities": capabilities,
"category": category or "",
"purpose": raw.get("Ziel", "") or "",
"execution": raw.get("Durchführung", "") or "",
"notes": notes,
"preparation": raw.get("RefMethode", "") or "",
"method": raw.get("method", "") or "",
"purpose": str(purpose) or "",
"execution": str(execution) or "",
"notes": str(notes) or "",
"preparation": str(preparation) or "",
"method": str(method) or "",
"equipment": equipment,
"fullurl": fullurl or "",
"external_id": f"mw:{raw.get('pageid')}",
@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
payload["fingerprint"] = compute_fingerprint(payload)
return payload
# ---- Lookup/Upsert ----
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
url = f"{EXERCISE_API}/by-external-id"
@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
}
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Print a field-level diff of the fingerprint-relevant fields.

    Text fields are normalized the same way the fingerprint normalizes
    them, so only differences that would actually change the hash are
    reported.
    """
    keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]

    def _sorted_keywords(values):
        # Dedupe exactly, then sort case-insensitively for a stable compare.
        return sorted({(kw or "").strip() for kw in (values or [])}, key=str.casefold)

    def _normalized_view(payload):
        # One side of the comparison, restricted to the hash fields.
        return {
            "title": _canon_title(payload.get("title")),
            "summary": _norm_text(payload.get("summary")),
            "execution": _norm_text(payload.get("execution")),
            "notes": _norm_text(payload.get("notes")),
            "duration_minutes": payload.get("duration_minutes"),
            "capabilities": payload.get("capabilities"),
            "keywords": _sorted_keywords(payload.get("keywords")),
        }

    b_norm = _normalized_view(before)
    a_norm = _normalized_view(after)
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
title = payload.get("title", "<ohne Titel>")
ext_id = payload.get("external_id")
@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
action, reason = "create", "unexpected lookup type"
if dry_run:
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}")
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}")
if action == "update":
_print_diff(found_payload, payload)
return action
@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
pass
else:
resp.raise_for_status()
print(f"[Update] '{title}' {reason} -> OK")
print(f"[Update] '{title}' - {reason} -> OK")
_print_diff(found_payload, payload)
else:
print(f"[Skip] '{title}' {reason}")
print(f"[Skip] '{title}' - {reason}")
return action
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Small field diff over the hash-relevant fields (diagnostics).

    NOTE(review): a second ``_print_diff`` with the same signature exists
    earlier in this file; since this one is defined later it shadows the
    earlier definition at import time — confirm which version is intended
    and remove the duplicate.
    """
    keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]
    b = {k: before.get(k) for k in keys}
    a = {k: after.get(k) for k in keys}
    # Normalize the text fields (same normalization the fingerprint uses)
    # so the printed diff only shows hash-relevant differences.
    b_norm = {
        "title": _canon_title(b.get("title")),
        "summary": _norm_text(b.get("summary")),
        "execution": _norm_text(b.get("execution")),
        "notes": _norm_text(b.get("notes")),
        "duration_minutes": b.get("duration_minutes"),
        "capabilities": b.get("capabilities"),
        "keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold),
    }
    a_norm = {
        "title": _canon_title(a.get("title")),
        "summary": _norm_text(a.get("summary")),
        "execution": _norm_text(a.get("execution")),
        "notes": _norm_text(a.get("notes")),
        "duration_minutes": a.get("duration_minutes"),
        "capabilities": a.get("capabilities"),
        "keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold),
    }
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
# ----- Orchestrierung -----
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: