scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 14:08:39 +02:00
parent 2567d8c786
commit 6bab3cdf04

View File

@ -10,22 +10,18 @@ Beschreibung:
- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern - Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update) - Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
v2.3.3 Änderungen ggü. 2.3.2: v2.3.4 Änderungen ggü. 2.3.3:
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung): - **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen
Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich (z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox").
summary/execution/notes: Whitespace kollabieren - **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über
keywords: dedupliziert (case-insensitiv) & sortiert mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf).
duration_minutes: sicher als int - Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates.
- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
(verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
- Kein API-/CLI-Bruch
""" """
import os import os
import sys import sys
import argparse import argparse
from typing import Dict, Any, Tuple, Optional from typing import Dict, Any, Tuple, Optional, List
from collections.abc import Mapping from collections.abc import Mapping
import requests import requests
import mwparserfromhell import mwparserfromhell
@ -33,6 +29,7 @@ from dotenv import load_dotenv
import hashlib import hashlib
import json import json
import time import time
import unicodedata
# ----- Konfiguration / Defaults ----- # ----- Konfiguration / Defaults -----
load_dotenv() # .env laden, falls vorhanden load_dotenv() # .env laden, falls vorhanden
@ -43,7 +40,41 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60")) REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
# ---- Hilfsfunktionen für Wiki-Router ---- # ---- Unicode-/Key-Normalisierung ----
def _norm_unicode(s: str) -> str:
return unicodedata.normalize("NFKC", s)
def _strip_diacritics(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
def _norm_key(s: str) -> str:
s = _norm_unicode(s or "")
s = _strip_diacritics(s)
s = s.strip().casefold()
return s
# Template aliases, stored in the form _norm_key() produces
# (NFKC, diacritics removed, stripped, casefolded).
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"}
TPL_SKILLDEV = {"skilldevelopment"}

# Parameter synonyms, in order of precedence.
# Lookups run against keys that went through _norm_key(), which removes
# diacritics and casefolds ("Durchführung" -> "durchfuhrung",
# "Gruppengröße" -> "gruppengrosse"). An umlaut spelling can therefore never
# equal a normalized key, so each one is followed by its normalized form.
# The original spellings are kept so that the lists stay backward-compatible.
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = [
    "durchführung", "durchfuhrung", "durchfuehrung",
    "ausführung", "ausfuhrung", "ausfuehrung",
    "execution", "ablauf",
]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = [
    "schlüsselworte", "schlusselworte", "schluesselworte",
    "keywords", "tags",
]
KEYS_EQUIPMENT = ["equipment", "geräte", "gerate", "geraete", "material"]
KEYS_DISCIPLINE = ["übungstyp", "ubungstyp", "uebungstyp", "discipline"]
KEYS_GROUP = ["gruppengröße", "gruppengrosse", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
# ---- Hilfsfunktionen ----
def wiki_health() -> None: def wiki_health() -> None:
r = requests.get(f"{API_BASE_URL}/health", timeout=15) r = requests.get(f"{API_BASE_URL}/health", timeout=15)
@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
wikicode = mwparserfromhell.parse(wikitext) wikicode = mwparserfromhell.parse(wikitext)
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
# Templates sammeln (robust gegen Varianten)
for tpl in wikicode.filter_templates(): for tpl in wikicode.filter_templates():
name = str(tpl.name).strip() name_raw = str(tpl.name)
if name == "ÜbungInfoBox": name_norm = _norm_key(name_raw)
if name_norm in TPL_UEBUNG_INFOBOX:
for p in tpl.params: for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip() raw[str(p.name).strip()] = str(p.value).strip()
elif name == "Übungsbeschreibung": elif name_norm in TPL_UEBUNGSBESCHREIBUNG:
for p in tpl.params: for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip() raw[str(p.name).strip()] = str(p.value).strip()
elif name == "SkillDevelopment": elif name_norm in TPL_SKILLDEV:
raw.setdefault("capabilities", []) raw.setdefault("capabilities", [])
# Standard-Keys (engl. Template)
def _getp(t, k):
try:
return str(t.get(k).value).strip()
except Exception:
return ""
cap = _getp(tpl, "PrimaryCapability")
lvl = _getp(tpl, "CapabilityLevel")
try: try:
cap = str(tpl.get("PrimaryCapability").value).strip() lvl_i = int(lvl)
except Exception: except Exception:
cap = "" lvl_i = 0
try:
lvl = int(str(tpl.get("CapabilityLevel").value).strip())
except Exception:
lvl = 0
if cap: if cap:
raw["capabilities"].append({"capability": cap, "level": lvl}) raw["capabilities"].append({"capability": cap, "level": lvl_i})
raw["wikitext"] = wikitext raw["wikitext"] = wikitext
return raw return raw
# ---- Fingerprint-Unterstützung (stabil) ---- # ---- Fingerprint-Unterstützung (stabil) ----
def _normalize(v: Any) -> str: def _normalize(v: Any) -> str:
@ -138,22 +175,17 @@ def _norm_text(s: str) -> str:
def _canon_title(t: str) -> str: def _canon_title(t: str) -> str:
t = (t or "").strip().replace("_", " ") t = (t or "").strip().replace("_", " ")
# Gedankenstriche vereinheitlichen
return t.replace("", "-").replace("", "-") return t.replace("", "-").replace("", "-")
def compute_fingerprint(payload: Dict[str, Any]) -> str: def compute_fingerprint(payload: Dict[str, Any]) -> str:
# keywords stabilisieren: trim, dedupe (case-insensitiv), sort
kws = payload.get("keywords") or [] kws = payload.get("keywords") or []
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
# dauer als int
dur = payload.get("duration_minutes") or 0 dur = payload.get("duration_minutes") or 0
try: try:
dur = int(round(float(dur))) dur = int(round(float(dur)))
except Exception: except Exception:
dur = 0 dur = 0
fields = [ fields = [
_canon_title(payload.get("title", "")), _canon_title(payload.get("title", "")),
_norm_text(payload.get("summary", "")), _norm_text(payload.get("summary", "")),
@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
base = "|".join(_normalize(f) for f in fields) base = "|".join(_normalize(f) for f in fields)
return hashlib.sha256(base.encode("utf-8")).hexdigest() return hashlib.sha256(base.encode("utf-8")).hexdigest()
# ---- Feldauflösung (Synonyme) ----
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *d* whose string keys are passed through ``_norm_key``.

    Non-string keys are dropped. If several keys normalize to the same
    string, the value seen last in iteration order is kept.
    """
    normalized: Dict[str, Any] = {}
    for key, value in d.items():
        if isinstance(key, str):
            normalized[_norm_key(key)] = value
    return normalized
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
    """Return the first non-empty value in *d* stored under any candidate key.

    Both the keys of *d* and the candidate names are compared in their
    ``_norm_key`` form, so "Durchführung", "durchführung" and "durchfuhrung"
    all address the same entry. Candidates are tried in list order.

    Returns ``None`` when no candidate has a value other than ``None`` or ``""``.
    """
    m = _norm_keymap(d)
    for c in candidates:
        # Normalize the candidate too: the map keys have their diacritics
        # stripped, so a raw spelling with an umlaut would never match.
        v = m.get(_norm_key(c))
        if v not in (None, ""):
            return v
    return None
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]: def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
# Exercise.capabilities erwartet Dict[str,int] # Exercise.capabilities erwartet Dict[str,int]
caps_list = raw.get("capabilities", []) caps_list = raw.get("capabilities", [])
capabilities = {} capabilities: Dict[str, int] = {}
for c in caps_list: for c in caps_list:
cap = c.get("capability") cap = c.get("capability")
lvl = c.get("level") lvl = c.get("level")
@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
except Exception: except Exception:
pass pass
# Defaults/Fallbacks # Defaults/Fallbacks via Synonyme
try: # summary / execution
duration = float(raw.get("Dauer", 0) or 0) summary = _get_first(raw, KEYS_SUMMARY) or ""
except Exception: execution = _get_first(raw, KEYS_EXECUTION) or ""
duration = 0.0
keywords = [] # duration
kw_raw = raw.get("Schlüsselworte", "") duration = _get_first(raw, KEYS_DURATION)
try:
duration_f = float(duration or 0)
except Exception:
duration_f = 0.0
# keywords
kw_raw = _get_first(raw, KEYS_KEYWORDS)
keywords: List[str] = []
if isinstance(kw_raw, str): if isinstance(kw_raw, str):
keywords = [k.strip() for k in kw_raw.split(",") if k.strip()] keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]
equipment = [] # equipment
eq_raw = raw.get("equipment", []) eq_raw = _get_first(raw, KEYS_EQUIPMENT)
equipment: List[str] = []
if isinstance(eq_raw, str): if isinstance(eq_raw, str):
equipment = [e.strip() for e in eq_raw.split(",") if e.strip()] equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
elif isinstance(eq_raw, list): elif isinstance(eq_raw, list):
equipment = [str(e).strip() for e in eq_raw if str(e).strip()] equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
notes = raw.get("Hinweise", "") or "" notes = _get_first(raw, KEYS_NOTES) or ""
if mutate: if mutate:
notes = (notes + " [auto-update]").strip() notes = (str(notes) + " [auto-update]").strip()
discipline = _get_first(raw, KEYS_DISCIPLINE) or ""
group = _get_first(raw, KEYS_GROUP) or None
age_group = _get_first(raw, KEYS_AGE_GROUP) or ""
target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
purpose = _get_first(raw, KEYS_PURPOSE) or ""
preparation = _get_first(raw, KEYS_PREPARATION) or ""
method = _get_first(raw, KEYS_METHOD) or ""
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"title": raw.get("title") or "", "title": raw.get("title") or "",
"summary": raw.get("Summary", "") or "", "summary": str(summary) or "",
"short_description": raw.get("Summary", "") or "", "short_description": str(summary) or "",
"keywords": keywords, "keywords": keywords,
"link": fullurl or "", "link": fullurl or "",
"discipline": raw.get("Übungstyp", "") or "", "discipline": str(discipline) or "",
"group": raw.get("Gruppengröße", "") or None, "group": str(group) if group else None,
"age_group": raw.get("Altersgruppe", "") or "", "age_group": str(age_group) or "",
"target_group": raw.get("Zielgruppe", "") or "", "target_group": str(target_group) or "",
"min_participants": 1, "min_participants": 1,
"duration_minutes": int(round(duration)), "duration_minutes": int(round(duration_f)),
"capabilities": capabilities, "capabilities": capabilities,
"category": category or "", "category": category or "",
"purpose": raw.get("Ziel", "") or "", "purpose": str(purpose) or "",
"execution": raw.get("Durchführung", "") or "", "execution": str(execution) or "",
"notes": notes, "notes": str(notes) or "",
"preparation": raw.get("RefMethode", "") or "", "preparation": str(preparation) or "",
"method": raw.get("method", "") or "", "method": str(method) or "",
"equipment": equipment, "equipment": equipment,
"fullurl": fullurl or "", "fullurl": fullurl or "",
"external_id": f"mw:{raw.get('pageid')}", "external_id": f"mw:{raw.get('pageid')}",
@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
payload["fingerprint"] = compute_fingerprint(payload) payload["fingerprint"] = compute_fingerprint(payload)
return payload return payload
# ---- Lookup/Upsert ----
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
url = f"{EXERCISE_API}/by-external-id" url = f"{EXERCISE_API}/by-external-id"
@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
} }
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Print a small field diff of the hash-relevant fields (diagnostics).

    Both payloads are canonicalised the same way before comparison: title via
    ``_canon_title``, text fields via ``_norm_text``, keywords trimmed,
    de-duplicated and sorted case-insensitively. Blank keywords are dropped,
    as in ``compute_fingerprint``, so the diff does not report a keyword
    change the fingerprint ignores.

    Prints either the changed fields as JSON or a note that nothing differs.
    """
    keys = ["title", "summary", "execution", "notes", "duration_minutes", "capabilities", "keywords"]

    def _canon_fields(p: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        # One canonicalisation for both sides keeps them comparable.
        p = p or {}
        kws = {(k or "").strip() for k in (p.get("keywords") or [])}
        kws.discard("")
        return {
            "title": _canon_title(p.get("title")),
            "summary": _norm_text(p.get("summary")),
            "execution": _norm_text(p.get("execution")),
            "notes": _norm_text(p.get("notes")),
            "duration_minutes": p.get("duration_minutes"),
            "capabilities": p.get("capabilities"),
            "keywords": sorted(kws, key=str.casefold),
        }

    b_norm = _canon_fields(before)
    a_norm = _canon_fields(after)
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm[k] != a_norm[k]}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
title = payload.get("title", "<ohne Titel>") title = payload.get("title", "<ohne Titel>")
ext_id = payload.get("external_id") ext_id = payload.get("external_id")
@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
action, reason = "create", "unexpected lookup type" action, reason = "create", "unexpected lookup type"
if dry_run: if dry_run:
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) {reason}") print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}")
if action == "update": if action == "update":
_print_diff(found_payload, payload) _print_diff(found_payload, payload)
return action return action
@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
pass pass
else: else:
resp.raise_for_status() resp.raise_for_status()
print(f"[Update] '{title}' {reason} -> OK") print(f"[Update] '{title}' - {reason} -> OK")
_print_diff(found_payload, payload) _print_diff(found_payload, payload)
else: else:
print(f"[Skip] '{title}' {reason}") print(f"[Skip] '{title}' - {reason}")
return action return action
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Small field diff for the hash fields (diagnostics).

    Text fields are normalized for readability (``_canon_title`` /
    ``_norm_text``). Keywords are trimmed, de-duplicated and sorted
    case-insensitively, and blank entries are dropped the same way
    ``compute_fingerprint`` drops them. A ``None`` payload is treated as empty.
    """
    keys = ["title", "summary", "execution", "notes", "duration_minutes", "capabilities", "keywords"]
    text_norm = {
        "title": _canon_title,
        "summary": _norm_text,
        "execution": _norm_text,
        "notes": _norm_text,
    }

    def _view(payload: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        # Shared by both sides so the comparison uses identical rules.
        payload = payload or {}
        view: Dict[str, Any] = {}
        for k in keys:
            value = payload.get(k)
            if k in text_norm:
                view[k] = text_norm[k](value)
            elif k == "keywords":
                cleaned = {(kw or "").strip() for kw in (value or [])}
                cleaned.discard("")
                view[k] = sorted(cleaned, key=str.casefold)
            else:
                view[k] = value
        return view

    b_norm = _view(before)
    a_norm = _view(after)
    diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm[k] != a_norm[k]}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
# ----- Orchestrierung ----- # ----- Orchestrierung -----
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: