scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s
This commit is contained in:
parent
2567d8c786
commit
6bab3cdf04
|
|
@ -10,22 +10,18 @@ Beschreibung:
|
|||
- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
|
||||
- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
|
||||
|
||||
v2.3.3 – Änderungen ggü. 2.3.2:
|
||||
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung):
|
||||
• Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich
|
||||
• summary/execution/notes: Whitespace kollabieren
|
||||
• keywords: dedupliziert (case-insensitiv) & sortiert
|
||||
• duration_minutes: sicher als int
|
||||
- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
|
||||
(verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
|
||||
- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
|
||||
- Kein API-/CLI-Bruch
|
||||
v2.3.4 – Änderungen ggü. 2.3.3:
|
||||
- **Robuste Template-Erkennung**: Namen werden unicode-normalisiert & diakritik-insensitiv verglichen
|
||||
(z. B. "ÜbungInfoBox" == "UebungInfoBox" == "uebunginfobox").
|
||||
- **Feld-Synonyme & Key-Normalisierung**: "summary/execution/duration/keywords/..." werden über
|
||||
mehrere mögliche Parameternamen aufgelöst (z. B. Durchführung/Durchfuehrung/Ablauf).
|
||||
- Ziel: Verhindert leere Felder beim 2. Lauf und damit fälschliche Updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from typing import Dict, Any, Tuple, Optional
|
||||
from typing import Dict, Any, Tuple, Optional, List
|
||||
from collections.abc import Mapping
|
||||
import requests
|
||||
import mwparserfromhell
|
||||
|
|
@ -33,6 +29,7 @@ from dotenv import load_dotenv
|
|||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
# ----- Konfiguration / Defaults -----
|
||||
load_dotenv() # .env laden, falls vorhanden
|
||||
|
|
@ -43,7 +40,41 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
|
|||
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
||||
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
|
||||
|
||||
# ---- Hilfsfunktionen für Wiki-Router ----
|
||||
# ---- Unicode-/Key-Normalisierung ----
|
||||
|
||||
def _norm_unicode(s: str) -> str:
|
||||
return unicodedata.normalize("NFKC", s)
|
||||
|
||||
def _strip_diacritics(s: str) -> str:
|
||||
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
|
||||
|
||||
def _norm_key(s: str) -> str:
|
||||
s = _norm_unicode(s or "")
|
||||
s = _strip_diacritics(s)
|
||||
s = s.strip().casefold()
|
||||
return s
|
||||
|
||||
# Template aliases (as _norm_key()-normalized names). Two spellings are kept
# per template because "Ü" normalizes to "u" (diacritics stripped) while
# authors may also write the transliterated "Ue".
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung"}
TPL_SKILLDEV = {"skilldevelopment"}

# Synonyms for template parameters (normalized keys).
# List order encodes lookup priority: the first matching, non-empty value wins.
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "keywords", "tags"]
KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "material"]
KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline"]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
|
||||
|
||||
# ---- Hilfsfunktionen ----
|
||||
|
||||
def wiki_health() -> None:
|
||||
r = requests.get(f"{API_BASE_URL}/health", timeout=15)
|
||||
|
|
@ -91,30 +122,36 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
|||
wikicode = mwparserfromhell.parse(wikitext)
|
||||
|
||||
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
|
||||
|
||||
# Templates sammeln (robust gegen Varianten)
|
||||
for tpl in wikicode.filter_templates():
|
||||
name = str(tpl.name).strip()
|
||||
if name == "ÜbungInfoBox":
|
||||
name_raw = str(tpl.name)
|
||||
name_norm = _norm_key(name_raw)
|
||||
if name_norm in TPL_UEBUNG_INFOBOX:
|
||||
for p in tpl.params:
|
||||
raw[str(p.name).strip()] = str(p.value).strip()
|
||||
elif name == "Übungsbeschreibung":
|
||||
elif name_norm in TPL_UEBUNGSBESCHREIBUNG:
|
||||
for p in tpl.params:
|
||||
raw[str(p.name).strip()] = str(p.value).strip()
|
||||
elif name == "SkillDevelopment":
|
||||
elif name_norm in TPL_SKILLDEV:
|
||||
raw.setdefault("capabilities", [])
|
||||
# Standard-Keys (engl. Template)
|
||||
def _getp(t, k):
|
||||
try:
|
||||
return str(t.get(k).value).strip()
|
||||
except Exception:
|
||||
return ""
|
||||
cap = _getp(tpl, "PrimaryCapability")
|
||||
lvl = _getp(tpl, "CapabilityLevel")
|
||||
try:
|
||||
cap = str(tpl.get("PrimaryCapability").value).strip()
|
||||
lvl_i = int(lvl)
|
||||
except Exception:
|
||||
cap = ""
|
||||
try:
|
||||
lvl = int(str(tpl.get("CapabilityLevel").value).strip())
|
||||
except Exception:
|
||||
lvl = 0
|
||||
lvl_i = 0
|
||||
if cap:
|
||||
raw["capabilities"].append({"capability": cap, "level": lvl})
|
||||
raw["capabilities"].append({"capability": cap, "level": lvl_i})
|
||||
raw["wikitext"] = wikitext
|
||||
return raw
|
||||
|
||||
|
||||
# ---- Fingerprint-Unterstützung (stabil) ----
|
||||
|
||||
def _normalize(v: Any) -> str:
|
||||
|
|
@ -138,22 +175,17 @@ def _norm_text(s: str) -> str:
|
|||
|
||||
def _canon_title(t: str) -> str:
|
||||
t = (t or "").strip().replace("_", " ")
|
||||
# Gedankenstriche vereinheitlichen
|
||||
return t.replace("–", "-").replace("—", "-")
|
||||
|
||||
|
||||
def compute_fingerprint(payload: Dict[str, Any]) -> str:
|
||||
# keywords stabilisieren: trim, dedupe (case-insensitiv), sort
|
||||
kws = payload.get("keywords") or []
|
||||
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
|
||||
|
||||
# dauer als int
|
||||
dur = payload.get("duration_minutes") or 0
|
||||
try:
|
||||
dur = int(round(float(dur)))
|
||||
except Exception:
|
||||
dur = 0
|
||||
|
||||
fields = [
|
||||
_canon_title(payload.get("title", "")),
|
||||
_norm_text(payload.get("summary", "")),
|
||||
|
|
@ -166,11 +198,25 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
|
|||
base = "|".join(_normalize(f) for f in fields)
|
||||
return hashlib.sha256(base.encode("utf-8")).hexdigest()
|
||||
|
||||
# ---- Feldauflösung (Synonyme) ----
|
||||
|
||||
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return *d* re-keyed through _norm_key(); non-string keys are dropped."""
    return {_norm_key(k): v for k, v in d.items() if isinstance(k, str)}


def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
    """Return the first non-empty value in *d* matching any candidate key.

    Both the keys of *d* and the candidate names are run through _norm_key(),
    so the lookup is case- and diacritic-insensitive.

    Returns None when no candidate yields a value other than None or "".
    """
    m = _norm_keymap(d)
    for c in candidates:
        # Fix: candidates must be normalized as well. _norm_keymap() strips
        # diacritics (e.g. "Durchführung" -> "durchfuhrung"), so raw candidate
        # strings like "durchführung" or "durchfuehrung" could never match and
        # umlaut-named wiki parameters silently resolved to None.
        v = m.get(_norm_key(c))
        if v not in (None, ""):
            return v
    return None
|
||||
|
||||
|
||||
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
|
||||
# Exercise.capabilities erwartet Dict[str,int]
|
||||
caps_list = raw.get("capabilities", [])
|
||||
capabilities = {}
|
||||
capabilities: Dict[str, int] = {}
|
||||
for c in caps_list:
|
||||
cap = c.get("capability")
|
||||
lvl = c.get("level")
|
||||
|
|
@ -180,47 +226,63 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# Defaults/Fallbacks
|
||||
try:
|
||||
duration = float(raw.get("Dauer", 0) or 0)
|
||||
except Exception:
|
||||
duration = 0.0
|
||||
# Defaults/Fallbacks via Synonyme
|
||||
# summary / execution
|
||||
summary = _get_first(raw, KEYS_SUMMARY) or ""
|
||||
execution = _get_first(raw, KEYS_EXECUTION) or ""
|
||||
|
||||
keywords = []
|
||||
kw_raw = raw.get("Schlüsselworte", "")
|
||||
# duration
|
||||
duration = _get_first(raw, KEYS_DURATION)
|
||||
try:
|
||||
duration_f = float(duration or 0)
|
||||
except Exception:
|
||||
duration_f = 0.0
|
||||
|
||||
# keywords
|
||||
kw_raw = _get_first(raw, KEYS_KEYWORDS)
|
||||
keywords: List[str] = []
|
||||
if isinstance(kw_raw, str):
|
||||
keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]
|
||||
|
||||
equipment = []
|
||||
eq_raw = raw.get("equipment", [])
|
||||
# equipment
|
||||
eq_raw = _get_first(raw, KEYS_EQUIPMENT)
|
||||
equipment: List[str] = []
|
||||
if isinstance(eq_raw, str):
|
||||
equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
|
||||
elif isinstance(eq_raw, list):
|
||||
equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
|
||||
|
||||
notes = raw.get("Hinweise", "") or ""
|
||||
notes = _get_first(raw, KEYS_NOTES) or ""
|
||||
if mutate:
|
||||
notes = (notes + " [auto-update]").strip()
|
||||
notes = (str(notes) + " [auto-update]").strip()
|
||||
|
||||
discipline = _get_first(raw, KEYS_DISCIPLINE) or ""
|
||||
group = _get_first(raw, KEYS_GROUP) or None
|
||||
age_group = _get_first(raw, KEYS_AGE_GROUP) or ""
|
||||
target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
|
||||
purpose = _get_first(raw, KEYS_PURPOSE) or ""
|
||||
preparation = _get_first(raw, KEYS_PREPARATION) or ""
|
||||
method = _get_first(raw, KEYS_METHOD) or ""
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"title": raw.get("title") or "",
|
||||
"summary": raw.get("Summary", "") or "",
|
||||
"short_description": raw.get("Summary", "") or "",
|
||||
"summary": str(summary) or "",
|
||||
"short_description": str(summary) or "",
|
||||
"keywords": keywords,
|
||||
"link": fullurl or "",
|
||||
"discipline": raw.get("Übungstyp", "") or "",
|
||||
"group": raw.get("Gruppengröße", "") or None,
|
||||
"age_group": raw.get("Altersgruppe", "") or "",
|
||||
"target_group": raw.get("Zielgruppe", "") or "",
|
||||
"discipline": str(discipline) or "",
|
||||
"group": str(group) if group else None,
|
||||
"age_group": str(age_group) or "",
|
||||
"target_group": str(target_group) or "",
|
||||
"min_participants": 1,
|
||||
"duration_minutes": int(round(duration)),
|
||||
"duration_minutes": int(round(duration_f)),
|
||||
"capabilities": capabilities,
|
||||
"category": category or "",
|
||||
"purpose": raw.get("Ziel", "") or "",
|
||||
"execution": raw.get("Durchführung", "") or "",
|
||||
"notes": notes,
|
||||
"preparation": raw.get("RefMethode", "") or "",
|
||||
"method": raw.get("method", "") or "",
|
||||
"purpose": str(purpose) or "",
|
||||
"execution": str(execution) or "",
|
||||
"notes": str(notes) or "",
|
||||
"preparation": str(preparation) or "",
|
||||
"method": str(method) or "",
|
||||
"equipment": equipment,
|
||||
"fullurl": fullurl or "",
|
||||
"external_id": f"mw:{raw.get('pageid')}",
|
||||
|
|
@ -229,6 +291,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
|
|||
payload["fingerprint"] = compute_fingerprint(payload)
|
||||
return payload
|
||||
|
||||
# ---- Lookup/Upsert ----
|
||||
|
||||
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
|
||||
url = f"{EXERCISE_API}/by-external-id"
|
||||
|
|
@ -256,6 +319,37 @@ def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
|
|||
}
|
||||
|
||||
|
||||
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Print a normalized per-field diff of the hash-relevant fields.

    Text fields are normalized the same way the fingerprint does, so the
    diff reflects what actually drove an update decision. Output goes to
    stdout as a single JSON line (or a "(none)" marker).
    """
    def _sorted_keywords(values):
        # Trim, dedupe (case-insensitively via the set) and sort keywords.
        return sorted({(k or "").strip() for k in (values or [])}, key=str.casefold)

    # Per-field normalizers; identity for the non-text fields.
    normalizers = {
        "title": _canon_title,
        "summary": _norm_text,
        "execution": _norm_text,
        "notes": _norm_text,
        "duration_minutes": lambda v: v,
        "capabilities": lambda v: v,
        "keywords": _sorted_keywords,
    }
    b_norm = {field: fn(before.get(field)) for field, fn in normalizers.items()}
    a_norm = {field: fn(after.get(field)) for field, fn in normalizers.items()}
    diff = {
        field: (b_norm[field], a_norm[field])
        for field in normalizers
        if b_norm.get(field) != a_norm.get(field)
    }
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
|
||||
|
||||
|
||||
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
||||
title = payload.get("title", "<ohne Titel>")
|
||||
ext_id = payload.get("external_id")
|
||||
|
|
@ -280,7 +374,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
|||
action, reason = "create", "unexpected lookup type"
|
||||
|
||||
if dry_run:
|
||||
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}")
|
||||
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) - {reason}")
|
||||
if action == "update":
|
||||
_print_diff(found_payload, payload)
|
||||
return action
|
||||
|
|
@ -308,44 +402,12 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
|||
pass
|
||||
else:
|
||||
resp.raise_for_status()
|
||||
print(f"[Update] '{title}' – {reason} -> OK")
|
||||
print(f"[Update] '{title}' - {reason} -> OK")
|
||||
_print_diff(found_payload, payload)
|
||||
else:
|
||||
print(f"[Skip] '{title}' – {reason}")
|
||||
print(f"[Skip] '{title}' - {reason}")
|
||||
return action
|
||||
|
||||
|
||||
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
|
||||
"""Kleines Feld-Diff für die Hash-Felder (Diagnose)."""
|
||||
keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"]
|
||||
b = {k: before.get(k) for k in keys}
|
||||
a = {k: after.get(k) for k in keys}
|
||||
# für bessere Lesbarkeit normalisieren wir die Textfelder
|
||||
b_norm = {
|
||||
"title": _canon_title(b.get("title")),
|
||||
"summary": _norm_text(b.get("summary")),
|
||||
"execution": _norm_text(b.get("execution")),
|
||||
"notes": _norm_text(b.get("notes")),
|
||||
"duration_minutes": b.get("duration_minutes"),
|
||||
"capabilities": b.get("capabilities"),
|
||||
"keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold),
|
||||
}
|
||||
a_norm = {
|
||||
"title": _canon_title(a.get("title")),
|
||||
"summary": _norm_text(a.get("summary")),
|
||||
"execution": _norm_text(a.get("execution")),
|
||||
"notes": _norm_text(a.get("notes")),
|
||||
"duration_minutes": a.get("duration_minutes"),
|
||||
"capabilities": a.get("capabilities"),
|
||||
"keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold),
|
||||
}
|
||||
diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)}
|
||||
if diff:
|
||||
print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
|
||||
else:
|
||||
print("[Diff] (none in hash fields)")
|
||||
|
||||
|
||||
# ----- Orchestrierung -----
|
||||
|
||||
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user