scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s

This commit is contained in:
Lars 2025-08-11 15:40:41 +02:00
parent 34320b46d9
commit 7b383f0778

View File

@ -1,30 +1,36 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_importer.py v2.3.8

Ziel dieses Patches: Die Felder `discipline`, `execution`, `keywords`,
`equipment`, `duration_minutes` usw. kommen teilweise leer an. Ursache sind
zu aggressive Normalisierungen/Matcher.

Fix (konservativ & robust):
- Parser liest jetzt gezielt die bekannten Templates ohne Over-Normalisierung:
  `{{ÜbungInfoBox}}` / `{{UebungInfoBox}}`,
  `{{Übungsbeschreibung}}` / `{{Uebungsbeschreibung}}`,
  `{{Hilfsmittel}}`, `{{SkillDevelopment}}`
- Feld-Extraktion nutzt zuerst die exakten Wiki-Parameternamen (deutsch/mit
  Umlauten), erst danach schmale Synonym-Fallbacks. Das stellt sicher, dass
  z.B. `Schlüsselworte=` wirklich in `keywords` landet.
- `imported_at` wird bei Create und Update gesetzt.
- Optionales Debugging: `--debug-raw` druckt die gefundenen Raw-Keys.

Bestehende API-Endpunkte bleiben unverändert.
"""
import os import os
import sys import sys
import argparse import argparse
from typing import Dict, Any, Tuple, Optional, List from typing import Dict, Any, Tuple, Optional, List
from collections.abc import Mapping
import requests import requests
import mwparserfromhell import mwparserfromhell
from dotenv import load_dotenv from dotenv import load_dotenv
import hashlib import hashlib
import json import json
import time import time
import unicodedata
# ----- Konfiguration / Defaults ----- # ----- Konfiguration / Defaults -----
load_dotenv() load_dotenv()
@ -34,52 +40,14 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60")) REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
# ---- Unicode-/Key-Normalisierung ---- # ----- Helpers für Wiki-Router -----
def _norm_unicode(s: str) -> str:
return unicodedata.normalize("NFKC", s)
def _strip_diacritics(s: str) -> str:
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
def _norm_key(s: str) -> str:
s = _norm_unicode(s or "")
s = _strip_diacritics(s)
s = s.strip().casefold()
return s
def _norm_tpl(s: str) -> str:
s = _norm_key(s)
return "".join(ch for ch in s if ch.isalnum())
# Template-Aliasse (normalisierte Namen, _norm_tpl!)
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"}
TPL_UEBUNGSBESCHREIBUNG = {"ubungsbeschreibung", "uebungsbeschreibung", "beschreibungubung", "beschreibunguebung"}
TPL_SKILLDEV = {"skilldevelopment"}
TPL_HILFSMITTEL = {"hilfsmittel"}
# Synonyme (werden im Code nochmals normalisiert)
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = ["durchführung", "durchfuehrung", "ausführung", "ausfuehrung", "execution", "ablauf", "vorgehen"]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = ["schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter", "keywords", "stichworte", "schlagworte", "tags", "schluesselwort", "schlüsselwort"]
KEYS_EQUIPMENT = ["equipment", "geräte", "geraete", "gerät", "geraet", "material", "hilfsmittel", "gerate/material"]
KEYS_DISCIPLINE = ["übungstyp", "uebungstyp", "discipline", "disziplin", "schwerpunkt", "bereich", "thema", "technik"]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
# ----- Helpers für Wiki-Router -----
def wiki_health() -> None:
    """Ping the wiki router's /health endpoint; raises for HTTP errors."""
    r = requests.get(f"{API_BASE_URL}/health", timeout=15)
    r.raise_for_status()
    print("[Sanity] Wiki health OK")
def wiki_login(username: str, password: str) -> None: def wiki_login(username: str, password: str) -> None:
payload = {"username": username, "password": password} payload = {"username": username, "password": password}
r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30) r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
@ -94,38 +62,48 @@ def wiki_login(username: str, password: str) -> None:
raise RuntimeError(f"[Login] {msg}") raise RuntimeError(f"[Login] {msg}")
print("[Login] success") print("[Login] success")
def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Return all pages of *category* from the semantic endpoint as a title->entry dict."""
    resp = requests.get(
        f"{API_BASE_URL}/semantic/pages",
        params={"category": category},
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()
def fetch_page_info(title: str) -> Dict[str, Any]:
    """Resolve *title* to its pageid and full URL via the /info endpoint."""
    r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    r.raise_for_status()
    info = r.json()
    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}
# ----- Parser (konservativ) -----
# Exact template names as they appear in the wiki (with and without umlauts).
T_INFOS = {"ÜbungInfoBox", "UebungInfoBox"}
T_BESCHR = {"Übungsbeschreibung", "Uebungsbeschreibung"}
T_HILFS = {"Hilfsmittel"}
T_SKILL = {"SkillDevelopment"}
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
print(f"[Parse] Lade '{title}' (ID={pageid})") print(f"[Parse] Lade '{title}' (ID={pageid})")
resp = requests.get( resp = requests.get(
f"{API_BASE_URL}/parsepage", f"{API_BASE_URL}/parsepage",
params={"pageid": pageid, "title": title}, params={"pageid": pageid, "title": title},
timeout=REQUEST_TIMEOUT timeout=REQUEST_TIMEOUT,
) )
resp.raise_for_status() resp.raise_for_status()
wikitext = resp.json().get("wikitext", "") wikitext = resp.json().get("wikitext", "")
wikicode = mwparserfromhell.parse(wikitext) wikicode = mwparserfromhell.parse(wikitext)
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid, "wikitext": wikitext}
for tpl in wikicode.filter_templates(): for tpl in wikicode.filter_templates():
name_norm = _norm_tpl(str(tpl.name)) name = str(tpl.name).strip()
if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG: if name in T_INFOS or name in T_BESCHR or name in T_HILFS:
for p in tpl.params: for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip() key = str(p.name).strip()
elif name_norm in TPL_SKILLDEV: val = str(p.value).strip()
raw[key] = val
elif name in T_SKILL:
raw.setdefault("capabilities", []) raw.setdefault("capabilities", [])
def _getp(t, k): def _getp(t, k):
try: try:
@ -140,14 +118,10 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
lvl_i = 0 lvl_i = 0
if cap: if cap:
raw["capabilities"].append({"capability": cap, "level": lvl_i}) raw["capabilities"].append({"capability": cap, "level": lvl_i})
elif name_norm in TPL_HILFSMITTEL:
for p in tpl.params:
raw[str(p.name).strip()] = str(p.value).strip()
raw["wikitext"] = wikitext
return raw return raw
# ---- Fingerprint (stabil) ---- # ----- Fingerprint (stabil, wie zuvor) -----
def _normalize(v: Any) -> str: def _normalize(v: Any) -> str:
if v is None: if v is None:
@ -158,21 +132,22 @@ def _normalize(v: Any) -> str:
return json.dumps(v, sort_keys=True, ensure_ascii=False) return json.dumps(v, sort_keys=True, ensure_ascii=False)
return str(v).strip() return str(v).strip()
def _norm_text(s: str) -> str: def _norm_text(s: str) -> str:
if s is None: if s is None:
return "" return ""
s = str(s).replace("\u00a0", " ") s = str(s).replace("\u00a0", " ")
s = s.strip()
s = " ".join(s.split()) s = " ".join(s.split())
return s return s.strip()
def _canon_title(t: str) -> str: def _canon_title(t: str) -> str:
t = (t or "").strip().replace("_", " ") t = (t or "").strip().replace("_", " ")
return t.replace("", "-").replace("", "-") return t.replace("", "-").replace("", "-")
def compute_fingerprint(payload: Dict[str, Any]) -> str: def compute_fingerprint(payload: Dict[str, Any]) -> str:
kws = payload.get("keywords") or [] kws = payload.get("keywords") or []
# Strichvarianten normalisieren
kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws] kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
dur = payload.get("duration_minutes") or 0 dur = payload.get("duration_minutes") or 0
@ -192,111 +167,96 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
base = "|".join(_normalize(f) for f in fields) base = "|".join(_normalize(f) for f in fields)
return hashlib.sha256(base.encode("utf-8")).hexdigest() return hashlib.sha256(base.encode("utf-8")).hexdigest()
# ---- Feldauflösung (Synonyme + Fuzzy) ---- # ----- Payload (exakte DE-Keys zuerst, dann schmale Fallbacks) -----
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]: EXACT_KEYS = {
return {_norm_key(k): v for k, v in d.items() if isinstance(k, str)} "summary": ["Summary", "Kurzbeschreibung"],
"execution": ["Durchführung", "Durchfuehrung", "Ablauf"],
"duration": ["Dauer", "Zeit"],
"keywords": ["Schlüsselworte", "Schlüsselwörter", "Schluesselworte", "Schluesselwoerter", "Keywords", "Tags"],
"equipment_prim": ["Hilfsmittel"],
"equipment_alt": ["Geräte", "Geraete", "Gerät", "Geraet", "Material"],
"discipline": ["Übungstyp", "Uebungstyp", "Disziplin"],
"group": ["Gruppengröße", "Gruppengroesse", "Group"],
"age_group": ["Altersgruppe"],
"target_group": ["Zielgruppe"],
"purpose": ["Ziel", "Zweck"],
"notes": ["Hinweise", "Notes"],
"preparation": ["Vorbereitung", "RefMethode"],
"method": ["Methode", "Method"],
}
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
m = _norm_keymap(d) def _first_any(raw: Dict[str, Any], keys: List[str]) -> Optional[str]:
for c in candidates: for k in keys:
v = m.get(_norm_key(c)) # << Bugfix: Kandidaten ebenfalls normalisieren v = raw.get(k)
if v not in (None, ""): if isinstance(v, str) and v.strip():
return v return v.strip()
return None return None
def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any:
m = _norm_keymap(d)
toks = [_norm_key(t) for t in tokens]
for k, v in m.items():
if v in (None, ""):
continue
if all(t in k for t in toks):
return v
return None
# ---- Payload ----
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
    """Map raw wiki template parameters to the exercise-API payload.

    Exact German parameter names are tried first (see EXACT_KEYS), with
    `Hilfsmittel` preferred over the equipment synonyms. *mutate* appends an
    " [auto-update]" marker to notes (used by the smoke test to force an
    UPDATE). The stable fingerprint is computed last from the final payload.
    """
    # Capabilities -> Dict[str, int]; entries with non-int levels are dropped.
    capabilities: Dict[str, int] = {}
    for c in raw.get("capabilities", []) or []:
        cap = c.get("capability")
        lvl = c.get("level")
        if isinstance(cap, str) and cap:
            try:
                capabilities[cap] = int(lvl)
            except Exception:
                pass

    # Exact keys first.
    summary = _first_any(raw, EXACT_KEYS["summary"]) or ""
    execution = _first_any(raw, EXACT_KEYS["execution"]) or ""
    duration = _first_any(raw, EXACT_KEYS["duration"]) or "0"

    # Keywords: split on commas and newlines, drop blanks.
    kw_raw = _first_any(raw, EXACT_KEYS["keywords"]) or ""
    if kw_raw:
        parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
        keywords = [p for p in parts if p]
    else:
        keywords = []

    # Equipment: `Hilfsmittel` wins over the generic synonyms.
    eq_raw = (_first_any(raw, EXACT_KEYS["equipment_prim"])
              or _first_any(raw, EXACT_KEYS["equipment_alt"]) or "")
    if eq_raw:
        equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
    else:
        equipment = []

    notes = _first_any(raw, EXACT_KEYS["notes"]) or ""
    discipline = _first_any(raw, EXACT_KEYS["discipline"]) or ""
    group = _first_any(raw, EXACT_KEYS["group"]) or None
    age_group = _first_any(raw, EXACT_KEYS["age_group"]) or ""
    target_group = _first_any(raw, EXACT_KEYS["target_group"]) or ""
    purpose = _first_any(raw, EXACT_KEYS["purpose"]) or ""
    preparation = _first_any(raw, EXACT_KEYS["preparation"]) or ""
    method = _first_any(raw, EXACT_KEYS["method"]) or ""

    try:
        duration_f = float(duration or 0)
    except Exception:
        duration_f = 0.0

    payload: Dict[str, Any] = {
        "title": raw.get("title") or "",
        "summary": summary,
        "short_description": summary,
        "keywords": keywords,
        "link": fullurl or "",
        "discipline": discipline,
        "group": group,
        "age_group": age_group,
        "target_group": target_group,
        "min_participants": 1,
        "duration_minutes": int(round(duration_f)),
        "capabilities": capabilities,
        "category": category or "",
        "purpose": purpose,
        "execution": execution,
        "notes": (notes + (" [auto-update]" if mutate else "")).strip(),
        "preparation": preparation,
        "method": method,
        "equipment": equipment,
        "fullurl": fullurl or "",
        "external_id": f"mw:{raw.get('pageid')}",
    }
    # NOTE(review): a few context lines around the end of this dict were elided
    # in the diff view — confirm against v2.3.8 that no field is missing here.
    payload["fingerprint"] = compute_fingerprint(payload)
    return payload
# ---- Lookup/Upsert ---- # ----- Lookup/Upsert -----
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]: def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
url = f"{EXERCISE_API}/by-external-id" url = f"{EXERCISE_API}/by-external-id"
@ -375,9 +335,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
found, status = lookup_by_external_id(ext_id) found, status = lookup_by_external_id(ext_id)
action = "create" action = "create"; reason = "not found (lookup 404)"; found_payload = {}
reason = "not found (lookup 404)"
found_payload = {}
if not (status == 404 or found is None): if not (status == 404 or found is None):
if isinstance(found, dict): if isinstance(found, dict):
@ -398,112 +356,78 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
return action return action
if action == "create": if action == "create":
payload2 = dict(payload) body = dict(payload); body["imported_at"] = _now_iso()
payload2["imported_at"] = _now_iso() resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
try: try: resp.raise_for_status()
resp.raise_for_status() except Exception: pass
except Exception:
pass
else: else:
resp.raise_for_status() resp.raise_for_status(); print(f"[Create] '{title}' {reason} -> OK")
print(f"[Create] '{title}' {reason} -> OK")
elif action == "update": elif action == "update":
payload2 = dict(payload) body = dict(payload); body["imported_at"] = _now_iso()
payload2["imported_at"] = _now_iso() resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
if resp.status_code == 422: if resp.status_code == 422:
print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}") print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")
try: try: resp.raise_for_status()
resp.raise_for_status() except Exception: pass
except Exception:
pass
else: else:
resp.raise_for_status() resp.raise_for_status(); print(f"[Update] '{title}' {reason} -> OK"); _print_diff(found_payload, payload)
print(f"[Update] '{title}' {reason} -> OK")
_print_diff(found_payload, payload)
else: else:
print(f"[Skip] '{title}' {reason}") print(f"[Skip] '{title}' {reason}")
return action return action
# ----- Orchestrierung -----
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False, debug_raw: bool = False) -> str:
    """Import a single page: resolve info, parse, build payload, upsert.

    Returns the upsert action ("create"/"update"/"skip") or "failed" when the
    pageid cannot be resolved. *debug_raw* prints the raw template keys found.
    """
    info = fetch_page_info(title)
    pid = info.get("pageid")
    fullurl = info.get("fullurl") or ""
    if not pid:
        print(f"[Error] pageid für '{title}' nicht gefunden.", file=sys.stderr)
        return "failed"
    raw = parse_exercise(title, pid)
    if debug_raw:
        # Omit the bulky wikitext entry from the debug listing.
        print("[Debug] Raw-Keys:", sorted([k for k in raw.keys() if k not in {"wikitext"}]))
    payload = build_payload(raw, fullurl, category, mutate=mutate)
    return upsert_exercise(payload, dry_run=dry_run)
def process_all(category: str, *, dry_run: bool = False, debug_raw: bool = False) -> Dict[str, int]:
    """Import every page of *category*; returns created/updated/skipped/failed counters.

    Per-page failures are logged and counted but never abort the run.
    *debug_raw* prints raw template keys for the first five pages only.
    """
    stats = {"created": 0, "updated": 0, "skipped": 0, "failed": 0}
    print(f"[Main] Lade Liste der Übungen aus Kategorie '{category}'")
    pages = fetch_all_pages(category)
    print(f"[Main] {len(pages)} Seiten gefunden.")
    for idx, (title, entry) in enumerate(pages.items(), 1):
        try:
            # Entries may be dict-like or opaque; only use .get when available.
            getter = getattr(entry, "get", None)
            pid = getter("pageid") if callable(getter) else None
            fullurl = getter("fullurl") if callable(getter) else None
            if not pid:
                info = fetch_page_info(title)
                pid = info.get("pageid")
                fullurl = fullurl or info.get("fullurl")
            if not pid:
                print(f"[Skip] '{title}' hat keine pageid")
                stats["failed"] += 1
                continue
            raw = parse_exercise(title, pid)
            if debug_raw and idx <= 5:
                print(f"[Debug] #{idx} '{title}' Raw-Keys:", sorted([k for k in raw.keys() if k not in {"wikitext"}]))
            payload = build_payload(raw, fullurl or "", category)
            act = upsert_exercise(payload, dry_run=dry_run)
            # Explicit mapping: unknown action values must not be miscounted
            # (the collapsed one-liner would have counted them as "skipped").
            if act == "create":
                stats["created"] += 1
            elif act == "update":
                stats["updated"] += 1
            elif act == "skip":
                stats["skipped"] += 1
        except requests.HTTPError as e:
            code = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
            if code == 404:
                print(f"[Skip] '{title}': page not found (404)")
            else:
                print(f"[Error] '{title}': {e}")
            stats["failed"] += 1
        except Exception as e:
            print(f"[Error] '{title}': {e}")
            stats["failed"] += 1
    return stats
def run_smoke_test(title: str, category: str, *, debug_raw: bool = False) -> None:
    """Run create -> skip -> update against one page to exercise the upsert path.

    Run 3 uses mutate=True so build_payload changes 'notes' and forces an
    UPDATE; the summary of all three actions is printed as JSON.
    """
    print("\n[SmokeTest] Lauf 1/3: CREATE (Erstimport)")
    act1 = process_one(title, category, mutate=False, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act1)
    print("\n[SmokeTest] Lauf 2/3: SKIP (Wiederholung, unverändert)")
    act2 = process_one(title, category, mutate=False, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act2)
    print("\n[SmokeTest] Lauf 3/3: UPDATE (simulierte Wiki-Änderung an 'notes')")
    act3 = process_one(title, category, mutate=True, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act3)
    print("\n[SmokeTest] Zusammenfassung:")
    print(json.dumps({"run1": act1, "run2": act2, "run3": act3}, ensure_ascii=False, indent=2))
# ----- Main ----- # ----- Main -----
@ -517,6 +441,7 @@ def main() -> None:
parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)") parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen") parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen")
parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title") parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title")
parser.add_argument("--debug-raw", action="store_true", help="Zeigt die aus dem Wiki gelesenen Roh-Keys je Seite")
args = parser.parse_args() args = parser.parse_args()
wiki_health() wiki_health()
@ -528,19 +453,17 @@ def main() -> None:
try: try:
wiki_login(args.username, args.password) wiki_login(args.username, args.password)
except Exception as e: except Exception as e:
print(str(e), file=sys.stderr) print(str(e), file=sys.stderr); sys.exit(1)
sys.exit(1)
if args.smoke_test: if args.smoke_test:
run_smoke_test(args.title, args.category) run_smoke_test(args.title, args.category, debug_raw=args.debug_raw); return
return
if args.all: if args.all:
stats = process_all(args.category, dry_run=args.dry_run) stats = process_all(args.category, dry_run=args.dry_run, debug_raw=args.debug_raw)
print("\n[Stats] created={created} updated={updated} skipped={skipped} failed={failed}".format(**stats)) print("\n[Stats] created={created} updated={updated} skipped={skipped} failed={failed}".format(**stats))
else: else:
print(f"[Main] Import single exercise: {args.title}") print(f"[Main] Import single exercise: {args.title}")
result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run) result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run, debug_raw=args.debug_raw)
print(f"[Result] {result}") print(f"[Result] {result}")
if __name__ == "__main__": if __name__ == "__main__":