scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 1s
This commit is contained in:
parent
34320b46d9
commit
7b383f0778
|
|
@ -1,30 +1,36 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
wiki_importer.py – v2.3.7
|
wiki_importer.py – v2.3.8
|
||||||
|
|
||||||
Fixes ggü. v2.3.6:
|
Ziel dieses Patches: Die Felder `discipline`, `execution`, `keywords`, `equipment`, `duration_minutes` usw.
|
||||||
- **Keywords/EQUIPMENT/DISCIPLINE u.a. wurden teils nicht erkannt**: Bugfix in `_get_first()` –
|
kommen bei dir teilweise leer an. Ursache sind zu aggressive Normalisierungen/Matcher.
|
||||||
Kandidatenschlüssel werden jetzt ebenfalls normalisiert (`_norm_key(c)`), damit
|
|
||||||
`Schlüsselworte` (aus dem Wiki) zuverlässig matcht.
|
|
||||||
- `_get_first_fuzzy()` normalisiert die Such-Tokens.
|
|
||||||
- Kleine Bugfixes/Polish: `action.upper()` im Dry-Run, sanftere Keywords-Splittung.
|
|
||||||
|
|
||||||
Hinweis: Keine API-/CLI-Änderungen. Parser unterstützt weiterhin `{{Hilfsmittel}}`.
|
Fix (konservativ & robust):
|
||||||
|
- Parser liest jetzt **gezielt** die bekannten Templates **ohne** Over-Normalisierung:
|
||||||
|
• `{{ÜbungInfoBox}}` / `{{UebungInfoBox}}`
|
||||||
|
• `{{Übungsbeschreibung}}` / `{{Uebungsbeschreibung}}`
|
||||||
|
• `{{Hilfsmittel}}`
|
||||||
|
• `{{SkillDevelopment}}`
|
||||||
|
- Feld-Extraktion nutzt **zuerst die exakten Wiki-Parameternamen** (deutsch/mit Umlauten),
|
||||||
|
erst danach schmale Synonym-Fallbacks. Das stellt sicher, dass z. B. `Schlüsselworte=`
|
||||||
|
wirklich in `keywords` landet.
|
||||||
|
- `imported_at` wird bei **Create und Update** gesetzt.
|
||||||
|
- Optionales Debugging: `--debug-raw` druckt die gefundenen Raw-Keys (einfach, nachvollziehbar).
|
||||||
|
|
||||||
|
Bestehende API-Endpunkte bleiben unverändert.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
from typing import Dict, Any, Tuple, Optional, List
|
from typing import Dict, Any, Tuple, Optional, List
|
||||||
from collections.abc import Mapping
|
|
||||||
import requests
|
import requests
|
||||||
import mwparserfromhell
|
import mwparserfromhell
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import unicodedata
|
|
||||||
|
|
||||||
# ----- Konfiguration / Defaults -----
|
# ----- Konfiguration / Defaults -----
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
@ -34,52 +40,14 @@ DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
|
||||||
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
||||||
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
|
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
|
||||||
|
|
||||||
# ---- Unicode-/Key-Normalisierung ----
|
# ----- Helpers für Wiki-Router -----
|
||||||
|
|
||||||
def _norm_unicode(s: str) -> str:
|
|
||||||
return unicodedata.normalize("NFKC", s)
|
|
||||||
|
|
||||||
def _strip_diacritics(s: str) -> str:
|
|
||||||
return "".join(ch for ch in unicodedata.normalize("NFD", s) if not unicodedata.combining(ch))
|
|
||||||
|
|
||||||
def _norm_key(s: str) -> str:
|
|
||||||
s = _norm_unicode(s or "")
|
|
||||||
s = _strip_diacritics(s)
|
|
||||||
s = s.strip().casefold()
|
|
||||||
return s
|
|
||||||
|
|
||||||
def _norm_tpl(s: str) -> str:
    """Canonical template-name form: `_norm_key` output, alphanumerics only."""
    normalized = _norm_key(s)
    return "".join(filter(str.isalnum, normalized))
|
|
||||||
|
|
||||||
# Template-name aliases, stored pre-normalized (compare against the output
# of _norm_tpl(): lowercase, diacritics stripped, alphanumeric only).
TPL_UEBUNG_INFOBOX = {"ubunginfobox", "uebunginfobox", "ubunginfo", "uebunginfo"}
TPL_UEBUNGSBESCHREIBUNG = {
    "ubungsbeschreibung",
    "uebungsbeschreibung",
    "beschreibungubung",
    "beschreibunguebung",
}
TPL_SKILLDEV = {"skilldevelopment"}
TPL_HILFSMITTEL = {"hilfsmittel"}
|
|
||||||
|
|
||||||
# Synonyme (werden im Code nochmals normalisiert)
|
|
||||||
# Parameter-name synonyms per payload field. NOTE: list ORDER is significant —
# lookups return the first matching candidate. Values are re-normalized by the
# lookup code, so they are written here in plain lowercase.
KEYS_SUMMARY = ["summary", "kurzbeschreibung", "beschreibung", "kurztext"]
KEYS_EXECUTION = [
    "durchführung", "durchfuehrung", "ausführung", "ausfuehrung",
    "execution", "ablauf", "vorgehen",
]
KEYS_DURATION = ["dauer", "zeit", "dauer_minuten", "dauer (min)", "minuten"]
KEYS_KEYWORDS = [
    "schlüsselworte", "schluesselworte", "schlüsselwörter", "schluesselwoerter",
    "keywords", "stichworte", "schlagworte", "tags",
    "schluesselwort", "schlüsselwort",
]
KEYS_EQUIPMENT = [
    "equipment", "geräte", "geraete", "gerät", "geraet",
    "material", "hilfsmittel", "gerate/material",
]
KEYS_DISCIPLINE = [
    "übungstyp", "uebungstyp", "discipline", "disziplin",
    "schwerpunkt", "bereich", "thema", "technik",
]
KEYS_GROUP = ["gruppengröße", "gruppengroesse", "group"]
KEYS_AGE_GROUP = ["altersgruppe"]
KEYS_TARGET_GROUP = ["zielgruppe", "target_group"]
KEYS_PURPOSE = ["ziel", "zweck", "purpose"]
KEYS_PREPARATION = ["refmethode", "vorbereitung", "preparation"]
KEYS_METHOD = ["method", "methode"]
KEYS_NOTES = ["hinweise", "notes"]
|
|
||||||
|
|
||||||
# ---- Wiki-Router Helpers ----
|
|
||||||
|
|
||||||
def wiki_health() -> None:
    """Sanity-check the wiki router's /health endpoint.

    Raises requests.HTTPError on a non-2xx response; prints a marker on success.
    """
    response = requests.get(f"{API_BASE_URL}/health", timeout=15)
    response.raise_for_status()
    print("[Sanity] Wiki health OK")
|
||||||
|
|
||||||
|
|
||||||
def wiki_login(username: str, password: str) -> None:
|
def wiki_login(username: str, password: str) -> None:
|
||||||
payload = {"username": username, "password": password}
|
payload = {"username": username, "password": password}
|
||||||
r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
|
r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
|
||||||
|
|
@ -94,38 +62,48 @@ def wiki_login(username: str, password: str) -> None:
|
||||||
raise RuntimeError(f"[Login] {msg}")
|
raise RuntimeError(f"[Login] {msg}")
|
||||||
print("[Login] success")
|
print("[Login] success")
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Fetch all pages of *category* from the /semantic/pages endpoint.

    Returns the decoded JSON body — a mapping keyed by page title
    (the caller iterates it via ``.items()``).
    """
    response = requests.get(
        f"{API_BASE_URL}/semantic/pages",
        params={"category": category},
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
|
||||||
def fetch_page_info(title: str) -> Dict[str, Any]:
    """Resolve *title* via the /info endpoint to its pageid and full URL.

    Missing fields come back as ``None`` (``dict.get`` semantics).
    """
    response = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    response.raise_for_status()
    data = response.json()
    return {"pageid": data.get("pageid"), "fullurl": data.get("fullurl")}
|
||||||
|
|
||||||
# ---- Parser ----
|
# ----- Parser (konservativ) -----
|
||||||
|
|
||||||
|
# Exact template names as they appear in the wiki, with and without umlauts.
T_INFOS = {"ÜbungInfoBox", "UebungInfoBox"}                # exercise info box
T_BESCHR = {"Übungsbeschreibung", "Uebungsbeschreibung"}   # exercise description
T_HILFS = {"Hilfsmittel"}                                  # equipment list
T_SKILL = {"SkillDevelopment"}                             # capability/level entries
|
||||||
|
|
||||||
|
|
||||||
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
||||||
print(f"[Parse] Lade '{title}' (ID={pageid})")
|
print(f"[Parse] Lade '{title}' (ID={pageid})")
|
||||||
resp = requests.get(
|
resp = requests.get(
|
||||||
f"{API_BASE_URL}/parsepage",
|
f"{API_BASE_URL}/parsepage",
|
||||||
params={"pageid": pageid, "title": title},
|
params={"pageid": pageid, "title": title},
|
||||||
timeout=REQUEST_TIMEOUT
|
timeout=REQUEST_TIMEOUT,
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
wikitext = resp.json().get("wikitext", "")
|
wikitext = resp.json().get("wikitext", "")
|
||||||
wikicode = mwparserfromhell.parse(wikitext)
|
wikicode = mwparserfromhell.parse(wikitext)
|
||||||
|
|
||||||
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
|
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid, "wikitext": wikitext}
|
||||||
|
|
||||||
for tpl in wikicode.filter_templates():
|
for tpl in wikicode.filter_templates():
|
||||||
name_norm = _norm_tpl(str(tpl.name))
|
name = str(tpl.name).strip()
|
||||||
if name_norm in TPL_UEBUNG_INFOBOX or name_norm in TPL_UEBUNGSBESCHREIBUNG:
|
if name in T_INFOS or name in T_BESCHR or name in T_HILFS:
|
||||||
for p in tpl.params:
|
for p in tpl.params:
|
||||||
raw[str(p.name).strip()] = str(p.value).strip()
|
key = str(p.name).strip()
|
||||||
elif name_norm in TPL_SKILLDEV:
|
val = str(p.value).strip()
|
||||||
|
raw[key] = val
|
||||||
|
elif name in T_SKILL:
|
||||||
raw.setdefault("capabilities", [])
|
raw.setdefault("capabilities", [])
|
||||||
def _getp(t, k):
|
def _getp(t, k):
|
||||||
try:
|
try:
|
||||||
|
|
@ -140,14 +118,10 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
||||||
lvl_i = 0
|
lvl_i = 0
|
||||||
if cap:
|
if cap:
|
||||||
raw["capabilities"].append({"capability": cap, "level": lvl_i})
|
raw["capabilities"].append({"capability": cap, "level": lvl_i})
|
||||||
elif name_norm in TPL_HILFSMITTEL:
|
|
||||||
for p in tpl.params:
|
|
||||||
raw[str(p.name).strip()] = str(p.value).strip()
|
|
||||||
|
|
||||||
raw["wikitext"] = wikitext
|
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
# ---- Fingerprint (stabil) ----
|
# ----- Fingerprint (stabil, wie zuvor) -----
|
||||||
|
|
||||||
def _normalize(v: Any) -> str:
|
def _normalize(v: Any) -> str:
|
||||||
if v is None:
|
if v is None:
|
||||||
|
|
@ -158,21 +132,22 @@ def _normalize(v: Any) -> str:
|
||||||
return json.dumps(v, sort_keys=True, ensure_ascii=False)
|
return json.dumps(v, sort_keys=True, ensure_ascii=False)
|
||||||
return str(v).strip()
|
return str(v).strip()
|
||||||
|
|
||||||
|
|
||||||
def _norm_text(s: str) -> str:
|
def _norm_text(s: str) -> str:
|
||||||
if s is None:
|
if s is None:
|
||||||
return ""
|
return ""
|
||||||
s = str(s).replace("\u00a0", " ")
|
s = str(s).replace("\u00a0", " ")
|
||||||
s = s.strip()
|
|
||||||
s = " ".join(s.split())
|
s = " ".join(s.split())
|
||||||
return s
|
return s.strip()
|
||||||
|
|
||||||
|
|
||||||
def _canon_title(t: str) -> str:
|
def _canon_title(t: str) -> str:
|
||||||
t = (t or "").strip().replace("_", " ")
|
t = (t or "").strip().replace("_", " ")
|
||||||
return t.replace("–", "-").replace("—", "-")
|
return t.replace("–", "-").replace("—", "-")
|
||||||
|
|
||||||
|
|
||||||
def compute_fingerprint(payload: Dict[str, Any]) -> str:
|
def compute_fingerprint(payload: Dict[str, Any]) -> str:
|
||||||
kws = payload.get("keywords") or []
|
kws = payload.get("keywords") or []
|
||||||
# Strichvarianten normalisieren
|
|
||||||
kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
|
kws = [k.replace("\u2013", "-").replace("\u2014", "-") for k in kws]
|
||||||
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
|
kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold)
|
||||||
dur = payload.get("duration_minutes") or 0
|
dur = payload.get("duration_minutes") or 0
|
||||||
|
|
@ -192,111 +167,96 @@ def compute_fingerprint(payload: Dict[str, Any]) -> str:
|
||||||
base = "|".join(_normalize(f) for f in fields)
|
base = "|".join(_normalize(f) for f in fields)
|
||||||
return hashlib.sha256(base.encode("utf-8")).hexdigest()
|
return hashlib.sha256(base.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
# ---- Feldauflösung (Synonyme + Fuzzy) ----
|
# ----- Payload (exakte DE-Keys zuerst, dann schmale Fallbacks) -----
|
||||||
|
|
||||||
def _norm_keymap(d: Dict[str, Any]) -> Dict[str, Any]:
|
# Exact wiki parameter names per payload field, tried in order — the exact
# German names (with umlauts) come first, narrow ASCII/English fallbacks after.
# Lookup helpers return the first non-empty match, so ORDER matters.
EXACT_KEYS = {
    "summary": ["Summary", "Kurzbeschreibung"],
    "execution": ["Durchführung", "Durchfuehrung", "Ablauf"],
    "duration": ["Dauer", "Zeit"],
    "keywords": [
        "Schlüsselworte", "Schlüsselwörter",
        "Schluesselworte", "Schluesselwoerter",
        "Keywords", "Tags",
    ],
    "equipment_prim": ["Hilfsmittel"],
    "equipment_alt": ["Geräte", "Geraete", "Gerät", "Geraet", "Material"],
    "discipline": ["Übungstyp", "Uebungstyp", "Disziplin"],
    "group": ["Gruppengröße", "Gruppengroesse", "Group"],
    "age_group": ["Altersgruppe"],
    "target_group": ["Zielgruppe"],
    "purpose": ["Ziel", "Zweck"],
    "notes": ["Hinweise", "Notes"],
    "preparation": ["Vorbereitung", "RefMethode"],
    "method": ["Methode", "Method"],
}
|
||||||
|
|
||||||
def _get_first(d: Dict[str, Any], candidates: List[str]) -> Any:
|
|
||||||
m = _norm_keymap(d)
|
def _first_any(raw: Dict[str, Any], keys: List[str]) -> Optional[str]:
|
||||||
for c in candidates:
|
for k in keys:
|
||||||
v = m.get(_norm_key(c)) # << Bugfix: Kandidaten ebenfalls normalisieren
|
v = raw.get(k)
|
||||||
if v not in (None, ""):
|
if isinstance(v, str) and v.strip():
|
||||||
return v
|
return v.strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _get_first_fuzzy(d: Dict[str, Any], tokens: List[str]) -> Any:
|
|
||||||
m = _norm_keymap(d)
|
|
||||||
toks = [_norm_key(t) for t in tokens]
|
|
||||||
for k, v in m.items():
|
|
||||||
if v in (None, ""):
|
|
||||||
continue
|
|
||||||
if all(t in k for t in toks):
|
|
||||||
return v
|
|
||||||
return None
|
|
||||||
|
|
||||||
# ---- Payload ----
|
|
||||||
|
|
||||||
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
|
def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: bool = False) -> Dict[str, Any]:
|
||||||
# Capabilities -> Dict[str,int]
|
# Capabilities -> Dict[str,int]
|
||||||
caps_list = raw.get("capabilities", [])
|
|
||||||
capabilities: Dict[str, int] = {}
|
capabilities: Dict[str, int] = {}
|
||||||
for c in caps_list:
|
for c in raw.get("capabilities", []) or []:
|
||||||
cap = c.get("capability")
|
cap = c.get("capability"); lvl = c.get("level")
|
||||||
lvl = c.get("level")
|
|
||||||
if isinstance(cap, str) and cap:
|
if isinstance(cap, str) and cap:
|
||||||
try:
|
try:
|
||||||
capabilities[cap] = int(lvl)
|
capabilities[cap] = int(lvl)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# summary / execution
|
# Exakte Schlüssel zuerst
|
||||||
summary = _get_first(raw, KEYS_SUMMARY) or ""
|
summary = _first_any(raw, EXACT_KEYS["summary"]) or ""
|
||||||
execution = _get_first(raw, KEYS_EXECUTION)
|
execution = _first_any(raw, EXACT_KEYS["execution"]) or ""
|
||||||
if execution in (None, ""):
|
duration = _first_any(raw, EXACT_KEYS["duration"]) or "0"
|
||||||
execution = _get_first_fuzzy(raw, ["ablauf"]) or _get_first_fuzzy(raw, ["durchf"]) or ""
|
|
||||||
|
kw_raw = _first_any(raw, EXACT_KEYS["keywords"]) or ""
|
||||||
|
if kw_raw:
|
||||||
|
parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
|
||||||
|
keywords = [p for p in parts if p]
|
||||||
|
else:
|
||||||
|
keywords = []
|
||||||
|
|
||||||
|
eq_raw = _first_any(raw, EXACT_KEYS["equipment_prim"]) or _first_any(raw, EXACT_KEYS["equipment_alt"]) or ""
|
||||||
|
if eq_raw:
|
||||||
|
equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
|
||||||
|
else:
|
||||||
|
equipment = []
|
||||||
|
|
||||||
|
notes = _first_any(raw, EXACT_KEYS["notes"]) or ""
|
||||||
|
discipline = _first_any(raw, EXACT_KEYS["discipline"]) or ""
|
||||||
|
group = _first_any(raw, EXACT_KEYS["group"]) or None
|
||||||
|
age_group = _first_any(raw, EXACT_KEYS["age_group"]) or ""
|
||||||
|
target_group = _first_any(raw, EXACT_KEYS["target_group"]) or ""
|
||||||
|
purpose = _first_any(raw, EXACT_KEYS["purpose"]) or ""
|
||||||
|
preparation = _first_any(raw, EXACT_KEYS["preparation"]) or ""
|
||||||
|
method = _first_any(raw, EXACT_KEYS["method"]) or ""
|
||||||
|
|
||||||
# duration
|
|
||||||
duration = _get_first(raw, KEYS_DURATION)
|
|
||||||
try:
|
try:
|
||||||
duration_f = float(duration or 0)
|
duration_f = float(duration or 0)
|
||||||
except Exception:
|
except Exception:
|
||||||
duration_f = 0.0
|
duration_f = 0.0
|
||||||
|
|
||||||
# keywords
|
|
||||||
kw_raw = _get_first(raw, KEYS_KEYWORDS)
|
|
||||||
if kw_raw in (None, ""):
|
|
||||||
kw_raw = _get_first_fuzzy(raw, ["stich", "worte"]) or _get_first_fuzzy(raw, ["schlag", "worte"]) or ""
|
|
||||||
keywords: List[str] = []
|
|
||||||
if isinstance(kw_raw, str):
|
|
||||||
# robuste Auftrennung; ignoriert doppelte Kommas/Zeilenumbrüche
|
|
||||||
parts = [p.strip() for p in kw_raw.replace("\n", ",").split(",")]
|
|
||||||
keywords = [p for p in parts if p]
|
|
||||||
|
|
||||||
# equipment
|
|
||||||
eq_raw = _get_first(raw, KEYS_EQUIPMENT)
|
|
||||||
if eq_raw in (None, ""):
|
|
||||||
eq_raw = _get_first_fuzzy(raw, ["gerate", "material"]) or _get_first_fuzzy(raw, ["hilfsmittel"]) or ""
|
|
||||||
equipment: List[str] = []
|
|
||||||
if isinstance(eq_raw, str):
|
|
||||||
equipment = [e.strip() for e in eq_raw.replace("\n", ",").split(",") if e.strip()]
|
|
||||||
elif isinstance(eq_raw, list):
|
|
||||||
equipment = [str(e).strip() for e in eq_raw if str(e).strip()]
|
|
||||||
|
|
||||||
notes = _get_first(raw, KEYS_NOTES) or ""
|
|
||||||
if mutate:
|
|
||||||
notes = (str(notes) + " [auto-update]").strip()
|
|
||||||
|
|
||||||
discipline = _get_first(raw, KEYS_DISCIPLINE) or ""
|
|
||||||
if discipline in (None, ""):
|
|
||||||
discipline = _get_first_fuzzy(raw, ["ubung", "typ"]) or _get_first_fuzzy(raw, ["schwerpunkt"]) or ""
|
|
||||||
|
|
||||||
group = _get_first(raw, KEYS_GROUP) or None
|
|
||||||
age_group = _get_first(raw, KEYS_AGE_GROUP) or ""
|
|
||||||
target_group = _get_first(raw, KEYS_TARGET_GROUP) or ""
|
|
||||||
purpose = _get_first(raw, KEYS_PURPOSE) or ""
|
|
||||||
preparation = _get_first(raw, KEYS_PREPARATION) or ""
|
|
||||||
method = _get_first(raw, KEYS_METHOD) or ""
|
|
||||||
|
|
||||||
payload: Dict[str, Any] = {
|
payload: Dict[str, Any] = {
|
||||||
"title": raw.get("title") or "",
|
"title": raw.get("title") or "",
|
||||||
"summary": str(summary) or "",
|
"summary": summary,
|
||||||
"short_description": str(summary) or "",
|
"short_description": summary,
|
||||||
"keywords": keywords,
|
"keywords": keywords,
|
||||||
"link": fullurl or "",
|
"link": fullurl or "",
|
||||||
"discipline": str(discipline) or "",
|
"discipline": discipline,
|
||||||
"group": str(group) if group else None,
|
"group": group,
|
||||||
"age_group": str(age_group) or "",
|
"age_group": age_group,
|
||||||
"target_group": str(target_group) or "",
|
"target_group": target_group,
|
||||||
"min_participants": 1,
|
"min_participants": 1,
|
||||||
"duration_minutes": int(round(duration_f)),
|
"duration_minutes": int(round(duration_f)),
|
||||||
"capabilities": capabilities,
|
"capabilities": capabilities,
|
||||||
"category": category or "",
|
"category": category or "",
|
||||||
"purpose": str(purpose) or "",
|
"purpose": purpose,
|
||||||
"execution": str(execution) or "",
|
"execution": execution,
|
||||||
"notes": str(notes) or "",
|
"notes": (notes + (" [auto-update]" if mutate else "")).strip(),
|
||||||
"preparation": str(preparation) or "",
|
"preparation": preparation,
|
||||||
"method": str(method) or "",
|
"method": method,
|
||||||
"equipment": equipment,
|
"equipment": equipment,
|
||||||
"fullurl": fullurl or "",
|
"fullurl": fullurl or "",
|
||||||
"external_id": f"mw:{raw.get('pageid')}",
|
"external_id": f"mw:{raw.get('pageid')}",
|
||||||
|
|
@ -305,7 +265,7 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
|
||||||
payload["fingerprint"] = compute_fingerprint(payload)
|
payload["fingerprint"] = compute_fingerprint(payload)
|
||||||
return payload
|
return payload
|
||||||
|
|
||||||
# ---- Lookup/Upsert ----
|
# ----- Lookup/Upsert -----
|
||||||
|
|
||||||
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
|
def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[int]]:
|
||||||
url = f"{EXERCISE_API}/by-external-id"
|
url = f"{EXERCISE_API}/by-external-id"
|
||||||
|
|
@ -375,9 +335,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
||||||
|
|
||||||
found, status = lookup_by_external_id(ext_id)
|
found, status = lookup_by_external_id(ext_id)
|
||||||
|
|
||||||
action = "create"
|
action = "create"; reason = "not found (lookup 404)"; found_payload = {}
|
||||||
reason = "not found (lookup 404)"
|
|
||||||
found_payload = {}
|
|
||||||
|
|
||||||
if not (status == 404 or found is None):
|
if not (status == 404 or found is None):
|
||||||
if isinstance(found, dict):
|
if isinstance(found, dict):
|
||||||
|
|
@ -398,112 +356,78 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
||||||
return action
|
return action
|
||||||
|
|
||||||
if action == "create":
|
if action == "create":
|
||||||
payload2 = dict(payload)
|
body = dict(payload); body["imported_at"] = _now_iso()
|
||||||
payload2["imported_at"] = _now_iso()
|
resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
|
||||||
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
|
|
||||||
if resp.status_code == 422:
|
if resp.status_code == 422:
|
||||||
print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
|
print(f"[Create] '{title}' -> FAILED 422:\n{resp.text}")
|
||||||
try:
|
try: resp.raise_for_status()
|
||||||
resp.raise_for_status()
|
except Exception: pass
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
resp.raise_for_status()
|
resp.raise_for_status(); print(f"[Create] '{title}' – {reason} -> OK")
|
||||||
print(f"[Create] '{title}' – {reason} -> OK")
|
|
||||||
elif action == "update":
|
elif action == "update":
|
||||||
payload2 = dict(payload)
|
body = dict(payload); body["imported_at"] = _now_iso()
|
||||||
payload2["imported_at"] = _now_iso()
|
resp = requests.post(EXERCISE_API, json=body, timeout=REQUEST_TIMEOUT)
|
||||||
resp = requests.post(EXERCISE_API, json=payload2, timeout=REQUEST_TIMEOUT)
|
|
||||||
if resp.status_code == 422:
|
if resp.status_code == 422:
|
||||||
print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")
|
print(f"[Update] '{title}' -> FAILED 422:\n{resp.text}")
|
||||||
try:
|
try: resp.raise_for_status()
|
||||||
resp.raise_for_status()
|
except Exception: pass
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
resp.raise_for_status()
|
resp.raise_for_status(); print(f"[Update] '{title}' – {reason} -> OK"); _print_diff(found_payload, payload)
|
||||||
print(f"[Update] '{title}' – {reason} -> OK")
|
|
||||||
_print_diff(found_payload, payload)
|
|
||||||
else:
|
else:
|
||||||
print(f"[Skip] '{title}' – {reason}")
|
print(f"[Skip] '{title}' – {reason}")
|
||||||
return action
|
return action
|
||||||
|
|
||||||
# ----- Orchestrierung -----
|
# ----- Orchestrierung -----
|
||||||
|
|
||||||
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False, debug_raw: bool = False) -> str:
    """Import a single wiki page end to end.

    Resolves *title* to a pageid, parses the page, builds the payload and
    upserts it. Returns the upsert action string, or "failed" when the page
    has no pageid. *mutate* is forwarded to build_payload (smoke-test use);
    *debug_raw* prints the raw parameter keys read from the wiki.
    """
    info = fetch_page_info(title)
    pid = info.get("pageid")
    fullurl = info.get("fullurl") or ""
    if not pid:
        print(f"[Error] pageid für '{title}' nicht gefunden.", file=sys.stderr)
        return "failed"

    raw = parse_exercise(title, pid)
    if debug_raw:
        # "wikitext" is the whole page body — too noisy for a key dump.
        print("[Debug] Raw-Keys:", sorted(k for k in raw if k != "wikitext"))
    payload = build_payload(raw, fullurl, category, mutate=mutate)
    return upsert_exercise(payload, dry_run=dry_run)
|
||||||
|
|
||||||
|
|
||||||
def process_all(category: str, *, dry_run: bool = False, debug_raw: bool = False) -> Dict[str, int]:
    """Import every page of *category*; return outcome counters.

    Returns a dict with keys "created"/"updated"/"skipped"/"failed". Errors on
    one page are logged and counted as failed without aborting the batch.
    *debug_raw* dumps raw wiki keys for the first five pages only.
    """
    stats = {"created": 0, "updated": 0, "skipped": 0, "failed": 0}
    print(f"[Main] Lade Liste der Übungen aus Kategorie '{category}'…")
    pages = fetch_all_pages(category)
    print(f"[Main] {len(pages)} Seiten gefunden.")

    for idx, (title, entry) in enumerate(pages.items(), 1):
        try:
            # Entries should be dict-like; tolerate anything else gracefully.
            getter = getattr(entry, "get", None)
            pid = getter("pageid") if callable(getter) else None
            fullurl = getter("fullurl") if callable(getter) else None
            if not pid:
                # Listing carried no pageid — fall back to a direct lookup.
                info = fetch_page_info(title)
                pid = info.get("pageid")
                fullurl = fullurl or info.get("fullurl")
            if not pid:
                print(f"[Skip] '{title}' hat keine pageid")
                stats["failed"] += 1
                continue

            raw = parse_exercise(title, pid)
            if debug_raw and idx <= 5:
                print(f"[Debug] #{idx} '{title}' Raw-Keys:", sorted(k for k in raw if k != "wikitext"))
            payload = build_payload(raw, fullurl or "", category)
            act = upsert_exercise(payload, dry_run=dry_run)
            if act == "create":
                stats["created"] += 1
            elif act == "update":
                stats["updated"] += 1
            else:
                stats["skipped"] += 1
        except requests.HTTPError as e:
            # BUGFIX: requests.Response is falsy for 4xx/5xx statuses, so the
            # previous truthiness check (`if response`) always yielded code=None
            # for HTTP errors and the 404 branch was unreachable. Compare
            # against None explicitly instead.
            response = getattr(e, "response", None)
            code = response.status_code if response is not None else None
            if code == 404:
                print(f"[Skip] '{title}': page not found (404)")
            else:
                print(f"[Error] '{title}': {e}")
            stats["failed"] += 1
        except Exception as e:
            print(f"[Error] '{title}': {e}")
            stats["failed"] += 1
    return stats
||||||
|
|
||||||
|
|
||||||
def run_smoke_test(title: str, category: str, *, debug_raw: bool = False) -> None:
    """Exercise the create -> skip -> update lifecycle against one page.

    Runs process_one() three times: unchanged twice (expecting create then
    skip), then with mutate=True (expecting update), and prints a JSON summary.
    """
    print("\n[SmokeTest] Lauf 1/3: CREATE (Erstimport)")
    act1 = process_one(title, category, mutate=False, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act1)

    print("\n[SmokeTest] Lauf 2/3: SKIP (Wiederholung, unverändert)")
    act2 = process_one(title, category, mutate=False, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act2)

    print("\n[SmokeTest] Lauf 3/3: UPDATE (simulierte Wiki-Änderung an 'notes')")
    act3 = process_one(title, category, mutate=True, debug_raw=debug_raw)
    print("[SmokeTest] Aktion:", act3)

    print("\n[SmokeTest] Zusammenfassung:")
    print(json.dumps({"run1": act1, "run2": act2, "run3": act3}, ensure_ascii=False, indent=2))
|
|
||||||
|
|
||||||
# ----- Main -----
|
# ----- Main -----
|
||||||
|
|
||||||
|
|
@ -517,6 +441,7 @@ def main() -> None:
|
||||||
parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
|
parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
|
||||||
parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen")
|
parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen")
|
||||||
parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title")
|
parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title")
|
||||||
|
parser.add_argument("--debug-raw", action="store_true", help="Zeigt die aus dem Wiki gelesenen Roh-Keys je Seite")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
wiki_health()
|
wiki_health()
|
||||||
|
|
@ -528,19 +453,17 @@ def main() -> None:
|
||||||
try:
|
try:
|
||||||
wiki_login(args.username, args.password)
|
wiki_login(args.username, args.password)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(str(e), file=sys.stderr)
|
print(str(e), file=sys.stderr); sys.exit(1)
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if args.smoke_test:
|
if args.smoke_test:
|
||||||
run_smoke_test(args.title, args.category)
|
run_smoke_test(args.title, args.category, debug_raw=args.debug_raw); return
|
||||||
return
|
|
||||||
|
|
||||||
if args.all:
|
if args.all:
|
||||||
stats = process_all(args.category, dry_run=args.dry_run)
|
stats = process_all(args.category, dry_run=args.dry_run, debug_raw=args.debug_raw)
|
||||||
print("\n[Stats] created={created} updated={updated} skipped={skipped} failed={failed}".format(**stats))
|
print("\n[Stats] created={created} updated={updated} skipped={skipped} failed={failed}".format(**stats))
|
||||||
else:
|
else:
|
||||||
print(f"[Main] Import single exercise: {args.title}")
|
print(f"[Main] Import single exercise: {args.title}")
|
||||||
result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run)
|
result = process_one(args.title, args.category, mutate=False, dry_run=args.dry_run, debug_raw=args.debug_raw)
|
||||||
print(f"[Result] {result}")
|
print(f"[Result] {result}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user