#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Module: wiki_importer.py
|
|
Beschreibung:
|
|
- Importiert Übungen aus dem MediaWiki via FastAPI wiki_router
|
|
- Führt vor dem Import einen Login gegen /import/wiki/login durch (falls nicht via --skip-login deaktiviert)
|
|
- Holt Liste aller Übungs-Titel (SMW-Ask) via `/semantic/pages`
|
|
- Für jede Übung:
|
|
* Fetch pageinfo (pageid, fullurl) via `/info`
|
|
* Parse Wikitext (Templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment) via `/parsepage`
|
|
* Baut Payload entsprechend Exercise-Datenmodell
|
|
* POST an `/exercise` Endpoint (exercise_router)
|
|
- Unterstützt Single-Import via `--title` (oder ENV `WIKI_EXERCISE_TITLE`) und Full-Import via `--all`
|
|
- Optional: Credentials via CLI (--username/--password) oder `.env` (WIKI_BOT_USER / WIKI_BOT_PASSWORD)
|
|
|
|
Version: 2.1.0
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import argparse
|
|
from typing import Dict, Any
|
|
import requests
|
|
import mwparserfromhell
|
|
from dotenv import load_dotenv
|
|
|
|
# ----- Konfiguration / Defaults -----
|
|
load_dotenv() # .env laden, falls vorhanden
|
|
|
|
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") # FastAPI-Wiki-Proxy
|
|
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint
|
|
DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
|
|
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
|
|
|
# ---- Hilfsfunktionen für Wiki-Router ----
|
|
def wiki_health() -> None:
    """Sanity check: verify that the wiki proxy's /health endpoint answers OK."""
    response = requests.get(f"{API_BASE_URL}/health", timeout=15)
    response.raise_for_status()
    print("[Sanity] Wiki health OK")
|
|
|
|
def wiki_login(username: str, password: str) -> None:
    """Authenticate against the wiki_router's /login endpoint.

    Expects a JSON body of {"status": "success"} on success.

    Raises:
        RuntimeError: if the router reports a non-success status, or the
            response body is not JSON on an otherwise successful request.
        requests.HTTPError: if the response is an HTTP error without JSON.
    """
    payload = {"username": username, "password": password}
    r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
    # No immediate raise_for_status(): we want to surface the JSON error message.
    try:
        data = r.json()
    except ValueError:  # requests.JSONDecodeError subclasses ValueError
        print(f"[Login] HTTP {r.status_code}: {r.text}")
        r.raise_for_status()
        # Bug fix: on a 2xx response with a non-JSON body the original fell
        # through with `data` unbound (NameError). Fail explicitly instead.
        raise RuntimeError(f"[Login] Login fehlgeschlagen")

    status = (data or {}).get("status")
    if status != "success":
        msg = (data or {}).get("message", "Login fehlgeschlagen")
        raise RuntimeError(f"[Login] {msg}")
    print("[Login] success")
|
|
|
|
def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Return all pages of *category* as reported by the SMW-Ask endpoint."""
    url = f"{API_BASE_URL}/semantic/pages"
    response = requests.get(url, params={"category": category}, timeout=60)
    response.raise_for_status()
    return response.json()
|
|
|
|
def fetch_page_info(title: str) -> Dict[str, Any]:
    """Look up core page info for *title*: its pageid and full URL."""
    response = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    response.raise_for_status()
    body = response.json()
    return {"pageid": body.get("pageid"), "fullurl": body.get("fullurl")}
|
|
|
|
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    """Fetch and parse the wikitext of one exercise page.

    Flattens the parameters of the ÜbungInfoBox and Übungsbeschreibung
    templates into the result dict, and collects SkillDevelopment entries
    into a "capabilities" list of {"capability": str, "level": int}.

    Returns:
        Dict with title/source/pageid, all template parameters, and the
        raw "wikitext".
    """
    print(f"[Parse] Lade '{title}' (ID={pageid})")
    resp = requests.get(
        f"{API_BASE_URL}/parsepage",
        params={"pageid": pageid, "title": title},
        timeout=60
    )
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)

    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    for tpl in wikicode.filter_templates():
        name = str(tpl.name).strip()
        if name in ("ÜbungInfoBox", "Übungsbeschreibung"):
            # Both templates contribute flat key/value parameters
            # (the two branches were identical — merged).
            for p in tpl.params:
                raw[str(p.name).strip()] = str(p.value).strip()
        elif name == "SkillDevelopment":
            raw.setdefault("capabilities", [])
            cap = _template_param(tpl, "PrimaryCapability")
            try:
                lvl = int(_template_param(tpl, "CapabilityLevel"))
            except ValueError:
                # Missing or non-numeric level defaults to 0, as before.
                lvl = 0
            if cap:
                raw["capabilities"].append({"capability": cap, "level": lvl})
    raw["wikitext"] = wikitext
    return raw


def _template_param(tpl, name: str) -> str:
    """Return a template parameter's stripped string value, or "" if absent."""
    if tpl.has(name):
        return str(tpl.get(name).value).strip()
    return ""
|
|
|
|
def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
    """Map parsed wiki fields (*raw*) onto the Exercise data-model payload.

    Args:
        raw: flat dict produced by parse_exercise (German wiki field names).
        fullurl: canonical page URL, may be "".
        category: wiki category used for the import.

    Returns:
        Dict ready to POST to the exercise endpoint.
    """
    # Exercise.capabilities expects Dict[str, int].
    capabilities: Dict[str, int] = {}
    for entry in raw.get("capabilities", []):
        cap = entry.get("capability")
        lvl = entry.get("level")
        if isinstance(cap, str) and cap:
            try:
                capabilities[cap] = int(lvl)
            except (TypeError, ValueError):
                # Skip entries whose level is not coercible to int.
                pass

    # Duration may be missing or malformed in the wiki; fall back to 0.
    # (The original pre-initialized `duration = 0.0` twice — redundant.)
    try:
        duration = float(raw.get("Dauer", 0) or 0)
    except (TypeError, ValueError):
        duration = 0.0

    keywords = []
    kw_raw = raw.get("Schlüsselworte", "")
    if isinstance(kw_raw, str):
        keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]

    equipment = []
    eq_raw = raw.get("equipment", [])
    if isinstance(eq_raw, str):
        equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
    elif isinstance(eq_raw, list):
        equipment = [str(e).strip() for e in eq_raw if str(e).strip()]

    payload: Dict[str, Any] = {
        "title": raw.get("title") or "",
        "summary": raw.get("Summary", "") or "",
        "short_description": raw.get("Summary", "") or "",
        "keywords": keywords,
        "link": fullurl or "",
        "discipline": raw.get("Übungstyp", "") or "",
        "group": raw.get("Gruppengröße", "") or None,
        "age_group": raw.get("Altersgruppe", "") or "",
        "target_group": raw.get("Zielgruppe", "") or "",
        "min_participants": 1,
        "duration_minutes": int(round(duration)),  # Exercise expects int
        "capabilities": capabilities,
        "category": category or "",
        "purpose": raw.get("Ziel", "") or "",
        "execution": raw.get("Durchführung", "") or "",
        "notes": raw.get("Hinweise", "") or "",
        "preparation": raw.get("RefMethode", "") or "",
        "method": raw.get("method", "") or "",  # if present in the wikitext
        "equipment": equipment,
        "fullurl": fullurl or "",  # optional field
        # Idempotency key (optionally usable in exercise_router):
        "external_id": f"wiki:{raw.get('pageid')}",
        "source": "MediaWiki"
    }
    return payload
|
|
|
|
def ingest_exercise(payload: Dict[str, Any]) -> None:
    """POST one exercise payload to the exercise endpoint.

    A 422 (validation error) is reported and swallowed so a bulk import can
    continue with the next page; any other HTTP error is raised.

    Raises:
        requests.HTTPError: for non-422 HTTP errors.
    """
    title = payload.get("title", "<ohne Titel>")
    resp = requests.post(EXERCISE_API, json=payload, timeout=60)
    if resp.status_code == 422:
        # Report validation errors but keep going. (The original wrapped
        # raise_for_status() in try/except-pass here, which was a no-op
        # dead-code pattern — removed.)
        print(f"[Ingest] '{title}' -> FAILED 422:\n{resp.text}")
        return
    resp.raise_for_status()
    print(f"[Ingest] '{title}' -> OK")
|
|
|
|
# ----- Main -----
|
|
def _import_one(title: str, pid: int, fullurl: str, category: str) -> None:
    """Parse a single wiki page and ingest it as an exercise."""
    raw = parse_exercise(title, pid)
    payload = build_payload(raw, fullurl or "", category)
    ingest_exercise(payload)


def _import_all(category: str) -> None:
    """Import every exercise listed in the given wiki category."""
    print(f"[Main] Lade Liste der Übungen aus Kategorie '{category}'…")
    pages = fetch_all_pages(category)
    print(f"[Main] {len(pages)} Seiten gefunden.")
    for title, entry in pages.items():
        pid = entry.get("pageid")
        fullurl = entry.get("fullurl")
        if not pid:
            # SMW result lacked the pageid — look up core info instead.
            info = fetch_page_info(title)
            pid = info.get("pageid")
            fullurl = fullurl or info.get("fullurl")
        if not pid:
            print(f"[Skip] '{title}' hat keine pageid")
            continue
        _import_one(title, pid, fullurl, category)


def _import_single(title: str, category: str) -> None:
    """Import exactly one exercise by title; exits(1) if it has no pageid."""
    print(f"[Main] Import single exercise: {title}")
    info = fetch_page_info(title)
    pid = info.get("pageid")
    fullurl = info.get("fullurl") or ""
    if not pid:
        print(f"[Error] pageid für '{title}' nicht gefunden.", file=sys.stderr)
        sys.exit(1)
    _import_one(title, pid, fullurl, category)


def main() -> None:
    """CLI entry point: parse args, health-check, login, then import.

    Decomposed into _import_all / _import_single / _import_one helpers;
    CLI interface and behavior are unchanged.
    """
    parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant (via FastAPI wiki_router)")
    parser.add_argument("--all", action="store_true", help="Alle Übungen importieren (SMW-Ask)")
    parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Einzelimport eines Übungstitels")
    parser.add_argument("--category", type=str, default=DEFAULT_CAT, help="Wiki-Kategorie (z.B. 'Übungen')")
    parser.add_argument("--username", type=str, default=os.getenv("WIKI_BOT_USER"), help="Wiki-Login Benutzer (überschreibt .env)")
    parser.add_argument("--password", type=str, default=os.getenv("WIKI_BOT_PASSWORD"), help="Wiki-Login Passwort (überschreibt .env)")
    parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
    args = parser.parse_args()

    # Sanity check before doing any work.
    wiki_health()

    # Login (unless explicitly skipped).
    if not args.skip_login:
        if not args.username or not args.password:
            print("[Login] Fehler: fehlende Credentials. Setze .env (WIKI_BOT_USER/WIKI_BOT_PASSWORD) oder CLI --username/--password.", file=sys.stderr)
            sys.exit(1)
        try:
            wiki_login(args.username, args.password)
        except Exception as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

    # Full or single import.
    if args.all:
        _import_all(args.category)
    else:
        _import_single(args.title, args.category)


if __name__ == "__main__":
    main()
|