shinkan-jinkendo/backend/smw_client.py
Lars 6801c60604
Some checks failed
Deploy Development / deploy (push) Successful in 35s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 5s
Test Suite / playwright-tests (push) Failing after 1m55s
feat: Add MediaWiki import functionality with tracking and mapping
- Implemented a new SQL migration for wiki import tracking tables.
- Created an import router for handling MediaWiki imports of exercises, skills, and methods.
- Developed a Semantic MediaWiki API client for direct API interactions.
- Added a mapper to convert SMW properties to local database fields.
- Introduced background tasks for asynchronous import processing.
- Implemented logging and error handling for import operations.
- Added endpoints for previewing imports, checking import status, and managing import references.
2026-04-24 14:41:52 +02:00

216 lines
8.2 KiB
Python

"""
Semantic MediaWiki API Client
Greift direkt auf die MediaWiki API zu (kein XML-Export).
Unterstützt Login, Kategorien-Abfragen und SMW Browse-API.
"""
import os
import logging
import httpx
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Connection settings, read from the environment once at import time.
# Each one falls back to an empty string when the variable is unset.
# SmwClient uses them as defaults for constructor arguments that are omitted.
MEDIAWIKI_API_URL = os.getenv("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.getenv("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.getenv("MEDIAWIKI_PASSWORD", "")
class SmwClientError(Exception):
    """Error raised by the MediaWiki / Semantic MediaWiki API client."""
class SmwClient:
    """MediaWiki / Semantic MediaWiki (SMW) API client with a session login.

    The client talks to the MediaWiki API endpoint directly. It logs in
    lazily on the first request and keeps the session cookies on the
    instance, which are then sent with every later request.
    """

    # SMW DataItem type codes whose serialization is free text and may
    # therefore legitimately contain "#" (2 = text/blob, 5 = URI). Every
    # other type keeps the "cut at the first '#'" clean-up.
    _VERBATIM_ITEM_TYPES = (2, 5)

    def __init__(
        self,
        api_url: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """Create a client.

        Args:
            api_url: URL of the MediaWiki API endpoint. Falls back to the
                module-level ``MEDIAWIKI_API_URL``; trailing slashes are
                removed.
            user: Login name; falls back to ``MEDIAWIKI_USER``.
            password: Login password; falls back to ``MEDIAWIKI_PASSWORD``.

        Raises:
            SmwClientError: If no API URL is configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False
        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    # Authentication                                                     #
    # ------------------------------------------------------------------ #
    async def login(self) -> None:
        """Log in to MediaWiki in two steps: fetch a login token, then log in.

        On success the session cookies are stored on the instance.

        Raises:
            SmwClientError: If no login token is returned or the login is
                rejected.
            httpx.HTTPStatusError: If either request returns an HTTP error
                status.
        """
        async with httpx.AsyncClient(timeout=15) as client:
            # Step 1: fetch the login token. The client's cookie jar keeps
            # the session cookie from this response and sends it with the
            # login request below, so no per-request cookies are needed.
            token_response = await client.get(self.api_url, params={
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json",
            })
            token_response.raise_for_status()
            try:
                token = token_response.json()["query"]["tokens"]["logintoken"]
            except (KeyError, TypeError) as exc:
                raise SmwClientError(
                    "MediaWiki Login fehlgeschlagen: kein Login-Token erhalten"
                ) from exc

            # Step 2: perform the login.
            login_response = await client.post(
                self.api_url,
                params={"format": "json"},
                data={
                    "action": "login",
                    "lgname": self.user,
                    "lgpassword": self.password,
                    "lgtoken": token,
                },
            )
            login_response.raise_for_status()
            outcome = login_response.json().get("login", {})
            if outcome.get("result") != "Success":
                reason = outcome.get("reason", "unbekannt")
                raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

            # Take the whole jar, not only the cookies of the last response,
            # so a session cookie set in step 1 is not lost.
            session_cookies = dict(client.cookies)

        self._cookies = session_cookies
        self._logged_in = True
        logger.info("SMW Login erfolgreich als '%s'", self.user)

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request to the API and return the JSON.

        Logs in first if that has not happened yet. ``params`` is not
        modified; ``format=json`` is added to a copy.
        """
        if not self._logged_in:
            await self.login()
        query = {**params, "format": "json"}
        async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
            response = await client.get(self.api_url, params=query)
            response.raise_for_status()
            return response.json()

    @staticmethod
    def _raise_on_api_error(data: dict, prefix: str) -> None:
        """Raise SmwClientError if the API response carries an ``error`` entry."""
        if "error" in data:
            raise SmwClientError(f"{prefix}: {data['error']}")

    # ------------------------------------------------------------------ #
    # Categories                                                         #
    # ------------------------------------------------------------------ #
    async def get_category_members(self, category: str, limit: int = 500) -> list[dict]:
        """Return the pages of a category.

        Args:
            category: Category name without the ``Kategorie:`` prefix.
            limit: Maximum number of pages to return. Results are fetched
                in batches of at most 500.

        Returns:
            The ``categorymembers`` entries of the API responses, requested
            with ``cmprop=ids|title``, truncated to ``limit`` entries.

        Raises:
            SmwClientError: If the API reports an error.
        """
        members: list[dict] = []
        cmcontinue = None
        while len(members) < limit:
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"Kategorie:{category}",
                # Only ask for what is still needed.
                "cmlimit": min(limit - len(members), 500),
                "cmtype": "page",
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue
            data = await self._get(params)
            self._raise_on_api_error(
                data, f"MediaWiki API-Fehler für Kategorie '{category}'"
            )
            members.extend(data["query"]["categorymembers"])
            # Without a continuation token there is nothing more to fetch;
            # looping on would only request the first batch again.
            cmcontinue = data.get("continue", {}).get("cmcontinue")
            if not cmcontinue:
                break
        return members[:limit]

    # ------------------------------------------------------------------ #
    # Page content                                                       #
    # ------------------------------------------------------------------ #
    async def get_page_wikitext(self, title: str) -> str:
        """Fetch the raw wikitext of a page (main slot of the first revision returned).

        Raises:
            SmwClientError: If the API reports an error, the page is missing
                or the title is invalid, or no revision is returned.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        self._raise_on_api_error(data, f"MediaWiki API-Fehler für '{title}'")
        pages = data["query"]["pages"]
        # A default avoids StopIteration, which a coroutine would turn into
        # an opaque RuntimeError.
        page = next(iter(pages.values()), None)
        if page is None or "missing" in page or "invalid" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        revisions = page.get("revisions")
        if not revisions:
            raise SmwClientError(f"Seite '{title}' hat keine lesbare Revision")
        return revisions[0]["slots"]["main"]["*"]

    async def get_page_html(self, title: str) -> str:
        """Fetch the parsed HTML of a page.

        Raises:
            SmwClientError: If the API reports an error.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        self._raise_on_api_error(data, f"MediaWiki API-Fehler für '{title}'")
        return data["parse"]["text"]["*"]

    # ------------------------------------------------------------------ #
    # Semantic MediaWiki                                                 #
    # ------------------------------------------------------------------ #
    async def browse_subject(self, title: str) -> dict:
        """Return the SMW properties of a page via the browse API.

        Properties whose name starts with ``_`` (internal SMW properties)
        are skipped, as are properties left without any non-empty value.

        Returns:
            A dict ``{property_name: [value, ...]}``.

        Raises:
            SmwClientError: If the API reports an error.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })
        self._raise_on_api_error(data, f"SMW Browse-Fehler für '{title}'")

        result: dict = {}
        for prop_data in data.get("query", {}).get("data", []):
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):
                continue
            values = []
            for item in prop_data.get("dataitem", []):
                raw = item.get("item", "")
                text = "" if raw is None else str(raw)
                if item.get("type") in self._VERBATIM_ITEM_TYPES:
                    # Free text and URIs may contain "#"; keep them whole.
                    clean = text.strip()
                else:
                    # SMW serializes some values as "Value#0##"; keep only
                    # the part before the first "#".
                    clean = text.split("#")[0].strip()
                if clean:
                    values.append(clean)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """Run a semantic query through the SMW ask API.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``.

        Args:
            query: Ask query; ``|limit=<limit>`` is appended to it.
            limit: Maximum number of results requested.

        Returns:
            A list of ``{"title": str, "properties": {name: [value, ...]}}``.

        Raises:
            SmwClientError: If the API reports an error.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })
        self._raise_on_api_error(data, "SMW Ask-Fehler")

        raw_results = data.get("query", {}).get("results", {})
        # SMW emits empty "results" and "printouts" as [] rather than {}
        # (an empty PHP array), so only dicts can be iterated with .items().
        if not isinstance(raw_results, dict):
            return []

        results = []
        for title, props in raw_results.items():
            printouts = props.get("printouts", {})
            if not isinstance(printouts, dict):
                printouts = {}
            properties = {}
            for prop_name, prop_data in printouts.items():
                properties[prop_name] = [
                    (item.get("fulltext") or item.get("raw") or str(item))
                    if isinstance(item, dict)
                    else str(item)
                    for item in prop_data
                ]
            results.append({"title": title, "properties": properties})
        return results

    # ------------------------------------------------------------------ #
    # Schema discovery                                                   #
    # ------------------------------------------------------------------ #
    async def discover_properties(self, sample_title: str) -> dict:
        """Return and log all SMW properties of a sample page.

        Useful for finding out the property names before building a mapper.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props