shinkan-jinkendo/backend/smw_client.py
Lars 623af621b4
All checks were successful
Deploy Development / deploy (push) Successful in 38s
Test Suite / pytest-backend (push) Successful in 40s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 12s
Test Suite / k6 /health Baseline (push) Successful in 33s
Test Suite / playwright-tests (push) Successful in 1m17s
Test Suite / pytest-backend (pull_request) Successful in 35s
Test Suite / lint-backend (pull_request) Successful in 0s
Test Suite / build-frontend (pull_request) Successful in 12s
Test Suite / k6 /health Baseline (pull_request) Successful in 33s
Test Suite / playwright-tests (pull_request) Successful in 1m8s
Enhance MediaWiki import functionality with category normalization and skill attributes
- Introduced `_normalize_mw_category` function to clean category names for API calls, ensuring consistent handling of category prefixes.
- Updated `SmwClient` methods to utilize normalized category names, improving data retrieval accuracy.
- Added `_wiki_category_or_default` function to provide default categories based on import type, enhancing user experience during imports.
- Integrated new fields `karate_relevance` and `relevance_level` into various admin components, allowing for better skill management.
- Incremented app version to 0.8.145 and updated changelog to reflect these changes.
2026-05-16 11:05:15 +02:00

294 lines
11 KiB
Python

"""
Semantic MediaWiki API Client
Greift direkt auf die MediaWiki API zu (kein XML-Export).
Unterstützt Login, Kategorien-Abfragen und SMW Browse-API.
"""
import os
import logging
import httpx
from typing import Optional
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Connection settings are read from the process environment once, at import
# time. Each one falls back to an empty string when the variable is unset.
MEDIAWIKI_API_URL = os.environ.get("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.environ.get("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.environ.get("MEDIAWIKI_PASSWORD", "")
class SmwClientError(Exception):
    """Error raised by this module for configuration, login and MediaWiki API failures."""
def _normalize_mw_category(category: str) -> str:
"""
Bereinigt Kategorienamen für API cmtitle=Kategorie:…
Erlaubt z. B. 'Fähigkeitsbeschreibung', ' Kategorie:X ', 'kategorie:X' ohne Doppel-Prefix.
"""
c = (category or "").strip()
if not c:
raise SmwClientError("Kategorie (Seitenlisten-Name ohne Präfix) darf nicht leer sein")
pref = "kategorie:"
while c.lower().startswith(pref):
c = c[len(pref) :].lstrip()
remaining = (c or "").strip()
if not remaining:
raise SmwClientError("Kategorie (Seitenlisten-Name ohne Präfix) darf nicht leer sein")
return remaining
class SmwClient:
    """Client for the MediaWiki action API and the Semantic MediaWiki (SMW) modules.

    The client logs in lazily on the first authenticated request and keeps the
    resulting session cookies on the instance. Every request opens a
    short-lived ``httpx.AsyncClient`` seeded with those cookies.
    """

    # Largest page size this client requests from ``list=categorymembers``.
    _CATEGORY_PAGE_SIZE = 500

    def __init__(self, api_url: Optional[str] = None, user: Optional[str] = None, password: Optional[str] = None):
        """Create a client.

        Args:
            api_url: URL of the wiki's ``api.php``; falls back to ``MEDIAWIKI_API_URL``.
            user: Login name; falls back to ``MEDIAWIKI_USER``.
            password: Login password; falls back to ``MEDIAWIKI_PASSWORD``.

        Raises:
            SmwClientError: if no API URL is given and none is configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False
        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    #  Authentication                                                      #
    # ------------------------------------------------------------------ #

    async def login(self) -> None:
        """Log in to MediaWiki in two steps: fetch a login token, then post the credentials.

        On success the session cookies are stored on the instance and reused
        by later requests.

        Raises:
            SmwClientError: if the token is missing, the API reports an error,
                or the login result is not ``"Success"``.
        """
        async with httpx.AsyncClient(timeout=15) as client:
            # Step 1: fetch the login token.
            token_response = await client.get(self.api_url, params={
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json",
            })
            token_response.raise_for_status()
            token_data = self._parse_mw_response(token_response.json())
            token = ((token_data.get("query") or {}).get("tokens") or {}).get("logintoken")
            if not token:
                raise SmwClientError("MediaWiki Login-Token fehlt in API-Antwort")
            token_cookies = dict(token_response.cookies)

            # Step 2: perform the login with the token and its cookies.
            login_response = await client.post(self.api_url, params={"format": "json"}, data={
                "action": "login",
                "lgname": self.user,
                "lgpassword": self.password,
                "lgtoken": token,
            }, cookies=token_cookies)
            login_response.raise_for_status()
            login_info = self._parse_mw_response(login_response.json()).get("login") or {}
            if login_info.get("result") != "Success":
                reason = login_info.get("reason", "unbekannt")
                raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

            # Keep cookies from both steps: the login response is not
            # guaranteed to repeat a cookie that was set with the token.
            self._cookies = {**token_cookies, **dict(login_response.cookies)}
            self._logged_in = True
            logger.info("SMW Login erfolgreich als '%s'", self.user)

    def _parse_mw_response(self, data: dict) -> dict:
        """Return ``data`` unchanged unless it carries a MediaWiki error.

        MediaWiki often answers HTTP 200 with ``{"error": {...}}``; turning
        that into an exception here avoids KeyErrors further down.

        Raises:
            SmwClientError: if ``data`` is not a dict or contains an ``error`` object.
        """
        if not isinstance(data, dict):
            raise SmwClientError("Ungültige API-Antwort (kein JSON-Objekt)")
        err = data.get("error")
        if isinstance(err, dict):
            code = err.get("code", "")
            info = err.get("info") or err.get("*") or err.get("message") or ""
            raise SmwClientError(f"MediaWiki API: {code}: {info}".strip())
        return data

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request and return the parsed JSON response.

        Logs in first if necessary. ``params`` is not modified; ``format=json``
        is added to a copy.

        Raises:
            SmwClientError: for API errors and for any transport, HTTP-status
                or JSON-decoding failure (the original exception is chained).
        """
        query = {**params, "format": "json"}
        try:
            # The login sits inside the try so its transport errors are
            # wrapped in SmwClientError like every other failure.
            if not self._logged_in:
                await self.login()
            async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
                response = await client.get(self.api_url, params=query)
                response.raise_for_status()
                return self._parse_mw_response(response.json())
        except SmwClientError:
            raise
        except Exception as e:
            raise SmwClientError(str(e)) from e

    # ------------------------------------------------------------------ #
    #  Categories                                                          #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _category_key(name: str) -> str:
        """Return a key for recognising a category that was already visited.

        Underscores count as spaces and the first letter is upper-cased,
        mirroring how MediaWiki treats page titles by default.
        """
        key = name.replace("_", " ").strip()
        return key[:1].upper() + key[1:]

    async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
        """Return the pages of a category, optionally descending into subcategories.

        Args:
            category: Category name, with or without a ``Kategorie:`` prefix.
            limit: Maximum number of pages to return; values <= 0 yield ``[]``.
            recursive: Also collect pages from all subcategories (depth first).

        Returns:
            The ``categorymembers`` entries from the API (requested with
            ``cmprop=ids|title``). Each page appears at most once, and every
            category is visited at most once, so category cycles terminate.

        Raises:
            SmwClientError: if the category name is empty or a request fails.
        """
        cat = _normalize_mw_category(category)
        if limit <= 0:
            return []
        members: list[dict] = []
        await self._collect_category_members(cat, limit, recursive, members, set(), set())
        return members[:limit]

    async def _collect_category_members(
        self,
        cat: str,
        limit: int,
        recursive: bool,
        members: list,
        seen_pages: set,
        visited: set,
    ) -> None:
        """Append the not-yet-seen pages of ``cat`` (and its subcategories) to ``members``.

        ``seen_pages`` and ``visited`` are shared across the whole traversal
        to drop duplicate pages and to break category cycles.
        """
        key = self._category_key(cat)
        if key in visited:
            return
        visited.add(key)

        count_before = len(members)
        cmcontinue = None
        while len(members) < limit:
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"Kategorie:{cat}",
                "cmlimit": min(limit - len(members), self._CATEGORY_PAGE_SIZE),
                "cmtype": "page",  # pages only, no subcategories
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue
            data = await self._get(params)
            for entry in (data.get("query") or {}).get("categorymembers") or []:
                page_key = entry.get("pageid", entry.get("title"))
                if page_key in seen_pages:
                    continue
                seen_pages.add(page_key)
                members.append(entry)
            # Stop when there is no usable continuation token; otherwise the
            # same first page would be requested over and over.
            cmcontinue = (data.get("continue") or {}).get("cmcontinue")
            if not cmcontinue:
                break

        if not recursive or len(members) >= limit:
            return

        subcats = await self._get_subcategories(cat)
        logger.info(
            "Kategorie '%s': %d direkte Seiten, %d Unterkategorien",
            cat, len(members) - count_before, len(subcats),
        )
        for subcat in subcats:
            if len(members) >= limit:
                break
            title = subcat.get("title")
            if not title:
                continue
            # The normaliser strips only the leading namespace prefix.
            await self._collect_category_members(
                _normalize_mw_category(title), limit, True, members, seen_pages, visited,
            )

    async def _get_subcategories(self, category: str) -> list[dict]:
        """Return all direct subcategories of a category as API ``categorymembers`` entries."""
        subcats: list[dict] = []
        cmcontinue = None
        cat = _normalize_mw_category(category)
        while True:
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"Kategorie:{cat}",
                "cmlimit": self._CATEGORY_PAGE_SIZE,
                "cmtype": "subcat",  # subcategories only
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue
            data = await self._get(params)
            subcats.extend((data.get("query") or {}).get("categorymembers") or [])
            cmcontinue = (data.get("continue") or {}).get("cmcontinue")
            if not cmcontinue:
                return subcats

    # ------------------------------------------------------------------ #
    #  Page content                                                        #
    # ------------------------------------------------------------------ #

    async def get_page_wikitext(self, title: str) -> str:
        """Return the raw wikitext of the main slot of a page's latest revision.

        Raises:
            SmwClientError: if the page is missing or invalid, or the response
                carries no revision content.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        pages = (data.get("query") or {}).get("pages")
        page = next(iter(pages.values()), None) if isinstance(pages, dict) else None
        if not isinstance(page, dict) or "missing" in page or "invalid" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        try:
            return page["revisions"][0]["slots"]["main"]["*"]
        except (KeyError, IndexError, TypeError) as exc:
            raise SmwClientError(f"Seite '{title}': kein Wikitext in API-Antwort") from exc

    async def get_page_html(self, title: str) -> str:
        """Return the parsed HTML of a page (``action=parse``, ``prop=text``).

        Raises:
            SmwClientError: if the response carries no parsed text.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        try:
            return data["parse"]["text"]["*"]
        except (KeyError, TypeError) as exc:
            raise SmwClientError(f"Seite '{title}': kein HTML in API-Antwort") from exc

    # ------------------------------------------------------------------ #
    #  Semantic MediaWiki                                                  #
    # ------------------------------------------------------------------ #

    async def browse_subject(self, title: str) -> dict:
        """Return the SMW properties of a page via ``action=browsebysubject``.

        Returns:
            ``{property_name: [value, ...]}``. Properties whose name starts
            with ``_`` (SMW internals) and empty values are left out.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })
        result: dict = {}
        for prop_data in (data.get("query") or {}).get("data") or []:
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):  # skip internal SMW properties
                continue
            values = []
            for item in prop_data.get("dataitem") or []:
                if not isinstance(item, dict):
                    continue
                # SMW sometimes encodes values as "Value#0##"; keep the part
                # before the first "#".
                clean = str(item.get("item") or "").split("#")[0].strip()
                if clean:
                    values.append(clean)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """Run a semantic query via ``action=ask``.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``.

        Returns:
            A list of ``{"title": str, "properties": {name: [value, ...]}}``.
            An empty result set yields ``[]``.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })
        raw_results = (data.get("query") or {}).get("results")
        # Empty result sets and empty printouts arrive as JSON lists, not
        # objects, so only dicts are iterated.
        if not isinstance(raw_results, dict):
            return []
        results = []
        for title, props in raw_results.items():
            entry = {"title": title, "properties": {}}
            printouts = props.get("printouts") if isinstance(props, dict) else None
            if isinstance(printouts, dict):
                for prop_name, prop_data in printouts.items():
                    entry["properties"][prop_name] = [
                        (item.get("fulltext") or item.get("raw") or str(item))
                        if isinstance(item, dict) else str(item)
                        for item in prop_data
                    ]
            results.append(entry)
        return results

    # ------------------------------------------------------------------ #
    #  Schema discovery                                                    #
    # ------------------------------------------------------------------ #

    async def discover_properties(self, sample_title: str) -> dict:
        """Return all SMW properties of a sample page and log their names.

        Useful for finding out the property names before building a mapper.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props