All checks were successful
Deploy Development / deploy (push) Successful in 38s
Test Suite / pytest-backend (push) Successful in 40s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 12s
Test Suite / k6 /health Baseline (push) Successful in 33s
Test Suite / playwright-tests (push) Successful in 1m17s
Test Suite / pytest-backend (pull_request) Successful in 35s
Test Suite / lint-backend (pull_request) Successful in 0s
Test Suite / build-frontend (pull_request) Successful in 12s
Test Suite / k6 /health Baseline (pull_request) Successful in 33s
Test Suite / playwright-tests (pull_request) Successful in 1m8s
- Introduced `_normalize_mw_category` function to clean category names for API calls, ensuring consistent handling of category prefixes.
- Updated `SmwClient` methods to utilize normalized category names, improving data retrieval accuracy.
- Added `_wiki_category_or_default` function to provide default categories based on import type, enhancing user experience during imports.
- Integrated new fields `karate_relevance` and `relevance_level` into various admin components, allowing for better skill management.
- Incremented app version to 0.8.145 and updated changelog to reflect these changes.
294 lines
11 KiB
Python
294 lines
11 KiB
Python
"""
Semantic MediaWiki API client.

Talks to the MediaWiki API directly (no XML export).
Supports login, category queries and the SMW browse API.
"""
|
|
import os
|
|
import logging
|
|
import httpx
|
|
from typing import Optional
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Connection defaults, read from the environment once at import time.
# Each one falls back to the empty string when the variable is unset.
MEDIAWIKI_API_URL = os.environ.get("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.environ.get("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.environ.get("MEDIAWIKI_PASSWORD", "")
|
|
|
|
|
|
class SmwClientError(Exception):
    """Error type raised by the MediaWiki/SMW client code in this module.

    Used for missing configuration, failed logins, error payloads returned
    by the API and wrapped request failures.
    """
|
|
|
|
|
|
def _normalize_mw_category(category: str) -> str:
|
|
"""
|
|
Bereinigt Kategorienamen für API cmtitle=Kategorie:…
|
|
Erlaubt z. B. 'Fähigkeitsbeschreibung', ' Kategorie:X ', 'kategorie:X' ohne Doppel-Prefix.
|
|
"""
|
|
c = (category or "").strip()
|
|
if not c:
|
|
raise SmwClientError("Kategorie (Seitenlisten-Name ohne Präfix) darf nicht leer sein")
|
|
|
|
pref = "kategorie:"
|
|
while c.lower().startswith(pref):
|
|
c = c[len(pref) :].lstrip()
|
|
|
|
remaining = (c or "").strip()
|
|
if not remaining:
|
|
raise SmwClientError("Kategorie (Seitenlisten-Name ohne Präfix) darf nicht leer sein")
|
|
|
|
return remaining
|
|
|
|
|
|
class SmwClient:
    """Async client for the MediaWiki / Semantic MediaWiki HTTP API.

    The client logs in lazily on the first request and keeps the resulting
    session cookies on the instance; every request opens its own short-lived
    ``httpx.AsyncClient`` that is seeded with those cookies.
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """Store connection settings, falling back to the module-level defaults.

        Args:
            api_url: URL of the wiki's API endpoint; a trailing ``/`` is removed.
            user: Login name.
            password: Login password.

        Raises:
            SmwClientError: if no API URL is given and none is configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False

        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    #  Authentication                                                    #
    # ------------------------------------------------------------------ #

    async def login(self) -> None:
        """Log in to MediaWiki in two steps: fetch a login token, then log in.

        On success the session cookies are stored on the instance and the
        client is marked as logged in.

        Raises:
            SmwClientError: if the token is missing, the API reports an error
                or the login result is not ``"Success"``.
        """
        async with httpx.AsyncClient(timeout=15) as client:
            # Step 1: fetch the login token.
            token_response = await client.get(self.api_url, params={
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json",
            })
            token_response.raise_for_status()
            token_data = self._parse_mw_response(token_response.json())
            tokens = (token_data.get("query") or {}).get("tokens") or {}
            login_token = tokens.get("logintoken")
            if not login_token:
                raise SmwClientError("MediaWiki Login-Token fehlt in API-Antwort")

            # Step 2: log in. The client keeps the cookies set by step 1 in
            # its own jar, so they are sent automatically; passing them again
            # per request is unnecessary (and deprecated in httpx).
            login_response = await client.post(self.api_url, params={"format": "json"}, data={
                "action": "login",
                "lgname": self.user,
                "lgpassword": self.password,
                "lgtoken": login_token,
            })
            login_response.raise_for_status()
            result = self._parse_mw_response(login_response.json())

            login_info = result.get("login") or {}
            if login_info.get("result") != "Success":
                reason = login_info.get("reason", "unbekannt")
                raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

            # Keep the whole jar (cookies from both steps), not only the
            # cookies that happened to be set by the second response.
            self._cookies = dict(client.cookies)
            self._logged_in = True
            logger.info("SMW Login erfolgreich als '%s'", self.user)

    def _parse_mw_response(self, data: dict) -> dict:
        """Validate a decoded API response and return it unchanged.

        MediaWiki often answers HTTP 200 with ``{"error": {...}}``; turning
        that into an exception here avoids KeyErrors further down.

        Raises:
            SmwClientError: if ``data`` is not a dict or carries an error object.
        """
        if not isinstance(data, dict):
            raise SmwClientError("Ungültige API-Antwort (kein JSON-Objekt)")
        err = data.get("error")
        if isinstance(err, dict):
            code = err.get("code", "")
            info = err.get("info") or err.get("*") or err.get("message") or ""
            raise SmwClientError(f"MediaWiki API: {code}: {info}".strip())
        return data

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request to the API and return the JSON body.

        Logs in first if necessary. ``params`` is not modified; ``format=json``
        is added to a copy.

        Raises:
            SmwClientError: for API errors and for any other failure (login,
                transport, HTTP status, JSON decoding), with the original
                exception attached as the cause.
        """
        query = {**params, "format": "json"}
        try:
            # Login happens inside the try so its failures are wrapped too.
            if not self._logged_in:
                await self.login()
            async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
                response = await client.get(self.api_url, params=query)
                response.raise_for_status()
                return self._parse_mw_response(response.json())
        except SmwClientError:
            raise
        except Exception as e:
            raise SmwClientError(str(e)) from e

    # ------------------------------------------------------------------ #
    #  Categories                                                        #
    # ------------------------------------------------------------------ #

    async def _list_category_members(self, cat: str, cmtype: str, limit: Optional[int] = None) -> list[dict]:
        """Fetch the direct members of ``Kategorie:<cat>`` of one member type.

        Follows ``cmcontinue`` until the listing is exhausted or, when
        ``limit`` is given (must be > 0), until ``limit`` entries are collected.
        """
        found: list[dict] = []
        cmcontinue = None

        while True:
            page_size = 500 if limit is None else min(limit - len(found), 500)
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"Kategorie:{cat}",
                "cmlimit": page_size,
                "cmtype": cmtype,
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue

            data = await self._get(params)
            found.extend((data.get("query") or {}).get("categorymembers") or [])

            # Stop when there is no continuation token; otherwise the same
            # first page would be requested again and again.
            cmcontinue = (data.get("continue") or {}).get("cmcontinue")
            if not cmcontinue:
                break
            if limit is not None and len(found) >= limit:
                break

        return found if limit is None else found[:limit]

    async def _collect_category_members(self, cat: str, limit: int, recursive: bool, visited: set) -> list[dict]:
        """Collect pages of ``cat`` and, if requested, of its unvisited subcategories.

        ``visited`` holds the category names already traversed in this call
        tree, so cyclic or diamond-shaped hierarchies are walked only once.
        """
        visited.add(cat)
        members = await self._list_category_members(cat, "page", limit)

        if recursive:
            subcats = await self._get_subcategories(cat)
            logger.info(
                "Kategorie '%s': %d direkte Seiten, %d Unterkategorien",
                cat, len(members), len(subcats),
            )

            for subcat in subcats:
                if len(members) >= limit:
                    break
                title = subcat.get("title")
                if not title:
                    continue
                # Strips only the leading namespace prefix, never text inside the name.
                subcat_name = _normalize_mw_category(title)
                if subcat_name in visited:
                    continue
                members.extend(
                    await self._collect_category_members(subcat_name, limit - len(members), True, visited)
                )

        return members[:limit]

    async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
        """Return the pages of a category, optionally descending into subcategories.

        Args:
            category: Category name, with or without the ``Kategorie:`` prefix.
            limit: Maximum number of entries to return; ``<= 0`` yields ``[]``.
            recursive: Also collect pages from subcategories. Each category is
                visited at most once.

        Returns:
            The ``categorymembers`` entries from the API (requested with
            ``cmprop=ids|title``), at most ``limit`` of them.

        Raises:
            SmwClientError: if the category name is empty or a request fails.
        """
        cat = _normalize_mw_category(category)
        if limit <= 0:
            return []
        return await self._collect_category_members(cat, limit, recursive, set())

    async def _get_subcategories(self, category: str) -> list[dict]:
        """Return all direct subcategories of a category."""
        cat = _normalize_mw_category(category)
        return await self._list_category_members(cat, "subcat")

    # ------------------------------------------------------------------ #
    #  Page content                                                      #
    # ------------------------------------------------------------------ #

    async def get_page_wikitext(self, title: str) -> str:
        """Fetch the raw wikitext of a page.

        Raises:
            SmwClientError: if the page is missing or the response does not
                contain the main-slot content.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        pages = (data.get("query") or {}).get("pages") or {}
        page = next(iter(pages.values()), None) if isinstance(pages, dict) else None
        if not isinstance(page, dict) or "missing" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        try:
            return page["revisions"][0]["slots"]["main"]["*"]
        except (KeyError, IndexError, TypeError) as exc:
            raise SmwClientError(f"Seite '{title}': kein Wikitext in API-Antwort") from exc

    async def get_page_html(self, title: str) -> str:
        """Fetch the parsed HTML of a page.

        Raises:
            SmwClientError: if the response does not contain the parsed text.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        try:
            return data["parse"]["text"]["*"]
        except (KeyError, TypeError) as exc:
            raise SmwClientError(f"Seite '{title}': kein HTML in API-Antwort") from exc

    # ------------------------------------------------------------------ #
    #  Semantic MediaWiki                                                #
    # ------------------------------------------------------------------ #

    async def browse_subject(self, title: str) -> dict:
        """Return the SMW properties of a page via the browse API.

        Returns:
            ``{property_name: [value, ...]}``. Properties whose name starts
            with ``_`` and properties without non-empty values are omitted.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })

        result: dict = {}
        for prop_data in (data.get("query") or {}).get("data") or []:
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):  # skip internal SMW properties
                continue
            values = []
            for item in prop_data.get("dataitem") or []:
                raw = item.get("item", "")
                # SMW sometimes encodes values as "Value#0##"; keep only the
                # part before the first "#".
                # NOTE(review): this also truncates values that legitimately
                # contain "#" - confirm whether that is intended.
                clean = raw.split("#")[0].strip()
                if clean:
                    values.append(clean)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """Run a semantic query via the SMW ask API.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``

        Returns:
            A list of ``{"title": str, "properties": {name: [str, ...]}}``;
            empty when the query has no results.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })

        # An empty result set (and empty printouts) may arrive as a JSON list
        # rather than an object, so only dicts are iterated with .items().
        raw_results = (data.get("query") or {}).get("results")
        if not isinstance(raw_results, dict):
            return []

        results = []
        for title, props in raw_results.items():
            entry = {"title": title, "properties": {}}
            printouts = props.get("printouts") if isinstance(props, dict) else None
            if isinstance(printouts, dict):
                for prop_name, prop_data in printouts.items():
                    values = []
                    for item in prop_data or []:
                        if isinstance(item, dict):
                            values.append(item.get("fulltext") or item.get("raw") or str(item))
                        else:
                            values.append(str(item))
                    entry["properties"][prop_name] = values
            results.append(entry)
        return results

    # ------------------------------------------------------------------ #
    #  Schema discovery                                                  #
    # ------------------------------------------------------------------ #

    async def discover_properties(self, sample_title: str) -> dict:
        """Return all SMW properties of a sample page and log their names.

        Useful for finding out the property names before building a mapper.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props
|