shinkan-jinkendo/backend/smw_client.py
Lars 8b51864b53
Some checks failed
Deploy Development / deploy (push) Successful in 34s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 5s
Test Suite / playwright-tests (push) Failing after 2m2s
feat: add recursive subcategory search for MediaWiki import
Issue: Only 2 exercises found instead of 200+
Root cause: Exercises likely organized in subcategories

Solution:
- Added recursive parameter to get_category_members()
- New _get_subcategories() helper method
- Recursively traverses all subcategories
- Logs subcategory counts for debugging

Behavior:
- get_category_members('Übungen') now finds:
  1. All pages directly in 'Kategorie:Übungen'
  2. All subcategories (e.g. 'Kihon', 'Kata', 'Kumite')
  3. All pages in those subcategories (recursive)

Example structure:
Kategorie:Übungen
├─ Seite: Übung A (2 direkte)
├─ Kategorie:Kihon
│  ├─ Seite: Mae-Geri
│  └─ Seite: Gyaku-Zuki
└─ Kategorie:Kata
   └─ Seite: Heian Shodan
2026-04-24 17:53:11 +02:00

255 lines
9.7 KiB
Python

"""
Semantic MediaWiki API Client
Greift direkt auf die MediaWiki API zu (kein XML-Export).
Unterstützt Login, Kategorien-Abfragen und SMW Browse-API.
"""
import os
import logging
import httpx
from typing import Optional
# Module-wide logger, named after the importing module.
logger = logging.getLogger(__name__)

# Connection settings are read once at import time from the process
# environment; a variable that is not set falls back to an empty string.
MEDIAWIKI_API_URL = os.environ.get("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.environ.get("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.environ.get("MEDIAWIKI_PASSWORD", "")
class SmwClientError(Exception):
    """Raised when the MediaWiki/SMW client is misconfigured or a request fails."""
class SmwClient:
    """MediaWiki / Semantic MediaWiki API client with a lazily created login session.

    The first call that goes through ``_get`` triggers ``login``; the session
    cookies obtained there are replayed on every subsequent request.
    """

    # Namespace prefix used to address category pages on the wiki.
    _CATEGORY_NAMESPACE = "Kategorie"
    # Largest ``cmlimit`` this client ever asks for in a single request.
    _MAX_BATCH_SIZE = 500

    def __init__(
        self,
        api_url: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """Store connection settings, falling back to the module-level defaults.

        Raises:
            SmwClientError: if no API URL is given and none is configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False
        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    # Authentication                                                       #
    # ------------------------------------------------------------------ #
    async def login(self) -> None:
        """Log in to MediaWiki (fetch a login token, then submit credentials).

        Both requests share one ``httpx.AsyncClient`` so the client's cookie
        jar carries the session from the token request to the login request.
        On success the complete jar is kept for later requests.

        Raises:
            SmwClientError: if no login token is returned or the login is
                rejected.
            httpx.HTTPStatusError: on a non-2xx HTTP response.
        """
        async with httpx.AsyncClient(timeout=15) as client:
            # Step 1: obtain a login token.
            token_response = await client.get(self.api_url, params={
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json",
            })
            token_response.raise_for_status()
            try:
                token = token_response.json()["query"]["tokens"]["logintoken"]
            except (KeyError, TypeError) as exc:
                raise SmwClientError(
                    "MediaWiki Login fehlgeschlagen: kein Login-Token erhalten"
                ) from exc

            # Step 2: submit the credentials together with the token.
            login_response = await client.post(
                self.api_url,
                params={"format": "json"},
                data={
                    "action": "login",
                    "lgname": self.user,
                    "lgpassword": self.password,
                    "lgtoken": token,
                },
            )
            login_response.raise_for_status()
            outcome = login_response.json().get("login", {})
            if outcome.get("result") != "Success":
                reason = outcome.get("reason", "unbekannt")
                raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

            # Keep every cookie collected during the exchange, not only the
            # ones set by the final response.
            session_cookies = dict(client.cookies)

        self._cookies = session_cookies
        self._logged_in = True
        logger.info("SMW Login erfolgreich als '%s'", self.user)

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request to the API and return the JSON body.

        Logs in first if that has not happened yet. ``params`` is not
        modified; ``format=json`` is added to a copy.
        """
        if not self._logged_in:
            await self.login()
        query = {**params, "format": "json"}
        async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
            response = await client.get(self.api_url, params=query)
            response.raise_for_status()
            return response.json()

    # ------------------------------------------------------------------ #
    # Categories                                                           #
    # ------------------------------------------------------------------ #
    async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
        """Return the pages of a category, optionally descending into subcategories.

        Categories are walked depth-first in the order the API lists them.
        Every category is visited at most once, so cyclic category graphs
        terminate, and a page reachable through several categories is
        returned only once.

        Args:
            category: Category name without the namespace prefix.
            limit: Maximum number of pages to return; values <= 0 yield ``[]``.
            recursive: Whether to include pages of (nested) subcategories.

        Returns:
            The ``categorymembers`` entries as delivered by the API
            (requested with ``cmprop=ids|title``).

        Raises:
            SmwClientError: if the API answers with an error payload.
        """
        if limit <= 0:
            return []

        batch_size = min(limit, self._MAX_BATCH_SIZE)
        members: list[dict] = []
        seen_pages: set = set()
        visited: set = set()
        # Stack of full category titles that still have to be processed.
        pending = [f"{self._CATEGORY_NAMESPACE}:{category}"]

        while pending and len(members) < limit:
            cmtitle = pending.pop()
            if cmtitle in visited:
                continue
            visited.add(cmtitle)

            count_before = len(members)
            token = None
            while len(members) < limit:
                batch, token = await self._fetch_category_batch(cmtitle, "page", batch_size, token)
                for page in batch:
                    page_key = page.get("pageid", page.get("title"))
                    if page_key in seen_pages:
                        continue
                    seen_pages.add(page_key)
                    members.append(page)
                    if len(members) >= limit:
                        break
                if token is None:
                    break

            if not recursive or len(members) >= limit:
                continue

            subcats = await self._list_subcategories(cmtitle)
            logger.info(
                "Kategorie '%s': %d direkte Seiten, %d Unterkategorien",
                cmtitle.split(":", 1)[-1],
                len(members) - count_before,
                len(subcats),
            )
            # Push in reverse so the first subcategory is popped first. The
            # titles returned by the API already carry the namespace prefix
            # and are passed on unchanged.
            pending.extend(sub["title"] for sub in reversed(subcats))

        return members

    async def _get_subcategories(self, category: str) -> list[dict]:
        """Return all direct subcategories of ``category`` (name without namespace)."""
        return await self._list_subcategories(f"{self._CATEGORY_NAMESPACE}:{category}")

    async def _list_subcategories(self, cmtitle: str) -> list[dict]:
        """Return all direct subcategories of the category with full title ``cmtitle``."""
        subcats: list[dict] = []
        token = None
        while True:
            batch, token = await self._fetch_category_batch(cmtitle, "subcat", self._MAX_BATCH_SIZE, token)
            subcats.extend(batch)
            if token is None:
                return subcats

    async def _fetch_category_batch(
        self,
        cmtitle: str,
        cmtype: str,
        batch_size: int,
        cmcontinue: Optional[str] = None,
    ) -> tuple[list[dict], Optional[str]]:
        """Fetch one batch of ``list=categorymembers`` results.

        Returns:
            ``(entries, next_token)``; ``next_token`` is ``None`` when the
            response carries no ``cmcontinue`` value, i.e. there is nothing
            further to request.

        Raises:
            SmwClientError: if the response contains an ``error`` key.
        """
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": cmtitle,
            "cmlimit": batch_size,
            "cmtype": cmtype,
            "cmprop": "ids|title",
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue
        data = await self._get(params)
        if "error" in data:
            raise SmwClientError(f"MediaWiki API-Fehler für '{cmtitle}': {data['error']}")
        entries = data.get("query", {}).get("categorymembers", [])
        next_token = data.get("continue", {}).get("cmcontinue")
        return entries, next_token or None

    # ------------------------------------------------------------------ #
    # Page content                                                         #
    # ------------------------------------------------------------------ #
    async def get_page_wikitext(self, title: str) -> str:
        """Return the raw wikitext of the main slot of a page's revision.

        Raises:
            SmwClientError: if the API reports an error, the page is missing
                or invalid, or no revision content is returned.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        if "error" in data:
            raise SmwClientError(f"MediaWiki API-Fehler für '{title}': {data['error']}")
        pages = data.get("query", {}).get("pages", {})
        # Only one title was requested, so the first entry is the page.
        page = next(iter(pages.values()), None)
        if page is None or "missing" in page or "invalid" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        revisions = page.get("revisions")
        if not revisions:
            raise SmwClientError(f"Seite '{title}' hat keine Revisionen")
        return revisions[0]["slots"]["main"]["*"]

    async def get_page_html(self, title: str) -> str:
        """Return the parsed HTML of a page.

        Raises:
            SmwClientError: if the API answers with an error payload.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        if "error" in data:
            raise SmwClientError(f"MediaWiki API-Fehler für '{title}': {data['error']}")
        return data["parse"]["text"]["*"]

    # ------------------------------------------------------------------ #
    # Semantic MediaWiki                                                   #
    # ------------------------------------------------------------------ #
    async def browse_subject(self, title: str) -> dict:
        """Return the SMW properties of a page via ``action=browsebysubject``.

        Returns:
            ``{property_name: [value, ...]}``. Properties whose name starts
            with ``_`` and properties without non-empty values are omitted.

        Raises:
            SmwClientError: if the API answers with an error payload.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })
        if "error" in data:
            raise SmwClientError(f"SMW Browse-Fehler für '{title}': {data['error']}")

        result = {}
        for prop_data in data.get("query", {}).get("data", []):
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):  # skip internal SMW properties
                continue
            values = []
            for item in prop_data.get("dataitem", []):
                # Values can look like "Wert#0##"; keep only the part before
                # the first "#".
                # NOTE(review): this also truncates plain-text values that
                # legitimately contain "#" - confirm whether that is intended.
                cleaned = item.get("item", "").split("#")[0].strip()
                if cleaned:
                    values.append(cleaned)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """Run a semantic query via ``action=ask``.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``.

        Returns:
            A list of ``{"title": ..., "properties": {name: [value, ...]}}``.

        Raises:
            SmwClientError: if the API answers with an error payload.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })
        if "error" in data:
            raise SmwClientError(f"SMW Ask-Fehler: {data['error']}")

        raw_results = data.get("query", {}).get("results", {})
        # An empty result set (and an empty printout map below) can arrive as
        # a JSON list instead of an object; treat that as "nothing".
        if not isinstance(raw_results, dict):
            return []

        results = []
        for title, props in raw_results.items():
            printouts = props.get("printouts", {})
            if not isinstance(printouts, dict):
                printouts = {}
            properties = {
                prop_name: [self._printout_value(item) for item in items]
                for prop_name, items in printouts.items()
            }
            results.append({"title": title, "properties": properties})
        return results

    @staticmethod
    def _printout_value(item):
        """Reduce one printout entry to a display value.

        Dict entries yield their ``fulltext``, else ``raw``, else their string
        form; anything else is converted with ``str``.
        """
        if isinstance(item, dict):
            return item.get("fulltext") or item.get("raw") or str(item)
        return str(item)

    # ------------------------------------------------------------------ #
    # Schema discovery                                                     #
    # ------------------------------------------------------------------ #
    async def discover_properties(self, sample_title: str) -> dict:
        """Return and log all SMW properties of a sample page.

        Handy for finding out the property names in use on the wiki.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props