Issue: Only 2 exercises were found instead of the expected 200+.
Root cause: The exercises are most likely organized in subcategories, which the category query did not traverse.
Solution:
- Added recursive parameter to get_category_members()
- New _get_subcategories() helper method
- Recursively traverses all subcategories
- Logs subcategory counts for debugging
Behavior:
- get_category_members('Übungen') now finds:
1. All pages directly in 'Kategorie:Übungen'
2. All subcategories (e.g. 'Kihon', 'Kata', 'Kumite')
3. All pages in those subcategories (recursive)
Example structure:
Kategorie:Übungen
├─ Seite: Übung A (2 direct pages)
├─ Kategorie:Kihon
│  ├─ Seite: Mae-Geri
│  └─ Seite: Gyaku-Zuki
└─ Kategorie:Kata
   └─ Seite: Heian Shodan
255 lines
9.7 KiB
Python
255 lines
9.7 KiB
Python
"""
Semantic MediaWiki API client.

Talks directly to the MediaWiki API (no XML export).
Supports login, category queries and the SMW browse API.
"""
|
|
import os
|
|
import logging
|
|
import httpx
|
|
from typing import Optional
|
|
|
|
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Connection settings, read from the environment once at import time.
# Each one falls back to an empty string when the variable is not set.
MEDIAWIKI_API_URL = os.environ.get("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.environ.get("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.environ.get("MEDIAWIKI_PASSWORD", "")
|
|
|
|
|
|
class SmwClientError(Exception):
    """Raised when the MediaWiki/SMW client is misconfigured or an API call fails."""
|
|
|
|
|
|
class SmwClient:
    """MediaWiki / Semantic MediaWiki API client with a lazily established login session.

    The client logs in on the first API call and stores the resulting session
    cookies on the instance; every request opens a short-lived ``httpx``
    client that carries those cookies.

    Category titles are built with the German namespace prefix ``Kategorie:``.
    """

    # Namespace prefix put in front of category names when querying the API.
    _CATEGORY_PREFIX = "Kategorie:"

    def __init__(
        self,
        api_url: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """Create a client; omitted arguments fall back to the module-level settings.

        Args:
            api_url: URL of the wiki's ``api.php``; a trailing slash is removed.
            user: Login name.
            password: Login password.

        Raises:
            SmwClientError: If no API URL is given and none is configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False

        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    #  Authentication                                                      #
    # ------------------------------------------------------------------ #

    async def login(self) -> None:
        """Log in to MediaWiki in two steps: fetch a login token, then submit it.

        On success the session cookies are stored on the instance and the
        client is marked as logged in.

        Raises:
            SmwClientError: If the API does not report the login as ``Success``.
            httpx.HTTPStatusError: If either request returns an error status.
        """
        async with httpx.AsyncClient(timeout=15) as client:
            # Step 1: fetch a login token. The session cookie set by this
            # response stays in the client's cookie jar and is sent
            # automatically with the next request.
            token_response = await client.get(self.api_url, params={
                "action": "query",
                "meta": "tokens",
                "type": "login",
                "format": "json",
            })
            token_response.raise_for_status()
            token = token_response.json()["query"]["tokens"]["logintoken"]

            # Step 2: submit the credentials together with the token.
            login_response = await client.post(
                self.api_url,
                params={"format": "json"},
                data={
                    "action": "login",
                    "lgname": self.user,
                    "lgpassword": self.password,
                    "lgtoken": token,
                },
            )
            login_response.raise_for_status()
            outcome = login_response.json().get("login", {})

            if outcome.get("result") != "Success":
                reason = outcome.get("reason", "unbekannt")
                raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

            # Keep every cookie collected during the exchange, not only the
            # ones set by the second response.
            self._cookies = {
                cookie.name: cookie.value for cookie in client.cookies.jar
            }

        self._logged_in = True
        logger.info("SMW Login erfolgreich als '%s'", self.user)

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request to the API and return the decoded JSON.

        Logs in first if that has not happened yet. ``format=json`` is added
        to a copy of *params*, so the caller's dict is left untouched.

        Raises:
            httpx.HTTPStatusError: If the request returns an error status.
        """
        if not self._logged_in:
            await self.login()
        query = {**params, "format": "json"}
        async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
            response = await client.get(self.api_url, params=query)
            response.raise_for_status()
            return response.json()

    # ------------------------------------------------------------------ #
    #  Categories                                                          #
    # ------------------------------------------------------------------ #

    def _category_name(self, title: str) -> str:
        """Return *title* without a leading ``Kategorie:`` namespace prefix."""
        return title.removeprefix(self._CATEGORY_PREFIX)

    async def _list_category(
        self,
        category: str,
        member_type: str,
        wanted: Optional[int] = None,
        skip_ids: Optional[set] = None,
    ) -> list[dict]:
        """Page through ``list=categorymembers`` for one category.

        Args:
            category: Category name without namespace prefix.
            member_type: Value for ``cmtype`` (``"page"`` or ``"subcat"``).
            wanted: Stop once this many entries were collected; ``None``
                collects everything.
            skip_ids: Page ids to leave out of the result.

        Returns:
            The collected ``{"pageid": ..., "title": ...}`` entries.

        Raises:
            SmwClientError: If the API response carries an ``error`` object.
        """
        skip = skip_ids if skip_ids is not None else set()
        found: list[dict] = []
        cmcontinue = None

        while True:
            batch_size = 500
            if wanted is not None:
                # Never ask for more than still needed, and never for less than 1.
                batch_size = max(1, min(wanted - len(found), 500))
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"{self._CATEGORY_PREFIX}{category}",
                "cmlimit": batch_size,
                "cmtype": member_type,
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue

            data = await self._get(params)
            if "error" in data:
                raise SmwClientError(
                    f"Kategorie-Abfrage für '{category}' fehlgeschlagen: {data['error']}"
                )

            for entry in data.get("query", {}).get("categorymembers", []):
                page_id = entry.get("pageid")
                if page_id is not None and page_id in skip:
                    continue
                found.append(entry)
                if wanted is not None and len(found) >= wanted:
                    return found

            # Stop when there is no continuation token; looping on a missing
            # token would re-request the first batch forever.
            cmcontinue = data.get("continue", {}).get("cmcontinue")
            if not cmcontinue:
                return found

    async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
        """Return the pages of a category, optionally descending into subcategories.

        The category tree is walked depth-first: the direct pages of a
        category come first, followed by the content of each of its
        subcategories in API order. Every category is visited at most once,
        so cyclic category graphs terminate, and a page that belongs to
        several visited categories is returned only once.

        Args:
            category: Category name, with or without the ``Kategorie:`` prefix.
            limit: Maximum number of pages to return.
            recursive: Whether to include pages from subcategories.

        Returns:
            A list of ``{"pageid": int, "title": str}`` dicts with at most
            *limit* entries.

        Raises:
            SmwClientError: If the API reports an error for a category query.
        """
        members: list[dict] = []
        if limit <= 0:
            return members

        seen_page_ids: set = set()
        visited: set[str] = set()
        # Explicit stack instead of recursion; the root goes first.
        pending = [self._category_name(category)]

        while pending and len(members) < limit:
            current = pending.pop()
            if current in visited:
                continue
            visited.add(current)

            pages = await self._list_category(
                current,
                "page",  # pages only, no subcategories
                wanted=limit - len(members),
                skip_ids=seen_page_ids,
            )
            members.extend(pages)
            seen_page_ids.update(
                page["pageid"] for page in pages if page.get("pageid") is not None
            )

            if not recursive or len(members) >= limit:
                break

            subcats = await self._get_subcategories(current)
            logger.info(
                "Kategorie '%s': %d direkte Seiten, %d Unterkategorien",
                current, len(pages), len(subcats),
            )
            # Push in reverse so the first subcategory is processed first.
            for subcat in reversed(subcats):
                name = self._category_name(subcat["title"])
                if name not in visited:
                    pending.append(name)

        return members[:limit]

    async def _get_subcategories(self, category: str) -> list[dict]:
        """Return all direct subcategories of a category as ``{"pageid", "title"}`` dicts."""
        return await self._list_category(category, "subcat")

    # ------------------------------------------------------------------ #
    #  Page content                                                        #
    # ------------------------------------------------------------------ #

    async def get_page_wikitext(self, title: str) -> str:
        """Fetch the raw wikitext of a page.

        Raises:
            SmwClientError: If the page is missing or the title is invalid.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        pages = data.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), None)
        if page is None or "missing" in page or "invalid" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        return page["revisions"][0]["slots"]["main"]["*"]

    async def get_page_html(self, title: str) -> str:
        """Fetch the parsed HTML of a page.

        Raises:
            SmwClientError: If the API response carries an ``error`` object.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        if "error" in data:
            raise SmwClientError(
                f"Seite '{title}' konnte nicht geparst werden: {data['error']}"
            )
        return data["parse"]["text"]["*"]

    # ------------------------------------------------------------------ #
    #  Semantic MediaWiki                                                  #
    # ------------------------------------------------------------------ #

    async def browse_subject(self, title: str) -> dict:
        """Return the SMW properties of a page via the ``browsebysubject`` API.

        Properties whose name starts with ``_`` are skipped, and properties
        without any non-empty value are left out.

        Returns:
            A dict ``{property_name: [value, ...]}``.

        Raises:
            SmwClientError: If the API response carries an ``error`` object.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })
        if "error" in data:
            raise SmwClientError(f"SMW Browse-Fehler für '{title}': {data['error']}")

        result = {}
        for prop_data in data.get("query", {}).get("data", []):
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):  # skip internal SMW properties
                continue
            values = []
            for item in prop_data.get("dataitem", []):
                raw = item.get("item", "")
                # SMW sometimes encodes values as "Value#0##": keep only the
                # part before the first '#'.
                clean = raw.split("#")[0].strip()
                if clean:
                    values.append(clean)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """Run a semantic query through the SMW ``ask`` API.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``.

        Args:
            query: Ask query string; ``|limit=<limit>`` is appended to it.
            limit: Maximum number of results requested from SMW.

        Returns:
            A list of ``{"title": str, "properties": {name: [value, ...]}}``.

        Raises:
            SmwClientError: If the API response carries an ``error`` object.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })
        if "error" in data:
            raise SmwClientError(f"SMW Ask-Fehler: {data['error']}")

        raw_results = data.get("query", {}).get("results") or {}
        if isinstance(raw_results, dict):
            rows = list(raw_results.items())
        else:
            # An empty result set arrives as a JSON list instead of an object.
            # NOTE(review): older SMW versions reportedly return a list of
            # result objects carrying the title in "fulltext" - confirm.
            rows = [
                (row.get("fulltext", ""), row)
                for row in raw_results
                if isinstance(row, dict)
            ]

        results = []
        for title, props in rows:
            printouts = props.get("printouts") or {}
            if not isinstance(printouts, dict):
                # Without printout requests SMW sends a list here.
                printouts = {}
            entry = {"title": title, "properties": {}}
            for prop_name, prop_data in printouts.items():
                values = []
                for item in prop_data:
                    if isinstance(item, dict):
                        values.append(item.get("fulltext") or item.get("raw") or str(item))
                    else:
                        values.append(str(item))
                entry["properties"][prop_name] = values
            results.append(entry)
        return results

    # ------------------------------------------------------------------ #
    #  Schema discovery                                                    #
    # ------------------------------------------------------------------ #

    async def discover_properties(self, sample_title: str) -> dict:
        """Return and log all SMW properties of a sample page.

        Useful for finding out the property names before a mapper is built.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props
|