"""
Semantic MediaWiki API client.

Talks directly to the MediaWiki API (no XML export).
Supports login, category queries and the SMW browse/ask APIs.
"""

import logging
import os
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

MEDIAWIKI_API_URL = os.getenv("MEDIAWIKI_API_URL", "")
MEDIAWIKI_USER = os.getenv("MEDIAWIKI_USER", "")
MEDIAWIKI_PASSWORD = os.getenv("MEDIAWIKI_PASSWORD", "")


class SmwClientError(Exception):
    """Raised by SmwClient for configuration, transport and API-level errors."""


class SmwClient:
    """MediaWiki/SMW API client with session login.

    A fresh ``httpx.AsyncClient`` is opened for every request; the cookies
    obtained by :meth:`login` are kept on the instance and attached to each
    subsequent request.
    """

    # Category namespace prefix used for ``cmtitle`` and stripped from the
    # subcategory titles returned by the API.
    _CATEGORY_PREFIX = "Kategorie:"

    # Largest ``cmlimit`` value this client requests per API call.
    _MAX_BATCH = 500

    def __init__(
        self,
        api_url: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
    ):
        """Create a client; arguments fall back to the ``MEDIAWIKI_*`` env values.

        Raises:
            SmwClientError: if no API URL is given or configured.
        """
        self.api_url = (api_url or MEDIAWIKI_API_URL).rstrip("/")
        self.user = user or MEDIAWIKI_USER
        self.password = password or MEDIAWIKI_PASSWORD
        self._cookies: dict = {}
        self._logged_in = False
        if not self.api_url:
            raise SmwClientError("MEDIAWIKI_API_URL nicht konfiguriert")

    # ------------------------------------------------------------------ #
    #  Authentication                                                      #
    # ------------------------------------------------------------------ #

    async def login(self) -> None:
        """Log in to MediaWiki (two steps: fetch a login token, then log in).

        On success the session cookies are stored on the instance and the
        client is marked as logged in.

        Raises:
            SmwClientError: if the token is missing, the login is rejected,
                the API reports an error, or the HTTP request fails.
        """
        try:
            async with httpx.AsyncClient(timeout=15) as client:
                # Step 1: fetch the login token.
                r1 = await client.get(self.api_url, params={
                    "action": "query",
                    "meta": "tokens",
                    "type": "login",
                    "format": "json",
                })
                r1.raise_for_status()
                token_data = self._parse_mw_response(r1.json())
                tokens = (token_data.get("query") or {}).get("tokens") or {}
                tok = tokens.get("logintoken")
                if not tok:
                    raise SmwClientError("MediaWiki Login-Token fehlt in API-Antwort")

                # Step 2: log in. Both requests share one client, so the
                # cookies set by step 1 are sent automatically from its jar.
                r2 = await client.post(
                    self.api_url,
                    params={"format": "json"},
                    data={
                        "action": "login",
                        "lgname": self.user,
                        "lgpassword": self.password,
                        "lgtoken": tok,
                    },
                )
                r2.raise_for_status()
                result = self._parse_mw_response(r2.json())
                login_info = result.get("login") or {}
                if login_info.get("result") != "Success":
                    reason = login_info.get("reason", "unbekannt")
                    raise SmwClientError(f"MediaWiki Login fehlgeschlagen: {reason}")

                # Keep the cookies of BOTH responses, not only those the
                # login response happened to (re)set.
                session_cookies = dict(client.cookies)
        except SmwClientError:
            raise
        except Exception as e:
            # Same boundary behaviour as _get(): callers only need to handle
            # SmwClientError; the original exception stays available as cause.
            raise SmwClientError(str(e)) from e

        self._cookies = session_cookies
        self._logged_in = True
        logger.info("SMW Login erfolgreich als '%s'", self.user)

    def _parse_mw_response(self, data: dict) -> dict:
        """Return *data* unchanged unless it carries a MediaWiki error object.

        MediaWiki often answers HTTP 200 with ``{"error": {...}}``; turning
        that into an exception here avoids KeyErrors further down.

        Raises:
            SmwClientError: if *data* is not a dict or contains an ``error`` dict.
        """
        if not isinstance(data, dict):
            raise SmwClientError("Ungültige API-Antwort (kein JSON-Objekt)")
        err = data.get("error")
        if isinstance(err, dict):
            code = err.get("code", "")
            info = err.get("info") or err.get("*") or err.get("message") or ""
            raise SmwClientError(f"MediaWiki API: {code}: {info}".strip())
        return data

    async def _get(self, params: dict) -> dict:
        """Send an authenticated GET request to the API and return the JSON body.

        Logs in first if that has not happened yet. *params* is not modified.

        Raises:
            SmwClientError: on HTTP/transport failure, invalid JSON or an
                API-level error.
        """
        if not self._logged_in:
            await self.login()
        # Copy so the caller's dict is not mutated by the added format key.
        query = {**params, "format": "json"}
        try:
            async with httpx.AsyncClient(timeout=30, cookies=self._cookies) as client:
                r = await client.get(self.api_url, params=query)
                r.raise_for_status()
                return self._parse_mw_response(r.json())
        except SmwClientError:
            raise
        except Exception as e:
            raise SmwClientError(str(e)) from e

    # ------------------------------------------------------------------ #
    #  Categories                                                          #
    # ------------------------------------------------------------------ #

    async def _list_category(
        self,
        category: str,
        member_type: str,
        limit: Optional[int] = None,
    ) -> list[dict]:
        """Page through ``list=categorymembers`` for one category.

        Args:
            category: category name without the namespace prefix.
            member_type: value for ``cmtype`` (``"page"`` or ``"subcat"``).
            limit: maximum number of entries to return; ``None`` fetches all.
        """
        batch_size = self._MAX_BATCH if limit is None else min(limit, self._MAX_BATCH)
        collected: list[dict] = []
        cmcontinue = None
        while True:
            params = {
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"{self._CATEGORY_PREFIX}{category}",
                "cmlimit": batch_size,
                "cmtype": member_type,
                "cmprop": "ids|title",
            }
            if cmcontinue:
                params["cmcontinue"] = cmcontinue
            data = await self._get(params)
            collected.extend((data.get("query") or {}).get("categorymembers") or [])
            if limit is not None and len(collected) >= limit:
                break
            cmcontinue = (data.get("continue") or {}).get("cmcontinue")
            # Without a continuation token the next request would repeat the
            # first page forever, so stop here.
            if not cmcontinue:
                break
        return collected if limit is None else collected[:limit]

    async def get_category_members(
        self,
        category: str,
        limit: int = 500,
        recursive: bool = True,
    ) -> list[dict]:
        """Return the pages of a category, optionally descending into subcategories.

        Traversal is depth-first in the order the API lists the subcategories.
        Each category is visited at most once (so category cycles terminate)
        and each page is returned at most once.

        Args:
            category: category name without the namespace prefix.
            limit: maximum number of pages to return.
            recursive: also collect pages from (nested) subcategories.

        Returns:
            List of ``{"pageid": int, "title": str}`` dicts, at most *limit* long.
        """
        if limit <= 0:
            return []

        members: list[dict] = []
        seen_pages: set = set()
        visited: set = set()
        stack = [category]
        while stack and len(members) < limit:
            current = stack.pop()
            if current in visited:
                continue
            visited.add(current)

            # Fetch up to `limit` pages: at most len(members) of them can be
            # duplicates, so this always suffices to fill the remaining slots.
            direct = 0
            for page in await self._list_category(current, "page", limit):
                key = page.get("pageid", page.get("title"))
                if key in seen_pages:
                    continue
                seen_pages.add(key)
                members.append(page)
                direct += 1
                if len(members) >= limit:
                    break

            if not recursive or len(members) >= limit:
                break

            subcats = await self._get_subcategories(current)
            logger.info(
                "Kategorie '%s': %d direkte Seiten, %d Unterkategorien",
                current, direct, len(subcats),
            )
            # Push reversed so the first subcategory is popped (visited) first.
            for subcat in reversed(subcats):
                name = (subcat.get("title") or "").removeprefix(self._CATEGORY_PREFIX)
                if name:
                    stack.append(name)
        return members

    async def _get_subcategories(self, category: str) -> list[dict]:
        """Return all direct subcategories of a category."""
        return await self._list_category(category, "subcat")

    # ------------------------------------------------------------------ #
    #  Page content                                                        #
    # ------------------------------------------------------------------ #

    async def get_page_wikitext(self, title: str) -> str:
        """Fetch the raw wikitext of a page (main slot of the returned revision).

        Raises:
            SmwClientError: if the page is missing/invalid or the response
                does not contain the revision content.
        """
        data = await self._get({
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
        })
        pages = (data.get("query") or {}).get("pages") or {}
        page = next(iter(pages.values()), None) if isinstance(pages, dict) else None
        if not isinstance(page, dict) or "missing" in page or "invalid" in page:
            raise SmwClientError(f"Seite '{title}' nicht gefunden")
        try:
            return page["revisions"][0]["slots"]["main"]["*"]
        except (KeyError, IndexError, TypeError) as e:
            raise SmwClientError(
                f"Kein Wikitext für Seite '{title}' in API-Antwort"
            ) from e

    async def get_page_html(self, title: str) -> str:
        """Fetch the parsed HTML content of a page.

        Raises:
            SmwClientError: if the response does not contain the parsed text.
        """
        data = await self._get({
            "action": "parse",
            "page": title,
            "prop": "text",
        })
        try:
            return data["parse"]["text"]["*"]
        except (KeyError, TypeError) as e:
            raise SmwClientError(
                f"Kein HTML für Seite '{title}' in API-Antwort"
            ) from e

    # ------------------------------------------------------------------ #
    #  Semantic MediaWiki                                                  #
    # ------------------------------------------------------------------ #

    async def browse_subject(self, title: str) -> dict:
        """SMW browse API: return the properties (attributes) of a page.

        Properties whose name starts with ``_`` are skipped, as are empty
        values.

        Returns:
            Dict ``{property_name: [value, ...]}``.
        """
        data = await self._get({
            "action": "browsebysubject",
            "subject": title,
        })

        # Normalise to {property_label: [value1, value2]}.
        result = {}
        for prop_data in (data.get("query") or {}).get("data") or []:
            prop_label = prop_data.get("property", "")
            if prop_label.startswith("_"):
                # Skip internal SMW properties.
                continue
            values = []
            for item in prop_data.get("dataitem") or []:
                raw = item.get("item", "")
                # Values may arrive as "Value#0##"; keep the part before the
                # first "#". (Without a "#", split() returns the whole string.)
                clean = raw.split("#")[0].strip()
                if clean:
                    values.append(clean)
            if values:
                result[prop_label] = values
        return result

    async def ask_query(self, query: str, limit: int = 100) -> list[dict]:
        """SMW ask API: run a semantic query.

        Example: ``query = "[[Kategorie:Übungen]]|?Fokusbereich|?Ziel"``

        Returns:
            List of ``{"title": str, "properties": {name: [value, ...]}}``;
            empty when the query has no results.
        """
        data = await self._get({
            "action": "ask",
            "query": f"{query}|limit={limit}",
        })

        # Empty "results"/"printouts" can arrive as a JSON list instead of an
        # object, so only dicts are iterated.
        raw_results = (data.get("query") or {}).get("results") or {}
        if not isinstance(raw_results, dict):
            return []

        results = []
        for title, props in raw_results.items():
            printouts = props.get("printouts") if isinstance(props, dict) else None
            if not isinstance(printouts, dict):
                printouts = {}
            entry = {"title": title, "properties": {}}
            for prop_name, prop_data in printouts.items():
                values = []
                for item in prop_data or []:
                    if isinstance(item, dict):
                        values.append(item.get("fulltext") or item.get("raw") or str(item))
                    else:
                        values.append(str(item))
                entry["properties"][prop_name] = values
            results.append(entry)
        return results

    # ------------------------------------------------------------------ #
    #  Schema discovery                                                    #
    # ------------------------------------------------------------------ #

    async def discover_properties(self, sample_title: str) -> dict:
        """Return (and log the names of) all SMW properties of a sample page.

        Useful for finding the property names before building a mapper.
        """
        props = await self.browse_subject(sample_title)
        logger.info("Properties von '%s': %s", sample_title, list(props.keys()))
        return props