From 8b51864b53424fc8ddfbf3b2a7689d847ec84cc8 Mon Sep 17 00:00:00 2001
From: Lars
Date: Fri, 24 Apr 2026 17:53:11 +0200
Subject: [PATCH] feat: add recursive subcategory search for MediaWiki import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue: Only 2 exercises found instead of 200+
Root cause: Exercises likely organized in subcategories

Solution:
- Added recursive parameter to get_category_members()
- New _get_subcategories() helper method
- Recursively traverses all subcategories
- Logs subcategory counts for debugging

Behavior:
- get_category_members('Übungen') now finds:
  1. All pages directly in 'Kategorie:Übungen'
  2. All subcategories (e.g. 'Kihon', 'Kata', 'Kumite')
  3. All pages in those subcategories (recursive)

Example structure:
  Kategorie:Übungen
  ├─ Seite: Übung A (2 direkte)
  ├─ Kategorie:Kihon
  │  ├─ Seite: Mae-Geri
  │  └─ Seite: Gyaku-Zuki
  └─ Kategorie:Kata
     └─ Seite: Heian Shodan
---
 backend/smw_client.py | 45 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/backend/smw_client.py b/backend/smw_client.py
index 80f36a7..7390c17 100644
--- a/backend/smw_client.py
+++ b/backend/smw_client.py
@@ -83,9 +83,9 @@ class SmwClient:
     # Kategorien #
     # ------------------------------------------------------------------ #

-    async def get_category_members(self, category: str, limit: int = 500) -> list[dict]:
+    async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
         """
-        Gibt alle Seiten einer Kategorie zurück.
+        Gibt alle Seiten einer Kategorie zurück (optional rekursiv durch Unterkategorien).
         Gibt Liste von {"pageid": int, "title": str} zurück.
         """
         members = []
@@ -97,7 +97,7 @@ class SmwClient:
                 "list": "categorymembers",
                 "cmtitle": f"Kategorie:{category}",
                 "cmlimit": min(limit, 500),
-                "cmtype": "page",
+                "cmtype": "page", # Nur Seiten, keine Unterkategorien
                 "cmprop": "ids|title",
             }
             if cmcontinue:
@@ -111,8 +111,47 @@ class SmwClient:
             else:
                 break

+        # Rekursiv durch Unterkategorien gehen
+        if recursive:
+            subcats = await self._get_subcategories(category)
+            logger.info(f"Kategorie '{category}': {len(members)} direkte Seiten, {len(subcats)} Unterkategorien")
+
+            for subcat in subcats:
+                if len(members) >= limit:
+                    break
+                subcat_name = subcat["title"].replace("Kategorie:", "")
+                subcat_members = await self.get_category_members(subcat_name, limit=limit - len(members), recursive=True)
+                members.extend(subcat_members)
+
         return members[:limit]

+    async def _get_subcategories(self, category: str) -> list[dict]:
+        """Gibt alle Unterkategorien einer Kategorie zurück."""
+        subcats = []
+        cmcontinue = None
+
+        while True:
+            params = {
+                "action": "query",
+                "list": "categorymembers",
+                "cmtitle": f"Kategorie:{category}",
+                "cmlimit": 500,
+                "cmtype": "subcat", # Nur Unterkategorien
+                "cmprop": "ids|title",
+            }
+            if cmcontinue:
+                params["cmcontinue"] = cmcontinue
+
+            data = await self._get(params)
+            subcats.extend(data["query"]["categorymembers"])
+
+            if "continue" in data:
+                cmcontinue = data["continue"].get("cmcontinue")
+            else:
+                break
+
+        return subcats
+
     # ------------------------------------------------------------------ #
     # Seiteninhalte #
     # ------------------------------------------------------------------ #