feat: add recursive subcategory search for MediaWiki import
Issue: Only 2 exercises found instead of 200+
Root cause: Exercises likely organized in subcategories
Solution:
- Added recursive parameter to get_category_members()
- New _get_subcategories() helper method
- Recursively traverses all subcategories
- Logs subcategory counts for debugging
Behavior:
- get_category_members('Übungen') now finds:
1. All pages directly in 'Kategorie:Übungen'
2. All subcategories (e.g. 'Kihon', 'Kata', 'Kumite')
3. All pages in those subcategories (recursive)
Example structure:
Kategorie:Übungen
├─ Seite: Übung A (2 direkte)
├─ Kategorie:Kihon
│  ├─ Seite: Mae-Geri
│  └─ Seite: Gyaku-Zuki
└─ Kategorie:Kata
   └─ Seite: Heian Shodan
This commit is contained in:
parent
9d041aaf4f
commit
8b51864b53
|
|
@ -83,9 +83,9 @@ class SmwClient:
|
|||
# Kategorien #
|
||||
# ------------------------------------------------------------------ #
|
||||
|
||||
async def get_category_members(self, category: str, limit: int = 500) -> list[dict]:
|
||||
async def get_category_members(self, category: str, limit: int = 500, recursive: bool = True) -> list[dict]:
|
||||
"""
|
||||
Gibt alle Seiten einer Kategorie zurück.
|
||||
Gibt alle Seiten einer Kategorie zurück (optional rekursiv durch Unterkategorien).
|
||||
Gibt Liste von {"pageid": int, "title": str} zurück.
|
||||
"""
|
||||
members = []
|
||||
|
|
@ -97,7 +97,7 @@ class SmwClient:
|
|||
"list": "categorymembers",
|
||||
"cmtitle": f"Kategorie:{category}",
|
||||
"cmlimit": min(limit, 500),
|
||||
"cmtype": "page",
|
||||
"cmtype": "page", # Nur Seiten, keine Unterkategorien
|
||||
"cmprop": "ids|title",
|
||||
}
|
||||
if cmcontinue:
|
||||
|
|
@ -111,8 +111,47 @@ class SmwClient:
|
|||
else:
|
||||
break
|
||||
|
||||
# Rekursiv durch Unterkategorien gehen
|
||||
if recursive:
|
||||
subcats = await self._get_subcategories(category)
|
||||
logger.info(f"Kategorie '{category}': {len(members)} direkte Seiten, {len(subcats)} Unterkategorien")
|
||||
|
||||
for subcat in subcats:
|
||||
if len(members) >= limit:
|
||||
break
|
||||
subcat_name = subcat["title"].replace("Kategorie:", "")
|
||||
subcat_members = await self.get_category_members(subcat_name, limit=limit - len(members), recursive=True)
|
||||
members.extend(subcat_members)
|
||||
|
||||
return members[:limit]
|
||||
|
||||
async def _get_subcategories(self, category: str) -> list[dict]:
    """Return all direct subcategories of a category.

    Queries the MediaWiki ``categorymembers`` list restricted to
    ``cmtype=subcat`` for ``Kategorie:<category>`` and follows the
    ``cmcontinue`` token until the API stops returning one.

    Args:
        category: Category name; the ``Kategorie:`` namespace prefix is
            prepended here, so callers pass the bare name.

    Returns:
        The ``categorymembers`` entries of every result page, concatenated
        in response order. With ``cmprop=ids|title`` these are presumably
        ``{"pageid": ..., "title": ...}`` dicts -- confirm against the API.

    Raises:
        KeyError: If a response has no ``query.categorymembers`` entry.
    """
    # Everything except the continuation token is identical for each page.
    base_params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Kategorie:{category}",
        "cmlimit": 500,
        "cmtype": "subcat",  # subcategories only, no regular pages
        "cmprop": "ids|title",
    }

    subcats: list[dict] = []
    cmcontinue = None

    while True:
        params = dict(base_params)
        if cmcontinue:
            params["cmcontinue"] = cmcontinue

        data = await self._get(params)
        subcats.extend(data["query"]["categorymembers"])

        cmcontinue = data["continue"].get("cmcontinue") if "continue" in data else None
        if not cmcontinue:
            # Without a token the next request would be identical to the
            # first one; stop instead of re-fetching the same page forever.
            break

    return subcats
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Seiteninhalte #
|
||||
# ------------------------------------------------------------------ #
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user