"""
Module: wiki_importer.py

Imports all exercises from the wiki:
- Fetches the list of all exercise titles and pageids via `/semantic/pages`
- For each exercise:
    * fetches page info (pageid, fullurl)
    * parses the wikitext (templates: ÜbungInfoBox, Übungsbeschreibung,
      SkillDevelopment)
    * builds a payload matching the data model
    * POSTs it to the `/exercise` endpoint
- Supports single import via the env var `WIKI_EXERCISE_TITLE`
  and full import via `--all`

Version: 2.0.0
"""

import argparse
import os
import sys
from typing import Any, Dict

import mwparserfromhell
import requests

# Configuration (all overridable via environment variables)
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
DEFAULT_CATEGORY = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
# requests has no default timeout; without one a hung server blocks the import
# forever. Seconds, overridable via WIKI_REQUEST_TIMEOUT.
REQUEST_TIMEOUT = float(os.getenv("WIKI_REQUEST_TIMEOUT", "30"))


def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Return the mapping title -> page entry for all exercises in *category*.

    Raises:
        requests.HTTPError: if the semantic-pages endpoint returns an error.
    """
    resp = requests.get(
        f"{API_BASE_URL}/semantic/pages",
        params={"category": category},
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()


def fetch_page_info(title: str) -> Dict[str, Any]:
    """Fetch ``pageid`` and ``fullurl`` for *title* via the core API.

    Returns a dict with exactly the keys ``pageid`` and ``fullurl``
    (values may be ``None`` if the API omits them).
    """
    r = requests.get(
        f"{API_BASE_URL}/info",
        params={"title": title},
        timeout=REQUEST_TIMEOUT,
    )
    r.raise_for_status()
    info = r.json()
    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}


def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    """Download and parse one exercise page into a flat ``raw`` dict.

    Parameters of the ÜbungInfoBox and Übungsbeschreibung templates are
    copied verbatim (stripped) under their wiki parameter names; each
    well-formed SkillDevelopment template contributes one
    ``{"capability": ..., "level": ...}`` entry to ``raw["capabilities"]``.
    The full wikitext is kept under ``raw["wikitext"]``.
    """
    print(f"[Parse] {title} (ID={pageid})")
    resp = requests.get(
        f"{API_BASE_URL}/parsepage",
        params={"pageid": pageid, "title": title},
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)

    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    for tpl in wikicode.filter_templates():
        name = tpl.name.strip()
        if name in ("ÜbungInfoBox", "Übungsbeschreibung"):
            # Both templates are flat key=value parameter lists.
            for p in tpl.params:
                raw[p.name.strip()] = str(p.value).strip()
        elif name == "SkillDevelopment":
            raw.setdefault("capabilities", [])
            # Skip malformed templates instead of aborting the whole import:
            # Template.get() raises ValueError on a missing parameter and
            # int() raises on a non-numeric level.
            if not (tpl.has("PrimaryCapability") and tpl.has("CapabilityLevel")):
                print(f"[Warn] {title}: incomplete SkillDevelopment template skipped")
                continue
            cap = str(tpl.get("PrimaryCapability").value).strip()
            try:
                lvl = int(str(tpl.get("CapabilityLevel").value).strip())
            except ValueError:
                print(f"[Warn] {title}: non-numeric CapabilityLevel for '{cap}' skipped")
                continue
            raw["capabilities"].append({"capability": cap, "level": lvl})

    raw["wikitext"] = wikitext
    return raw


def ingest_exercise(payload: Dict[str, Any]) -> None:
    """POST *payload* to the exercise endpoint.

    A 422 (validation error) is logged and swallowed so a full import can
    continue; any other HTTP error is raised.
    """
    title = payload.get("title")
    resp = requests.post(EXERCISE_API, json=payload, timeout=REQUEST_TIMEOUT)
    if resp.status_code == 422:
        print(f"[Error] {title} -> 422: {resp.text}")
        return
    resp.raise_for_status()
    print(f"[Ingest] {title} -> OK")


def _duration_minutes(value: Any) -> float:
    """Best-effort conversion of the wiki 'Dauer' field to minutes.

    Wiki values are free text; non-numeric input (e.g. "10 min", "")
    would previously crash the import — fall back to 0.0 instead.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
    """Map the parsed ``raw`` wiki dict onto the exercise API data model.

    Capabilities are flattened from the list of per-template entries into a
    single ``{capability: level}`` mapping. Missing wiki fields become empty
    strings / defaults rather than being omitted.
    """
    caps_list = raw.get("capabilities", [])
    capabilities = {c["capability"]: c["level"] for c in caps_list}
    # split(', ') on an empty field would yield [''] — filter empties out.
    keywords = [k for k in raw.get("Schlüsselworte", "").split(', ') if k]
    return {
        "title": raw.get("title"),
        "summary": raw.get("Summary", ""),
        "short_description": raw.get("Summary", ""),
        "keywords": keywords,
        "link": fullurl,
        "discipline": raw.get("Übungstyp", ""),
        "group": raw.get("Gruppengröße", ""),
        "age_group": raw.get("Altersgruppe", ""),
        "target_group": raw.get("Zielgruppe", ""),
        "min_participants": 1,
        "duration_minutes": _duration_minutes(raw.get("Dauer", 0)),
        "capabilities": capabilities,
        "category": category,
        "purpose": raw.get("Ziel", ""),
        "execution": raw.get("Durchführung", ""),
        "notes": raw.get("Hinweise", ""),
        "preparation": raw.get("RefMethode", ""),
        "method": raw.get("method", ""),
        "equipment": raw.get("equipment", []),
        "fullurl": fullurl,
    }


def main() -> None:
    """CLI entry point: import one exercise, or all exercises with --all."""
    parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant")
    parser.add_argument("--all", action="store_true", help="Import all exercises")
    parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Single exercise title")
    parser.add_argument("--category", type=str, default=DEFAULT_CATEGORY, help="Wiki category")
    args = parser.parse_args()

    if args.all:
        pages = fetch_all_pages(args.category)
        print(f"Found {len(pages)} exercises in category '{args.category}'")
        for title, entry in pages.items():
            pid = entry.get("pageid")
            if not pid:
                # Semantic listing lacked the pageid — fall back to the core API.
                info = fetch_page_info(title)
                pid = info.get("pageid")
                fullurl = info.get("fullurl")
            else:
                fullurl = entry.get("fullurl") or fetch_page_info(title)["fullurl"]
            if not pid:
                print(f"Skip {title}, no pageid")
                continue
            raw = parse_exercise(title, pid)
            payload = build_payload(raw, fullurl, args.category)
            ingest_exercise(payload)
    else:
        info = fetch_page_info(args.title)
        pid = info.get("pageid")
        fullurl = info.get("fullurl")
        if not pid:
            print(f"Error: pageid for '{args.title}' not found")
            sys.exit(1)
        raw = parse_exercise(args.title, pid)
        payload = build_payload(raw, fullurl, args.category)
        ingest_exercise(payload)


if __name__ == "__main__":
    main()