#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Module: wiki_importer.py

Imports exercises from a MediaWiki instance via the FastAPI wiki_router:

- Performs a login against ``/import/wiki/login`` before importing
  (unless disabled via ``--skip-login``).
- Fetches the list of all exercise titles (SMW ask) via ``/semantic/pages``.
- For each exercise:
    * fetches page info (pageid, fullurl) via ``/info``,
    * parses the wikitext (templates: ÜbungInfoBox, Übungsbeschreibung,
      SkillDevelopment) via ``/parsepage``,
    * builds a payload matching the Exercise data model,
    * POSTs it to the ``/exercise`` endpoint (exercise_router).
- Supports single import via ``--title`` (or env ``WIKI_EXERCISE_TITLE``)
  and full import via ``--all``.
- Optional: credentials via CLI (--username/--password) or ``.env``
  (WIKI_BOT_USER / WIKI_BOT_PASSWORD).

Version: 2.1.0
"""

import argparse
import os
import sys
from typing import Any, Dict

import requests
import mwparserfromhell
from dotenv import load_dotenv

# ----- Configuration / defaults -----
load_dotenv()  # load .env if present

# FastAPI wiki proxy base URL and the Exercise ingest endpoint.
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")


# ---- Helpers for the wiki router ----

def wiki_health() -> None:
    """Sanity check: verify the wiki proxy's ``/health`` endpoint responds.

    Raises:
        requests.HTTPError: if the health endpoint returns an error status.
    """
    r = requests.get(f"{API_BASE_URL}/health", timeout=15)
    r.raise_for_status()
    print("[Sanity] Wiki health OK")


def wiki_login(username: str, password: str) -> None:
    """Log in against the wiki_router.

    Expects ``{"status": "success"}`` in the JSON response on success.

    Raises:
        requests.HTTPError: if the response is an HTTP error without a JSON body.
        RuntimeError: if the response does not report a successful login.
    """
    payload = {"username": username, "password": password}
    r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
    # No unconditional raise_for_status(): we want to surface JSON error
    # messages cleanly instead of a bare HTTP error.
    try:
        data = r.json()
    except ValueError:
        print(f"[Login] HTTP {r.status_code}: {r.text}")
        r.raise_for_status()
        # Bugfix: previously execution fell through here with `data` unbound
        # (NameError) when the body was non-JSON but the status was 2xx.
        raise RuntimeError(f"[Login] Unerwartete Antwort (kein JSON): HTTP {r.status_code}")
    status = (data or {}).get("status")
    if status != "success":
        msg = (data or {}).get("message", "Login fehlgeschlagen")
        raise RuntimeError(f"[Login] {msg}")
    print("[Login] success")


def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Return the SMW-ask result for *category*: a mapping of page title ->
    entry dict (expected to carry ``pageid``/``fullurl``, but entries may be
    incomplete — see :func:`main`, which falls back to :func:`fetch_page_info`).
    """
    resp = requests.get(
        f"{API_BASE_URL}/semantic/pages",
        params={"category": category},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()


def fetch_page_info(title: str) -> Dict[str, Any]:
    """Fetch core page info for *title* and return ``{"pageid", "fullurl"}``.

    Either value may be ``None`` if the proxy did not supply it.
    """
    r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    r.raise_for_status()
    info = r.json()
    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}


def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    """Fetch and parse the wikitext of one exercise page.

    Flattens the parameters of the ``ÜbungInfoBox`` and ``Übungsbeschreibung``
    templates into the result dict and collects ``SkillDevelopment`` templates
    into a ``capabilities`` list of ``{"capability": str, "level": int}``.

    Returns:
        A raw dict with ``title``, ``source``, ``pageid``, the flattened
        template parameters, ``capabilities`` (if any) and the full ``wikitext``.
    """
    print(f"[Parse] Lade '{title}' (ID={pageid})")
    resp = requests.get(
        f"{API_BASE_URL}/parsepage",
        params={"pageid": pageid, "title": title},
        timeout=60,
    )
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)

    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    for tpl in wikicode.filter_templates():
        name = str(tpl.name).strip()
        if name in ("ÜbungInfoBox", "Übungsbeschreibung"):
            # Both templates are flattened the same way: param name -> value.
            for p in tpl.params:
                raw[str(p.name).strip()] = str(p.value).strip()
        elif name == "SkillDevelopment":
            raw.setdefault("capabilities", [])
            try:
                cap = str(tpl.get("PrimaryCapability").value).strip()
            except ValueError:
                cap = ""
            try:
                lvl = int(str(tpl.get("CapabilityLevel").value).strip())
            except (ValueError, TypeError):
                lvl = 0
            if cap:
                raw["capabilities"].append({"capability": cap, "level": lvl})
    raw["wikitext"] = wikitext
    return raw


def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
    """Map the raw parsed wiki data onto the Exercise data model payload.

    Args:
        raw: output of :func:`parse_exercise` (German template keys).
        fullurl: canonical wiki URL of the page (may be empty).
        category: wiki category used for the import.

    Returns:
        A JSON-serializable dict matching the Exercise endpoint's schema.
    """
    # Exercise.capabilities expects Dict[str, int]; drop entries whose level
    # cannot be coerced to int.
    capabilities: Dict[str, int] = {}
    for c in raw.get("capabilities", []):
        cap = c.get("capability")
        lvl = c.get("level")
        if isinstance(cap, str) and cap:
            try:
                capabilities[cap] = int(lvl)
            except (ValueError, TypeError):
                pass

    # Defaults / fallbacks: "Dauer" may be missing, empty or non-numeric.
    try:
        duration = float(raw.get("Dauer", 0) or 0)
    except (ValueError, TypeError):
        duration = 0.0

    keywords = []
    kw_raw = raw.get("Schlüsselworte", "")
    if isinstance(kw_raw, str):
        keywords = [k.strip() for k in kw_raw.split(",") if k.strip()]

    # "equipment" may arrive as a comma-separated string or as a list.
    equipment = []
    eq_raw = raw.get("equipment", [])
    if isinstance(eq_raw, str):
        equipment = [e.strip() for e in eq_raw.split(",") if e.strip()]
    elif isinstance(eq_raw, list):
        equipment = [str(e).strip() for e in eq_raw if str(e).strip()]

    payload: Dict[str, Any] = {
        "title": raw.get("title") or "",
        "summary": raw.get("Summary", "") or "",
        "short_description": raw.get("Summary", "") or "",
        "keywords": keywords,
        "link": fullurl or "",
        "discipline": raw.get("Übungstyp", "") or "",
        "group": raw.get("Gruppengröße", "") or None,
        "age_group": raw.get("Altersgruppe", "") or "",
        "target_group": raw.get("Zielgruppe", "") or "",
        "min_participants": 1,
        "duration_minutes": int(round(duration)),  # Exercise expects int
        "capabilities": capabilities,
        "category": category or "",
        "purpose": raw.get("Ziel", "") or "",
        "execution": raw.get("Durchführung", "") or "",
        "notes": raw.get("Hinweise", "") or "",
        "preparation": raw.get("RefMethode", "") or "",
        "method": raw.get("method", "") or "",  # if present in the wikitext
        "equipment": equipment,
        "fullurl": fullurl or "",  # optional field
        # Idempotency key (optionally used by exercise_router):
        "external_id": f"wiki:{raw.get('pageid')}",
        "source": "MediaWiki",
    }
    return payload


def ingest_exercise(payload: Dict[str, Any]) -> None:
    """POST *payload* to the Exercise endpoint.

    Validation failures (HTTP 422) are printed but deliberately do not raise,
    so a bulk import continues past individual bad pages; all other HTTP
    errors raise ``requests.HTTPError``.
    """
    title = payload.get("title", "")
    resp = requests.post(EXERCISE_API, json=payload, timeout=60)
    if resp.status_code == 422:
        # Bugfix: previously ran raise_for_status() inside a try/except-pass,
        # a guaranteed raise immediately swallowed. Report and move on.
        print(f"[Ingest] '{title}' -> FAILED 422:\n{resp.text}")
        return
    resp.raise_for_status()
    print(f"[Ingest] '{title}' -> OK")


# ----- Main -----

def main() -> None:
    """CLI entry point: sanity check, login, then single or full import."""
    parser = argparse.ArgumentParser(
        description="Import exercises from Wiki to Qdrant (via FastAPI wiki_router)"
    )
    parser.add_argument("--all", action="store_true",
                        help="Alle Übungen importieren (SMW-Ask)")
    parser.add_argument("--title", type=str, default=DEFAULT_TITLE,
                        help="Einzelimport eines Übungstitels")
    parser.add_argument("--category", type=str, default=DEFAULT_CAT,
                        help="Wiki-Kategorie (z.B. 'Übungen')")
    parser.add_argument("--username", type=str, default=os.getenv("WIKI_BOT_USER"),
                        help="Wiki-Login Benutzer (überschreibt .env)")
    parser.add_argument("--password", type=str, default=os.getenv("WIKI_BOT_PASSWORD"),
                        help="Wiki-Login Passwort (überschreibt .env)")
    parser.add_argument("--skip-login", action="store_true",
                        help="Login-Schritt überspringen (falls Session schon aktiv)")
    args = parser.parse_args()

    # Sanity
    wiki_health()

    # Login (unless explicitly skipped)
    if not args.skip_login:
        if not args.username or not args.password:
            print(
                "[Login] Fehler: fehlende Credentials. "
                "Setze .env (WIKI_BOT_USER/WIKI_BOT_PASSWORD) oder CLI --username/--password.",
                file=sys.stderr,
            )
            sys.exit(1)
        try:
            wiki_login(args.username, args.password)
        except Exception as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

    # Single or full import
    if args.all:
        print(f"[Main] Lade Liste der Übungen aus Kategorie '{args.category}'…")
        pages = fetch_all_pages(args.category)
        print(f"[Main] {len(pages)} Seiten gefunden.")
        for title, entry in pages.items():
            pid = entry.get("pageid")
            fullurl = entry.get("fullurl")
            if not pid:
                # Look up core info as a fallback
                info = fetch_page_info(title)
                pid = info.get("pageid")
                fullurl = fullurl or info.get("fullurl")
            if not pid:
                print(f"[Skip] '{title}' hat keine pageid")
                continue
            # Robustness fix: one failing page must not abort the bulk
            # import — report the error and continue with the next page.
            try:
                raw = parse_exercise(title, pid)
                payload = build_payload(raw, fullurl or "", args.category)
                ingest_exercise(payload)
            except Exception as e:
                print(f"[Error] '{title}': {e}", file=sys.stderr)
    else:
        print(f"[Main] Import single exercise: {args.title}")
        info = fetch_page_info(args.title)
        pid = info.get("pageid")
        fullurl = info.get("fullurl") or ""
        if not pid:
            print(f"[Error] pageid für '{args.title}' nicht gefunden.", file=sys.stderr)
            sys.exit(1)
        raw = parse_exercise(args.title, pid)
        payload = build_payload(raw, fullurl, args.category)
        ingest_exercise(payload)


if __name__ == "__main__":
    main()