""" Module: wiki_importer.py Beschreibung: - Importiert zunächst nur eine Übung aus dem Wiki - Liest Wikitext einer Übung aus - Parsen mit mwparserfromhell - Extrahiert Felder aus Templates: * ÜbungInfoBox * Übungsbeschreibung * SkillDevelopment (mehrfach) - Baut ein Exercise-Objekt zusammen - Speichert per POST /exercise Endpoint in Qdrant - Detailliertes Error-Logging für 422 und allgemeine Fehler Version: 1.1.2 """ import requests import mwparserfromhell import os import sys from typing import Dict, Any # Konfiguration über Umgebungsvariablen API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Übungstitel, der importiert werden soll TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") # Helper: Holt pageid und fullurl per Core-API def fetch_page_info(title: str) -> Dict[str, Any]: r = requests.get(f"{API_BASE_URL}/info", params={"title": title}) r.raise_for_status() info = r.json() return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")} # Parser: Lädt und parst eine Übung def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: print(f"[Parse] Loading '{title}' (ID={pageid})") resp = requests.get(f"{API_BASE_URL}/parsepage", params={"pageid": pageid, "title": title}) resp.raise_for_status() wikitext = resp.json().get("wikitext", "") wikicode = mwparserfromhell.parse(wikitext) data: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid} for tpl in wikicode.filter_templates(): name = tpl.name.strip() if name == "ÜbungInfoBox": for param in tpl.params: data[param.name.strip()] = str(param.value).strip() elif name == "Übungsbeschreibung": for param in tpl.params: data[param.name.strip()] = str(param.value).strip() elif name == "SkillDevelopment": data.setdefault("capabilities", []) primary = str(tpl.get("PrimaryCapability").value).strip() level = int(str(tpl.get("CapabilityLevel").value).strip()) data["capabilities"].append({"capability": primary, "level": level}) data["wikitext"] = wikitext return data # Ingestion: Sendet einen Datensatz an Qdrant mit detailliertem Error-Logging def ingest_exercise(ex_data: Dict[str, Any]) -> None: title = ex_data.get("title") try: resp = requests.post(EXERCISE_API, json=ex_data) if resp.status_code == 422: print(f"[Ingest] '{title}' -> FAILED 422:") print(resp.text) resp.raise_for_status() print(f"[Ingest] '{title}' -> OK") except requests.HTTPError as e: msg = resp.text if 'resp' in locals() else str(e) print(f"[Ingest] '{title}' -> HTTPError: {e} - {msg}") except Exception as e: print(f"[Ingest] '{title}' -> FAILED: {e}") # Main: Einmaliger Import für TITLE if __name__ == "__main__": print(f"[Main] Import single exercise: {TITLE}") try: info = fetch_page_info(TITLE) pageid = info.get("pageid") fullurl = info.get("fullurl") if not pageid: print(f"Error: pageid für '{TITLE}' nicht gefunden.") sys.exit(1) raw = parse_exercise(TITLE, pageid) # capabilities als Dict wandeln caps_list = raw.get("capabilities", []) capabilities = {c["capability"]: c["level"] for c in caps_list} # Payload entsprechend Datenmodell exercise_payload = { "title": raw.get("title"), "summary": raw.get("Summary", ""), "short_description": raw.get("Summary", ""), "keywords": raw.get("Schlüsselworte", "").split(', '), "link": fullurl, "discipline": raw.get("Übungstyp", ""), "group": raw.get("Gruppengröße", ""), "age_group": raw.get("Altersgruppe", ""), "target_group": raw.get("Zielgruppe", ""), "min_participants": 1, "duration_minutes": float(raw.get("Dauer", 0)), "capabilities": capabilities, "category": raw.get("category", "Übungen"), "purpose": raw.get("Ziel", ""), "execution": raw.get("Durchführung", ""), "notes": raw.get("Hinweise", ""), "preparation": raw.get("RefMethode", ""), "method": raw.get("method", ""), "equipment": raw.get("equipment", []), "fullurl": fullurl } ingest_exercise(exercise_payload) except Exception as e: print(f"Fatal error: {e}") sys.exit(1)