# Trainer_LLM/scripts/wiki_importer1.1.0.py
# (export metadata: 118 lines, 4.6 KiB, Python)
"""
Module: wiki_importer.py
Beschreibung:
- Importiert zunächst nur eine Übung aus dem Wiki
- Liest Wikitext einer Übung aus
- Parsen mit mwparserfromhell
- Extrahiert Felder aus Templates:
* ÜbungInfoBox
* Übungsbeschreibung
* SkillDevelopment (mehrfach)
- Baut ein Exercise-Objekt zusammen
- Speichert per POST /exercise Endpoint in Qdrant
- Detailliertes Error-Logging für 422 und allgemeine Fehler
Version: 1.1.2
"""
import requests
import mwparserfromhell
import os
import sys
from typing import Dict, Any
# Konfiguration über Umgebungsvariablen
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
# Übungstitel, der importiert werden soll
TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
# Helper: Holt pageid und fullurl per Core-API
def fetch_page_info(title: str) -> Dict[str, Any]:
    """Resolve a wiki page title to its page id and full URL via the Core API.

    Args:
        title: Page title of the exercise to look up.

    Returns:
        Dict with keys ``pageid`` and ``fullurl``; either value may be
        ``None`` if the API response omits it.

    Raises:
        requests.HTTPError: If the info endpoint returns an error status.
    """
    # timeout prevents the script from hanging forever on a stalled server
    r = requests.get(f"{API_BASE_URL}/info", params={"title": title}, timeout=30)
    r.raise_for_status()
    info = r.json()
    return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}
# Parser: Lädt und parst eine Übung
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    """Load the wikitext of one exercise page and extract template fields.

    Recognized templates:
      * ``ÜbungInfoBox`` / ``Übungsbeschreibung`` — flat key/value fields,
        copied verbatim into the result dict.
      * ``SkillDevelopment`` (may occur multiple times) — collected into a
        ``capabilities`` list of ``{"capability": str, "level": int}`` dicts.

    Args:
        title: Exercise page title (stored in the result).
        pageid: MediaWiki page id used to fetch the wikitext.

    Returns:
        Dict with the extracted fields plus ``title``, ``source``,
        ``pageid`` and the raw ``wikitext``.

    Raises:
        requests.HTTPError: If the parsepage endpoint returns an error status.
    """
    print(f"[Parse] Loading '{title}' (ID={pageid})")
    resp = requests.get(
        f"{API_BASE_URL}/parsepage",
        params={"pageid": pageid, "title": title},
        timeout=30,  # avoid hanging indefinitely on a stalled server
    )
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)
    data: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    for tpl in wikicode.filter_templates():
        name = tpl.name.strip()
        if name in ("ÜbungInfoBox", "Übungsbeschreibung"):
            # Both templates contribute flat key/value fields identically.
            for param in tpl.params:
                data[param.name.strip()] = str(param.value).strip()
        elif name == "SkillDevelopment":
            data.setdefault("capabilities", [])
            try:
                # tpl.get raises ValueError when the parameter is missing;
                # int() raises ValueError for a non-numeric level.
                primary = str(tpl.get("PrimaryCapability").value).strip()
                level = int(str(tpl.get("CapabilityLevel").value).strip())
            except ValueError as e:
                # Skip one malformed template instead of aborting the import.
                print(f"[Parse] Skipping malformed SkillDevelopment: {e}")
                continue
            data["capabilities"].append({"capability": primary, "level": level})
    data["wikitext"] = wikitext
    return data
# Ingestion: Sendet einen Datensatz an Qdrant mit detailliertem Error-Logging
def ingest_exercise(ex_data: Dict[str, Any]) -> None:
    """POST one exercise record to the Qdrant ingestion endpoint.

    Logs a detailed body dump on HTTP 422 (validation error) and a short
    message for any other failure; never raises — errors are reported via
    stdout only (best-effort ingestion).

    Args:
        ex_data: Exercise payload; ``title`` is used for log messages.
    """
    title = ex_data.get("title")
    resp = None  # explicit sentinel instead of the fragile "'resp' in locals()" check
    try:
        resp = requests.post(EXERCISE_API, json=ex_data, timeout=30)
        if resp.status_code == 422:
            # Validation errors: dump the response body for debugging.
            print(f"[Ingest] '{title}' -> FAILED 422:")
            print(resp.text)
        resp.raise_for_status()
        print(f"[Ingest] '{title}' -> OK")
    except requests.HTTPError as e:
        # resp is None only if the POST itself failed before a response existed.
        msg = resp.text if resp is not None else str(e)
        print(f"[Ingest] '{title}' -> HTTPError: {e} - {msg}")
    except Exception as e:
        print(f"[Ingest] '{title}' -> FAILED: {e}")
# Main: Einmaliger Import für TITLE
# Main: one-shot import of the single exercise named by TITLE.
if __name__ == "__main__":
    print(f"[Main] Import single exercise: {TITLE}")
    try:
        info = fetch_page_info(TITLE)
        pageid = info.get("pageid")
        fullurl = info.get("fullurl")
        if not pageid:
            print(f"Error: pageid für '{TITLE}' nicht gefunden.")
            sys.exit(1)
        raw = parse_exercise(TITLE, pageid)
        # Convert the capabilities list into a {name: level} mapping.
        caps_list = raw.get("capabilities", [])
        capabilities = {c["capability"]: c["level"] for c in caps_list}
        # "a, b" -> ["a", "b"]; an empty field yields [] instead of [''].
        keywords = [
            k.strip()
            for k in raw.get("Schlüsselworte", "").split(',')
            if k.strip()
        ]
        # "Dauer" may be empty or non-numeric in the wiki; fall back to 0
        # instead of letting float() abort the whole import.
        try:
            duration = float(raw.get("Dauer") or 0)
        except (TypeError, ValueError):
            duration = 0.0
        # Payload shaped to match the exercise data model of the API.
        exercise_payload = {
            "title": raw.get("title"),
            "summary": raw.get("Summary", ""),
            "short_description": raw.get("Summary", ""),
            "keywords": keywords,
            "link": fullurl,
            "discipline": raw.get("Übungstyp", ""),
            "group": raw.get("Gruppengröße", ""),
            "age_group": raw.get("Altersgruppe", ""),
            "target_group": raw.get("Zielgruppe", ""),
            "min_participants": 1,
            "duration_minutes": duration,
            "capabilities": capabilities,
            "category": raw.get("category", "Übungen"),
            "purpose": raw.get("Ziel", ""),
            "execution": raw.get("Durchführung", ""),
            "notes": raw.get("Hinweise", ""),
            "preparation": raw.get("RefMethode", ""),
            "method": raw.get("method", ""),
            "equipment": raw.get("equipment", []),
            "fullurl": fullurl
        }
        ingest_exercise(exercise_payload)
    except Exception as e:
        print(f"Fatal error: {e}")
        sys.exit(1)