# Trainer_LLM/scripts/wiki_importer2.0.0.py

"""
Module: wiki_importer.py
Description:
- Imports all exercises from the wiki
- Fetches the list of all exercise titles and pageids via `/semantic/pages`
- For each exercise:
  * Fetch page info (pageid, fullurl)
  * Parse the wikitext (templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment)
  * Build the payload according to the data model
  * POST it to the `/exercise` endpoint
- Supports single import via the `WIKI_EXERCISE_TITLE` environment variable
  (or `--title`) and full import via `--all`
Version: 2.0.0
"""
import requests
import mwparserfromhell
import os
import sys
from typing import Dict, Any, List
import argparse

# Konfiguration
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
DEFAULT_CATEGORY = os.getenv("WIKI_CATEGORY", "Übungen")
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")

# Helper: Holt Liste aller Übungen (Titel-> entry)
def fetch_all_pages(category: str) -> Dict[str, Any]:
    """Return the decoded JSON body of the ``/semantic/pages`` endpoint.

    Args:
        category: Wiki category name, sent as the ``category`` query parameter.

    Returns:
        Whatever JSON document the endpoint answers with. The script's
        ``--all`` mode iterates it as a mapping of page title to entry.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    endpoint = f"{API_BASE_URL}/semantic/pages"
    query = {"category": category}
    response = requests.get(endpoint, params=query)
    response.raise_for_status()
    return response.json()

# Helper: Holt pageid und fullurl per Core-API
def fetch_page_info(title: str) -> Dict[str, Any]:
    """Look up ``pageid`` and ``fullurl`` for a page via the ``/info`` endpoint.

    Args:
        title: Page title, sent as the ``title`` query parameter.

    Returns:
        A dict with exactly the keys ``"pageid"`` and ``"fullurl"``; a value
        is ``None`` when the endpoint's JSON answer does not contain that key.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    response = requests.get(f"{API_BASE_URL}/info", params={"title": title})
    response.raise_for_status()
    body = response.json()
    return {key: body.get(key) for key in ("pageid", "fullurl")}

# Parser: Lädt und parst eine Übung
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
    """Fetch a page's wikitext via ``/parsepage`` and extract its template data.

    Every parameter of the ``ÜbungInfoBox`` and ``Übungsbeschreibung``
    templates is copied into the result under its (stripped) parameter name.
    Each ``SkillDevelopment`` template contributes one
    ``{"capability": str, "level": int}`` entry to ``result["capabilities"]``.

    A ``SkillDevelopment`` template that lacks ``PrimaryCapability`` or
    ``CapabilityLevel``, or whose level is not an integer, is skipped with a
    warning instead of aborting the whole page.

    Args:
        title: Page title; sent to the endpoint and stored under ``"title"``.
        pageid: Page id; sent to the endpoint and stored under ``"pageid"``.

    Returns:
        A dict containing ``"title"``, ``"source"``, ``"pageid"``,
        ``"wikitext"`` and the extracted template values.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    print(f"[Parse] {title} (ID={pageid})")
    resp = requests.get(f"{API_BASE_URL}/parsepage", params={"pageid": pageid, "title": title})
    resp.raise_for_status()
    wikitext = resp.json().get("wikitext", "")
    wikicode = mwparserfromhell.parse(wikitext)

    raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
    for tpl in wikicode.filter_templates():
        name = tpl.name.strip()
        if name in ("ÜbungInfoBox", "Übungsbeschreibung"):
            # Both templates are flattened into the same key space.
            for p in tpl.params:
                raw[p.name.strip()] = str(p.value).strip()
        elif name == "SkillDevelopment":
            raw.setdefault("capabilities", [])
            try:
                # Template.get() raises ValueError for a missing parameter,
                # int() raises ValueError for a non-numeric level.
                cap = str(tpl.get("PrimaryCapability").value).strip()
                lvl = int(str(tpl.get("CapabilityLevel").value).strip())
            except ValueError as err:
                print(f"[Warn] {title}: skipping malformed SkillDevelopment template ({err})")
                continue
            raw["capabilities"].append({"capability": cap, "level": lvl})
    raw["wikitext"] = wikitext
    return raw

# Ingestion
def ingest_exercise(payload: Dict[str, Any]) -> None:
    """POST one exercise payload to ``EXERCISE_API`` and print the outcome.

    A 422 answer is printed together with the response body and the function
    returns normally; any other error status raises.

    Args:
        payload: JSON-serialisable exercise document; its ``"title"`` is used
            in the printed status line.

    Raises:
        requests.HTTPError: For error statuses other than 422.
    """
    name = payload.get("title")
    response = requests.post(EXERCISE_API, json=payload)
    if response.status_code != 422:
        response.raise_for_status()
        print(f"[Ingest] {name} -> OK")
    else:
        print(f"[Error] {name} -> 422: {response.text}")

# Build payload
def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
    """Map the parsed template values in *raw* onto the exercise payload.

    Args:
        raw: Parsed page data; template parameters are looked up by their
            wiki names (``Summary``, ``Schlüsselworte``, ``Dauer``, ...) and
            ``raw["capabilities"]`` is a list of
            ``{"capability": ..., "level": ...}`` dicts.
        fullurl: URL of the wiki page; stored under ``"link"`` and ``"fullurl"``.
        category: Category name stored under ``"category"``.

    Returns:
        The payload dict that is sent to the exercise endpoint.

    Raises:
        ValueError: If ``raw["Dauer"]`` is non-blank but not a number.
    """
    capabilities = {c["capability"]: c["level"] for c in raw.get("capabilities", [])}

    # Split on bare commas and drop blank items, so a missing or empty value
    # yields [] rather than [""] and "a,b" is handled like "a, b".
    keyword_parts = (part.strip() for part in raw.get("Schlüsselworte", "").split(","))
    keywords = [part for part in keyword_parts if part]

    # A template parameter that is present but left blank arrives as "",
    # which float() rejects; treat it like a missing value.
    duration_value = raw.get("Dauer")
    duration_text = "" if duration_value is None else str(duration_value).strip()
    duration_minutes = float(duration_text) if duration_text else 0.0

    return {
        "title": raw.get("title"),
        "summary": raw.get("Summary", ""),
        "short_description": raw.get("Summary", ""),
        "keywords": keywords,
        "link": fullurl,
        "discipline": raw.get("Übungstyp", ""),
        "group": raw.get("Gruppengröße", ""),
        "age_group": raw.get("Altersgruppe", ""),
        "target_group": raw.get("Zielgruppe", ""),
        "min_participants": 1,
        "duration_minutes": duration_minutes,
        "capabilities": capabilities,
        "category": category,
        "purpose": raw.get("Ziel", ""),
        "execution": raw.get("Durchführung", ""),
        "notes": raw.get("Hinweise", ""),
        "preparation": raw.get("RefMethode", ""),
        "method": raw.get("method", ""),
        "equipment": raw.get("equipment", []),
        "fullurl": fullurl,
    }

# Main
def _import_exercise(title: str, pageid: int, fullurl: str, category: str) -> None:
    """Parse one wiki page, build its payload and POST it to the exercise API."""
    raw = parse_exercise(title, pageid)
    ingest_exercise(build_payload(raw, fullurl, category))


def _import_all(category: str) -> int:
    """Import every page listed for *category*; return how many pages failed.

    A request error or malformed page data on one page is reported and
    counted, and the loop moves on, so one bad page no longer aborts the
    remaining imports.
    """
    pages = fetch_all_pages(category)
    print(f"Found {len(pages)} exercises in category '{category}'")
    failures = 0
    for title, entry in pages.items():
        try:
            pid = entry.get("pageid")
            if not pid:
                # The listing lacks a pageid: take id and URL from /info.
                info = fetch_page_info(title)
                pid = info.get("pageid")
                fullurl = info.get("fullurl")
            else:
                fullurl = entry.get("fullurl") or fetch_page_info(title)["fullurl"]
            if not pid:
                print(f"Skip {title}, no pageid")
                continue
            _import_exercise(title, pid, fullurl, category)
        except (requests.RequestException, ValueError) as err:
            failures += 1
            print(f"[Error] {title} -> {err}")
    return failures


def main() -> None:
    """Command-line entry point: import one exercise, or all with ``--all``.

    Exits with status 1 when the single page has no pageid or when at least
    one page of a full import failed.
    """
    parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant")
    parser.add_argument("--all", action="store_true", help="Import all exercises")
    parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Single exercise title")
    parser.add_argument("--category", type=str, default=DEFAULT_CATEGORY, help="Wiki category")
    args = parser.parse_args()

    if args.all:
        failed = _import_all(args.category)
        if failed:
            # Keep a non-zero exit status so callers still notice failures.
            print(f"{failed} exercise(s) failed to import")
            sys.exit(1)
        return

    info = fetch_page_info(args.title)
    pid = info.get("pageid")
    if not pid:
        print(f"Error: pageid for '{args.title}' not found")
        sys.exit(1)
    _import_exercise(args.title, pid, info.get("fullurl"), args.category)


if __name__ == "__main__":
    main()