"""
Module: wiki_importer.py

Description:
- Imports all exercises from the wiki.
- Fetches the list of all exercise titles and pageids via `/semantic/pages`.
- For each exercise:
    * Fetch page info (pageid, fullurl).
    * Parse the wikitext (templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment).
    * Build a payload matching the data model.
    * POST it to the `/exercise` endpoint.
- Supports single import via the environment variable `WIKI_EXERCISE_TITLE`
  and full import via `--all`.

Version: 2.0.0
"""
|
|
import requests
|
|
import mwparserfromhell
|
|
import os
|
|
import sys
|
|
from typing import Dict, Any, List
|
|
import argparse
|
|
|
|
# Konfiguration
|
|
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki")
|
|
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise")
|
|
DEFAULT_CATEGORY = os.getenv("WIKI_CATEGORY", "Übungen")
|
|
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
|
|
|
# Helper: Holt Liste aller Übungen (Titel-> entry)
|
|
def fetch_all_pages(category: str) -> Dict[str, Any]:
|
|
resp = requests.get(f"{API_BASE_URL}/semantic/pages", params={"category": category})
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
|
|
# Helper: Holt pageid und fullurl per Core-API
|
|
def fetch_page_info(title: str) -> Dict[str, Any]:
|
|
r = requests.get(f"{API_BASE_URL}/info", params={"title": title})
|
|
r.raise_for_status()
|
|
info = r.json()
|
|
return {"pageid": info.get("pageid"), "fullurl": info.get("fullurl")}
|
|
|
|
# Parser: Lädt und parst eine Übung
|
|
def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
|
print(f"[Parse] {title} (ID={pageid})")
|
|
resp = requests.get(f"{API_BASE_URL}/parsepage", params={"pageid": pageid, "title": title})
|
|
resp.raise_for_status()
|
|
wikitext = resp.json().get("wikitext", "")
|
|
wikicode = mwparserfromhell.parse(wikitext)
|
|
|
|
raw: Dict[str, Any] = {"title": title, "source": "MediaWiki", "pageid": pageid}
|
|
for tpl in wikicode.filter_templates():
|
|
name = tpl.name.strip()
|
|
if name == "ÜbungInfoBox":
|
|
for p in tpl.params:
|
|
raw[p.name.strip()] = str(p.value).strip()
|
|
elif name == "Übungsbeschreibung":
|
|
for p in tpl.params:
|
|
raw[p.name.strip()] = str(p.value).strip()
|
|
elif name == "SkillDevelopment":
|
|
raw.setdefault("capabilities", [])
|
|
cap = str(tpl.get("PrimaryCapability").value).strip()
|
|
lvl = int(str(tpl.get("CapabilityLevel").value).strip())
|
|
raw["capabilities"].append({"capability": cap, "level": lvl})
|
|
raw["wikitext"] = wikitext
|
|
return raw
|
|
|
|
# Ingestion
|
|
def ingest_exercise(payload: Dict[str, Any]) -> None:
|
|
title = payload.get("title")
|
|
resp = requests.post(EXERCISE_API, json=payload)
|
|
if resp.status_code == 422:
|
|
print(f"[Error] {title} -> 422: {resp.text}")
|
|
return
|
|
resp.raise_for_status()
|
|
print(f"[Ingest] {title} -> OK")
|
|
|
|
# Build payload
|
|
def build_payload(raw: Dict[str, Any], fullurl: str, category: str) -> Dict[str, Any]:
|
|
caps_list = raw.get("capabilities", [])
|
|
capabilities = {c["capability"]: c["level"] for c in caps_list}
|
|
return {
|
|
"title": raw.get("title"),
|
|
"summary": raw.get("Summary", ""),
|
|
"short_description": raw.get("Summary", ""),
|
|
"keywords": raw.get("Schlüsselworte", "").split(', '),
|
|
"link": fullurl,
|
|
"discipline": raw.get("Übungstyp", ""),
|
|
"group": raw.get("Gruppengröße", ""),
|
|
"age_group": raw.get("Altersgruppe", ""),
|
|
"target_group": raw.get("Zielgruppe", ""),
|
|
"min_participants": 1,
|
|
"duration_minutes": float(raw.get("Dauer", 0)),
|
|
"capabilities": capabilities,
|
|
"category": category,
|
|
"purpose": raw.get("Ziel", ""),
|
|
"execution": raw.get("Durchführung", ""),
|
|
"notes": raw.get("Hinweise", ""),
|
|
"preparation": raw.get("RefMethode", ""),
|
|
"method": raw.get("method", ""),
|
|
"equipment": raw.get("equipment", []),
|
|
"fullurl": fullurl
|
|
}
|
|
|
|
# Main
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(description="Import exercises from Wiki to Qdrant")
|
|
parser.add_argument("--all", action="store_true", help="Import all exercises")
|
|
parser.add_argument("--title", type=str, default=DEFAULT_TITLE, help="Single exercise title")
|
|
parser.add_argument("--category", type=str, default=DEFAULT_CATEGORY, help="Wiki category")
|
|
args = parser.parse_args()
|
|
|
|
if args.all:
|
|
pages = fetch_all_pages(args.category)
|
|
print(f"Found {len(pages)} exercises in category '{args.category}'")
|
|
for title, entry in pages.items():
|
|
pid = entry.get("pageid")
|
|
if not pid:
|
|
info = fetch_page_info(title)
|
|
pid = info.get("pageid")
|
|
fullurl = info.get("fullurl")
|
|
else:
|
|
fullurl = entry.get("fullurl") or fetch_page_info(title)["fullurl"]
|
|
if not pid:
|
|
print(f"Skip {title}, no pageid")
|
|
continue
|
|
raw = parse_exercise(title, pid)
|
|
payload = build_payload(raw, fullurl, args.category)
|
|
ingest_exercise(payload)
|
|
else:
|
|
info = fetch_page_info(args.title)
|
|
pid = info.get("pageid")
|
|
fullurl = info.get("fullurl")
|
|
if not pid:
|
|
print(f"Error: pageid for '{args.title}' not found")
|
|
sys.exit(1)
|
|
raw = parse_exercise(args.title, pid)
|
|
payload = build_payload(raw, fullurl, args.category)
|
|
ingest_exercise(payload)
|