"""
|
||
File: wiki_router.py
|
||
Beschreibung:
|
||
- Endpunkte für MediaWiki-Integration im lokalen Netzwerk.
|
||
- Funktionen:
|
||
* /health: Prüft Verfügbarkeit der MediaWiki-API.
|
||
* /login: Führt clientlogin durch und speichert Session-Cookies.
|
||
* /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
|
||
* /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
|
||
* /info: Liefert pageid und fullurl über Core-API Query. # CHANGED: robustes 404-Handling
|
||
* /semantic/page: Liefert Metadaten einer Übung, pageid und Wikitext. # CHANGED: propagiert 404
|
||
Version: 1.3.0
|
||
"""
|
||
|
||
# HINWEIS: API-Signaturen/URLs bleiben UNVERÄNDERT.
|
||
# Markierungen: # NEW / # CHANGED
|
||
|
||

from typing import Dict, Any, Optional

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
import os
import requests
from dotenv import load_dotenv

load_dotenv()

router = APIRouter(prefix="/import/wiki", tags=["wiki"])

WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")

# Session for cookies (login)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})


# =====================
# Schemas
# =====================
class WikiLoginRequest(BaseModel):
    username: str
    password: str


class WikiLoginResponse(BaseModel):
    status: str
    message: Optional[str] = None


class PageInfoResponse(BaseModel):
    pageid: int
    title: str
    fullurl: str


class PageContentResponse(BaseModel):
    pageid: int
    title: str
    wikitext: str


# =====================
# Helpers
# =====================
# NEW: title normalization (underscores, dash variants)
_dash_variants = ("-", "–", "—")


def _normalize_titles(title: str):
    yield title
    # spaces → underscores
    if " " in title:
        yield title.replace(" ", "_")
    # dash variants ↔ hyphen
    for dv in _dash_variants:
        for dv2 in _dash_variants:
            if dv != dv2 and dv in title:
                yield title.replace(dv, dv2)
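
# Hypothetical example: _normalize_titles("Mae-Geri Übung") yields the title itself,
# "Mae-Geri_Übung" (spaces → underscores), and dash-variant forms such as
# "Mae–Geri Übung". Only one substitution is applied per candidate; combined
# variants (underscores *and* a swapped dash) are not generated.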


# NEW: robust pageinfo (None if not found)
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
    params = {
        "action": "query",
        "format": "json",
        "prop": "info",
        "inprop": "url",
        "redirects": 1,  # follows redirects
    }
    last_error: Optional[Exception] = None
    for candidate in _normalize_titles(title):
        try:
            r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
            r.raise_for_status()
        except Exception as e:
            # upstream error: remember it, but don't block the remaining candidates
            last_error = e
            continue
        pages = r.json().get("query", {}).get("pages", {}) or {}
        if not isinstance(pages, dict) or not pages:
            continue
        # MediaWiki returns a dict {pageid(str): {..}}
        pid_str, page = next(iter(pages.items()))
        # missing page?
        if page.get("missing") is not None or str(pid_str) == "-1":
            continue
        title_out = page.get("title") or candidate
        fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
        try:
            pid = int(pid_str)
        except ValueError:
            pid = int(page.get("pageid", -1))
        return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
    if last_error is not None:
        # no candidate resolved and at least one request failed upstream → 502, not 404
        raise HTTPException(status_code=502, detail=f"Info error: {last_error}")
    return None


# =====================
# Endpoints
# =====================
@router.get("/health")
def health() -> Dict[str, str]:
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "format": "json"}, timeout=5)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Wiki not reachable: {e}")
    return {"status": "ok"}


@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    # fetch a login token
    try:
        token_resp = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
            timeout=10,
        )
        token_resp.raise_for_status()
        token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Token error: {e}")
    if not token:
        raise HTTPException(status_code=502, detail="No login token received")

    # clientlogin
    try:
        login_resp = wiki_session.post(
            WIKI_API_URL,
            data={
                "action": "clientlogin",
                "format": "json",
                "username": data.username,
                "password": data.password,
                "logintoken": token,
                "loginreturnurl": WIKI_API_URL,  # NEW: clientlogin requires loginreturnurl or logincontinue
            },
            timeout=15,
        )
        login_resp.raise_for_status()
        status = login_resp.json().get("clientlogin", {}).get("status")
        if status != "PASS":
            raise HTTPException(status_code=401, detail=f"Login failed: {status}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Login error: {e}")
    return WikiLoginResponse(status="success")
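
# Usage note: on a successful clientlogin, MediaWiki's session cookies are stored
# on the module-level wiki_session, so subsequent requests from this process
# (e.g. /semantic/pages) are authenticated until the session expires.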


@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Category without 'Category:'")) -> Dict[str, Any]:
    smw_query = f"[[Category:{category}]]"
    ask_query = f"{smw_query}|limit=50000"
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"SMW ask error: {e}")
    return r.json().get("query", {}).get("results", {})
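
# Hypothetical request: GET /import/wiki/semantic/pages?category=Übungen
# The response is SMW's "results" mapping of page title -> metadata
# (typically including "printouts", "fulltext", and "fullurl").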


@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: Optional[str] = Query(None)):
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Parse error: {e}")
    wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
    return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
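
# Hypothetical request: GET /import/wiki/parsepage?pageid=123&title=Mae-Geri
# Note that `title` is only echoed back in the response; the lookup itself is
# done purely by pageid via action=parse.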


# CHANGED: robust /info with 404 instead of 500 for unknown titles
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Page title")):
    result = _fetch_pageinfo_by_title(title)
    if not result:
        # clean 404 instead of StopIteration/500
        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
    return result


# CHANGED: /semantic/page propagates 404 cleanly
@router.get("/semantic/page")
def semantic_page(title: str = Query(...)) -> Dict[str, Any]:
    # fetch the SMW printouts
    entries = semantic_pages(category="Übungen")  # adjust here if the title→category mapping differs
    entry = entries.get(title)
    if not entry:
        raise HTTPException(status_code=404, detail="Exercise not found in the SMW ask result.")
    # fetch pageinfo & wikitext
    info = page_info(title=title)  # raises 404 if unknown
    parsed = parse_page(pageid=info.pageid, title=title)
    return {
        "title": title,
        "pageid": info.pageid,
        "fullurl": info.fullurl,
        "printouts": entry.get("printouts", {}),
        "wikitext": parsed.wikitext,
    }
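

# Minimal mounting sketch (assumption: an application module that includes this
# router; the app below is illustrative and not part of this file):
#
#     from fastapi import FastAPI
#     from wiki_router import router as wiki_router
#
#     app = FastAPI()
#     app.include_router(wiki_router)
#     # -> serves GET /import/wiki/health, POST /import/wiki/login, ...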