Trainer_LLM/llm-api/wiki_router.py
Lars d9abcb3ef4
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
llm-api/wiki_router.py aktualisiert
2025-08-11 11:01:23 +02:00

230 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
File: wiki_router.py
Beschreibung:
- Endpunkte für MediaWiki-Integration im lokalen Netzwerk.
- Funktionen:
* /health: Prüft Verfügbarkeit der MediaWiki-API.
* /login: Führt clientlogin durch und speichert Session-Cookies.
* /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
* /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
* /info: Liefert pageid und fullurl über Core-API Query. # CHANGED: robustes 404-Handling
* /semantic/page: Liefert Metadaten einer Übung, pageid und Wikitext. # CHANGED: propagiert 404
Version: 1.3.0
"""
# HINWEIS: API-Signaturen/URLs bleiben UNVERÄNDERT.
# Markierungen: # NEW / # CHANGED
from typing import Dict, Any, Optional
from dataclasses import dataclass
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
import os
import requests
from dotenv import load_dotenv
load_dotenv()
router = APIRouter(prefix="/import/wiki", tags=["wiki"])
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")
# Session für Cookies (Login)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
# =====================
# Schemas
# =====================
class WikiLoginRequest(BaseModel):
    """Request body for POST /login: MediaWiki credentials."""
    username: str  # wiki account name (used for clientlogin and legacy lgname)
    password: str  # plain-text password, forwarded only to the wiki API
class WikiLoginResponse(BaseModel):
    """Response body for POST /login."""
    status: str  # "success" on a completed login
    message: Optional[str] = None  # optional human-readable detail
class PageInfoResponse(BaseModel):
    """Core-API page info as returned by GET /info."""
    pageid: int  # MediaWiki numeric page id
    title: str  # canonical page title (after redirect resolution)
    fullurl: str  # absolute URL of the page ("" if the API omitted it)
class PageContentResponse(BaseModel):
    """Raw wikitext of a page as returned by GET /parsepage."""
    pageid: int  # MediaWiki numeric page id the text belongs to
    title: str  # title as passed by the caller ("" if not supplied)
    wikitext: str  # raw wikitext markup ("" when the parse result was empty)
# =====================
# Helpers
# =====================
# NEW: Title-Normalisierung (Unterstrich, Gedankenstrich)
_dash_variants = ("-", "", "")
def _normalize_titles(title: str):
yield title
# Leerzeichen → Unterstrich
if " " in title:
yield title.replace(" ", "_")
# Gedankenstriche ↔ Bindestrich
for dv in _dash_variants:
for dv2 in _dash_variants:
if dv != dv2 and dv in title:
yield title.replace(dv, dv2)
# NEW: Robustes Pageinfo (None wenn nicht gefunden)
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
    """Resolve *title* (trying normalised spellings) via the Core API.

    Returns a PageInfoResponse for the first candidate spelling that
    exists, None when every candidate is missing, and raises
    HTTPException(502) only when no candidate could be checked because
    the upstream API failed.

    FIX: the original raised 502 on the first transport error, although
    its own comment said the error must not block the next candidate;
    now the loop continues and the last error is reported only at the end.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "info",
        "inprop": "url",
        "redirects": 1,  # follow redirects
    }
    last_error: Optional[Exception] = None
    for candidate in _normalize_titles(title):
        try:
            r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
            r.raise_for_status()
        except Exception as e:
            # Upstream hiccup for this candidate: remember it and try the
            # next spelling instead of aborting immediately.
            last_error = e
            continue
        pages = r.json().get("query", {}).get("pages", {}) or {}
        if not isinstance(pages, dict) or not pages:
            continue
        # MediaWiki returns a dict {pageid(str): {...}} for a single title.
        pid_str, page = next(iter(pages.items()))
        # Missing page? (MediaWiki marks it with a "missing" key / pageid "-1")
        if page.get("missing") is not None or str(pid_str) == "-1":
            continue
        title_out = page.get("title") or candidate
        fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
        try:
            pid = int(pid_str)
        except ValueError:
            pid = int(page.get("pageid", -1))
        return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
    if last_error is not None:
        # Every candidate failed at the transport level -> upstream error,
        # not a clean "page does not exist".
        raise HTTPException(status_code=502, detail=f"Info-Error: {last_error}")
    return None
# =====================
# Endpoints
# =====================
@router.get("/health")
def health() -> Dict[str, str]:
try:
r = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "format": "json"}, timeout=5)
r.raise_for_status()
except Exception as e:
raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
return {"status": "ok"}
@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
# 1) Token holen
try:
tok = wiki_session.get(
WIKI_API_URL,
params={"action":"query","meta":"tokens","type":"login","format":"json"},
timeout=10,
)
tok.raise_for_status()
logintoken = tok.json().get("query", {}).get("tokens", {}).get("logintoken")
if not logintoken:
raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
except Exception as e:
raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
# 2) Versuch: clientlogin (mit loginreturnurl!)
try:
cl = wiki_session.post(
WIKI_API_URL,
data={
"action": "clientlogin",
"format": "json",
"username": data.username,
"password": data.password,
"logintoken": logintoken,
"loginreturnurl": "https://example.org/" # notwendig bei manchen Setups
},
timeout=15,
)
cl.raise_for_status()
clj = cl.json().get("clientlogin", {})
if clj.get("status") == "PASS":
return WikiLoginResponse(status="success")
# Falls UI/FAIL/etc.: weiter mit Legacy
except Exception as e:
# nicht sofort fehlschlagen Legacy probieren
pass
# 3) Fallback: action=login (Legacy)
try:
lg = wiki_session.post(
WIKI_API_URL,
data={
"action": "login",
"format": "json",
"lgname": data.username,
"lgpassword": data.password,
"lgtoken": logintoken,
},
timeout=15,
)
lg.raise_for_status()
res = lg.json().get("login", {}).get("result")
if res == "Success":
return WikiLoginResponse(status="success")
raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen (legacy): {res}")
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=502, detail=f"Login-Error (legacy): {e}")
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
smw_query = f"[[Category:{category}]]"
ask_query = f"{smw_query}|limit=50000"
try:
r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
r.raise_for_status()
except Exception as e:
raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
return r.json().get("query", {}).get("results", {})
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: str = Query(None)):
try:
r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
r.raise_for_status()
except Exception as e:
raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
# CHANGED: robust /info — unknown titles yield 404 instead of 500.
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Seitentitel")):
    """Return pageid, canonical title and full URL for *title* (404 if absent)."""
    info = _fetch_pageinfo_by_title(title)
    if info is None:
        # Clean 404 instead of a StopIteration/500 bubbling up.
        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
    return info
# CHANGED: /semantic/page propagates 404 cleanly.
@router.get("/semantic/page")
def semantic_page(title: str = Query(...)) -> Dict[str, Any]:
    """Return SMW printouts, pageid/fullurl and raw wikitext for one exercise."""
    # Fetch the SMW printouts for the whole exercise category first.
    # NOTE(review): category is hard-coded — adjust here if the
    # title->category mapping differs.
    results = semantic_pages(category="Übungen")
    record = results.get(title)
    if not record:
        raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
    # Resolve page info (raises 404 for unknown titles) and the raw markup.
    info = page_info(title=title)
    content = parse_page(pageid=info.pageid, title=title)
    return {
        "title": title,
        "pageid": info.pageid,
        "fullurl": info.fullurl,
        "printouts": record.get("printouts", {}),
        "wikitext": content.wikitext,
    }