llm-api/wiki_router.py updated

Lars 2025-08-11 06:58:20 +02:00
parent 8302a7fecf
commit a02008ec17


"""
Description:
* /login: Performs clientlogin and stores the session cookies.
* /semantic/pages: Lists all exercises including subcategories via SMW ask.
* /parsepage: Fetches the raw wikitext of a page via action=parse.
* /info: Returns pageid and fullurl via a core API query.  # CHANGED: robust 404 handling
* /semantic/page: Returns an exercise's metadata, pageid, and wikitext.  # CHANGED: propagates 404
Version: 1.3.0
"""
# NOTE: API signatures/URLs remain UNCHANGED.
# Markers: # NEW / # CHANGED
from typing import Dict, Any, Optional

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
import os
import requests
from dotenv import load_dotenv

load_dotenv()

router = APIRouter(prefix="/import/wiki", tags=["wiki"])
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")

# Session for cookies (login)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
# =====================
# Schemas
# =====================
class WikiLoginRequest(BaseModel):
    username: str
    password: str


class WikiLoginResponse(BaseModel):
    status: str
    message: Optional[str] = None


class PageContentResponse(BaseModel):
    pageid: int
    title: str
    wikitext: str
    message: Optional[str] = None


class PageInfoResponse(BaseModel):
    pageid: int
    title: str
    fullurl: str
# =====================
# Helpers
# =====================
# NEW: title normalization (underscores, dash variants)
_dash_variants = ("-", "\u2013", "\u2014")  # hyphen-minus, en dash, em dash


def _normalize_titles(title: str):
    yield title
    # spaces -> underscores
    if " " in title:
        yield title.replace(" ", "_")
    # dash variants <-> hyphen
    for dv in _dash_variants:
        for dv2 in _dash_variants:
            if dv != dv2 and dv in title:
                yield title.replace(dv, dv2)
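# Example (hypothetical title): _normalize_titles("Mae-Geri Keage") yields the
# title itself, "Mae-Geri_Keage", and the en/em dash spellings of the hyphen;
# duplicates are harmless, each candidate is simply queried in turn below.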
# NEW: robust pageinfo lookup (None when not found)
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
    params = {
        "action": "query",
        "format": "json",
        "prop": "info",
        "inprop": "url",
        "redirects": 1,  # follow redirects
    }
    last_error: Optional[Exception] = None
    for candidate in _normalize_titles(title):
        try:
            r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
            r.raise_for_status()
        except Exception as e:
            # upstream hiccup -> remember it, but do not block the next candidate
            last_error = e
            continue
        pages = r.json().get("query", {}).get("pages", {}) or {}
        if not isinstance(pages, dict) or not pages:
            continue
        # MediaWiki returns a dict {pageid(str): {..}}
        pid_str, page = next(iter(pages.items()))
        # missing page?
        if page.get("missing") is not None or str(pid_str) == "-1":
            continue
        title_out = page.get("title") or candidate
        fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
        try:
            pid = int(pid_str)
        except ValueError:
            pid = int(page.get("pageid", -1))
        return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
    if last_error is not None:
        # all candidates failed upstream -> surface as 502
        raise HTTPException(status_code=502, detail=f"Info-Error: {last_error}")
    return None
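# Example (hypothetical title): _fetch_pageinfo_by_title("Heian Shodan") tries
# "Heian Shodan", then "Heian_Shodan", and returns None only when no spelling
# variant resolves to an existing page.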
# =====================
# Endpoints
# =====================
@router.get("/health")
def health() -> Dict[str, str]:
    try:
        r = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "siteinfo", "format": "json"},
            timeout=5,
        )
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Wiki not reachable: {e}")
    return {"status": "ok"}
# Login Endpoint
@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    # fetch login token
    try:
        token_resp = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
            timeout=10,
        )
        token_resp.raise_for_status()
        token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
    if not token:
        raise HTTPException(status_code=502, detail="No login token received")

    # clientlogin
    try:
        login_resp = wiki_session.post(
            WIKI_API_URL,
            data={
                "action": "clientlogin",
                "format": "json",
                "username": data.username,
                "password": data.password,
                "logintoken": token,
                "loginreturnurl": "http://localhost:8000",
            },
            timeout=15,
        )
        login_resp.raise_for_status()
        status = login_resp.json().get("clientlogin", {}).get("status")
        if status != "PASS":
            raise HTTPException(status_code=401, detail=f"Login failed: {status}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
    return WikiLoginResponse(status="success")
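# For reference, clientlogin responds with
# {"clientlogin": {"status": "PASS" | "FAIL" | "UI" | "REDIRECT" | "RESTART", ...}};
# only PASS counts as success here, every other status maps to 401.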
# SMW ask: all exercises including subcategories
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Category without 'Category:'")) -> Dict[str, Any]:
    smw_query = f"[[Category:{category}]]"
    ask_query = f"{smw_query}|limit=50000"
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
    return r.json().get("query", {}).get("results", {})
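# For reference, the ask result maps page titles to result objects, roughly
# {"<Title>": {"printouts": {...}, "fulltext": "<Title>", "fullurl": "..."}},
# which is why semantic_page() below can look up an entry by its title.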
# Fetch wikitext via the parse endpoint (by pageid)
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: Optional[str] = Query(None)):
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
    wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
    return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
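# For reference, action=parse with format=json nests the text one level deep,
# {"parse": {"pageid": ..., "wikitext": {"*": "..."}}}, hence .get("*", "") above.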
# CHANGED: robust /info with 404 instead of 500 for unknown titles
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Page title")):
    result = _fetch_pageinfo_by_title(title)
    if not result:
        # clean 404 instead of StopIteration/500
        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
    return result
# Detail endpoint for one exercise: metadata from ask + wikitext & id via core API
# CHANGED: /semantic/page propagates 404 cleanly
@router.get("/semantic/page")
def semantic_page(title: str = Query(...)) -> Dict[str, Any]:
    # fetch SMW printouts
    entries = semantic_pages(category="Übungen")  # adjust here if the title->category mapping differs
    entry = entries.get(title)
    if not entry:
        raise HTTPException(status_code=404, detail="Exercise not found in the SMW ask result.")
    # fetch pageinfo & wikitext
    info = page_info(title=title)  # raises 404 if unknown
    parsed = parse_page(pageid=info.pageid, title=title)
    return {
        "title": title,
        "pageid": info.pageid,
        "fullurl": info.fullurl,
        "printouts": entry.get("printouts", {}),
        "wikitext": parsed.wikitext,
    }
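
A minimal usage sketch against this router (assumptions: the app is served at http://localhost:8000, and the credentials and the page title "Heian Shodan" are placeholders):

import requests

BASE = "http://localhost:8000/import/wiki"  # assumed host/port

# health check
print(requests.get(f"{BASE}/health", timeout=5).json())

# login; the proxy keeps the MediaWiki session server-side
requests.post(f"{BASE}/login", json={"username": "bot", "password": "secret"}, timeout=15)

# resolve a page, then fetch its wikitext
info = requests.get(f"{BASE}/info", params={"title": "Heian Shodan"}, timeout=10).json()
page = requests.get(f"{BASE}/parsepage", params={"pageid": info["pageid"], "title": info["title"]}, timeout=20).json()
print(page["title"], page["wikitext"][:200])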