llm-api/wiki_router.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-08-11 06:58:20 +02:00
parent 8302a7fecf
commit a02008ec17

View File

@ -7,56 +7,115 @@ Beschreibung:
* /login: Führt clientlogin durch und speichert Session-Cookies. * /login: Führt clientlogin durch und speichert Session-Cookies.
* /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask. * /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
* /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab. * /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
* /info: Liefert pageid und fullurl über Core-API Query. * /info: Liefert pageid und fullurl über Core-API Query. # CHANGED: robustes 404-Handling
* /semantic/page: Liefert Metadaten einer Übung und Wikitext sowie pageid über Core-API. * /semantic/page: Liefert Metadaten einer Übung, pageid und Wikitext. # CHANGED: propagiert 404
Version: 1.2.0 Version: 1.3.0
""" """
from dotenv import load_dotenv
load_dotenv() # HINWEIS: API-Signaturen/URLs bleiben UNVERÄNDERT.
# Markierungen: # NEW / # CHANGED
from typing import Dict, Any, Optional
from dataclasses import dataclass
from fastapi import APIRouter, HTTPException, Query from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel from pydantic import BaseModel
from typing import Dict, Any, List import os
import requests, os import requests
from dotenv import load_dotenv
load_dotenv()

router = APIRouter(prefix="/import/wiki", tags=["wiki"])
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")

# One shared session so login cookies persist across endpoint calls.
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
# =====================
# Schemas
# =====================
class WikiLoginRequest(BaseModel):
    # Credentials forwarded to the wiki clientlogin.
    username: str
    password: str
class WikiLoginResponse(BaseModel):
    # "success" on a passed login; message carries failure details if any.
    status: str
    message: Optional[str] = None
class PageInfoResponse(BaseModel):
    # Core-API page metadata: numeric id, canonical title, full URL.
    pageid: int
    title: str
    fullurl: str


class PageContentResponse(BaseModel):
    # Raw wikitext payload for one page.
    pageid: int
    title: str
    wikitext: str
# =====================
# Helpers
# =====================
# NEW: Title-Normalisierung (Unterstrich, Gedankenstrich)
_dash_variants = ("-", "", "")
def _normalize_titles(title: str):
yield title
# Leerzeichen → Unterstrich
if " " in title:
yield title.replace(" ", "_")
# Gedankenstriche ↔ Bindestrich
for dv in _dash_variants:
for dv2 in _dash_variants:
if dv != dv2 and dv in title:
yield title.replace(dv, dv2)
# Robust pageinfo lookup (None when not found).
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
    """Resolve *title* (trying normalized variants) to Core-API page info.

    Returns None when no candidate title exists in the wiki. Raises
    HTTPException(502) only when EVERY candidate failed with an upstream
    error — the original code raised on the first failure even though its
    own comment promised not to block the next candidate.
    """
    base_params = {
        "action": "query",
        "format": "json",
        "prop": "info",
        "inprop": "url",
        "redirects": 1,  # follow redirects
    }
    last_error: Optional[Exception] = None
    got_response = False
    for candidate in _normalize_titles(title):
        try:
            r = wiki_session.get(
                WIKI_API_URL,
                params={**base_params, "titles": candidate},
                timeout=10,
            )
            r.raise_for_status()
        except Exception as e:
            # Upstream problem: remember it, but try the next candidate.
            last_error = e
            continue
        got_response = True
        pages = r.json().get("query", {}).get("pages", {}) or {}
        if not isinstance(pages, dict) or not pages:
            continue
        # MediaWiki returns a dict {pageid(str): {...}}
        pid_str, page = next(iter(pages.items()))
        # Missing page? ("missing" key present, or pseudo-id "-1")
        if page.get("missing") is not None or str(pid_str) == "-1":
            continue
        try:
            pid = int(pid_str)
        except ValueError:
            pid = int(page.get("pageid", -1))
        return PageInfoResponse(
            pageid=pid,
            title=page.get("title") or candidate,
            fullurl=page.get("fullurl") or page.get("canonicalurl") or "",
        )
    if last_error is not None and not got_response:
        raise HTTPException(status_code=502, detail=f"Info-Error: {last_error}")
    return None
# =====================
# Endpoints
# =====================
@router.get("/health")
def health() -> Dict[str, str]:
try:
r = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "format": "json"}, timeout=5)
r.raise_for_status()
except Exception as e: except Exception as e:
raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}") raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
return {"status": "ok"} return {"status": "ok"}
# Login endpoint: MediaWiki clientlogin using the shared cookie session.
@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    """Perform a clientlogin against the wiki and keep the session cookies.

    Raises 502 on token/upstream errors and 401 when credentials are rejected.
    """
    # Fetch a login token
    try:
        token_resp = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
            timeout=10,
        )
        token_resp.raise_for_status()
        token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
    if not token:
        raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
    # clientlogin. NOTE: action=clientlogin requires loginreturnurl (or
    # logincontinue) per the MediaWiki API — it must not be dropped, or the
    # API answers with a FAIL/badreturnurl-style error.
    try:
        login_resp = wiki_session.post(
            WIKI_API_URL,
            data={
                "action": "clientlogin",
                "format": "json",
                "username": data.username,
                "password": data.password,
                "logintoken": token,
                "loginreturnurl": "http://localhost:8000",
            },
            timeout=15,
        )
        login_resp.raise_for_status()
        status = login_resp.json().get("clientlogin", {}).get("status")
        if status != "PASS":
            raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {status}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
    return WikiLoginResponse(status="success")
# SMW ask: all exercises including subcategories.
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
    """List all pages of *category* via a Semantic MediaWiki ask query."""
    ask = f"[[Category:{category}]]|limit=50000"
    query_params = {"action": "ask", "query": ask, "format": "json"}
    try:
        resp = wiki_session.get(WIKI_API_URL, params=query_params, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {exc}")
    body = resp.json()
    return body.get("query", {}).get("results", {})
# Fetch raw wikitext via the parse endpoint (by pageid).
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: Optional[str] = Query(None)):
    """Fetch raw wikitext for *pageid* via action=parse.

    Raises 502 on upstream failure and 404 when the wiki reports no parse
    result for the pageid (previously this silently returned "" wikitext).
    """
    try:
        r = wiki_session.get(
            WIKI_API_URL,
            params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"},
            timeout=20,
        )
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
    parse = r.json().get("parse")
    if parse is None:
        # action=parse answers with an "error" object instead of "parse"
        # for unknown pageids — surface that as a clean 404.
        raise HTTPException(status_code=404, detail=f"Page not found: pageid={pageid}")
    wikitext = parse.get("wikitext", {}).get("*", "")
    return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
# /info: resolve a title to pageid + fullurl; 404 (not 500) for unknown titles.
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Seitentitel")):
    """Look up Core-API page info for *title*; 404 when the page is missing."""
    info = _fetch_pageinfo_by_title(title)
    if info is None:
        # clean 404 instead of StopIteration/500
        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
    return info
# Detail endpoint: SMW printouts + pageid/fullurl + raw wikitext for one page.
# CHANGED: category is a query parameter again (default "Übungen") instead of
# a hard-coded constant, restoring the configurability the old endpoint had
# while keeping the new single-title call backward-compatible.
@router.get("/semantic/page")
def semantic_page(
    title: str = Query(...),
    category: str = Query("Übungen", description="Kategorie ohne 'Category:'"),
) -> Dict[str, Any]:
    """Return SMW metadata, pageid, fullurl and wikitext for one exercise page.

    Raises 404 when the page is unknown to SMW or to the Core API, and
    propagates 502 from the underlying wiki calls.
    """
    # SMW printouts
    entries = semantic_pages(category=category)
    entry = entries.get(title)
    if not entry:
        raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
    # Pageinfo & wikitext
    info = page_info(title=title)  # raises 404 for unknown titles
    parsed = parse_page(pageid=info.pageid, title=title)
    return {
        "title": title,
        "pageid": info.pageid,
        "fullurl": info.fullurl,
        "printouts": entry.get("printouts", {}),
        "wikitext": parsed.wikitext,
    }