llm-api/wiki_router.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
This commit is contained in:
parent
8302a7fecf
commit
a02008ec17
|
|
@ -7,56 +7,115 @@ Beschreibung:
|
|||
* /login: Performs clientlogin and stores session cookies.
* /semantic/pages: Lists all exercises incl. subcategories via SMW ask.
* /parsepage: Fetches raw wikitext via action=parse for a given page.
* /info: Returns pageid and fullurl via Core-API query.  # CHANGED: robust 404 handling
* /semantic/page: Returns an exercise's metadata, pageid and wikitext.  # CHANGED: propagates 404

Version: 1.3.0
"""
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
# NOTE: API signatures/URLs remain UNCHANGED.
# Markers: # NEW / # CHANGED
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
from typing import Dict, Any, List
|
||||
import requests, os
|
||||
import os
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Keep in sync with the "Version:" line in the module docstring.
__version__ = "1.3.0"

# Wiki Core-API endpoint; override via the WIKI_API_URL environment variable.
# NOTE(review): default host taken from the newer of the two conflicting
# definitions in this file — confirm karatetrainer.de vs karatetrainer.net.
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")

router = APIRouter(prefix="/import/wiki", tags=["wiki"])

# Shared session so login cookies persist across all wiki requests.
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
|
||||
# =====================
|
||||
# Schemas
|
||||
# =====================
|
||||
class WikiLoginRequest(BaseModel):
    """Credentials payload accepted by the /login endpoint."""

    username: str
    password: str
||||
|
||||
class WikiLoginResponse(BaseModel):
    """Outcome of a login attempt; message carries optional detail."""

    status: str
    message: str | None = None
||||
|
||||
class PageContentResponse(BaseModel):
    """Raw wikitext of a single wiki page, as returned by /parsepage."""

    pageid: int
    title: str
    wikitext: str
    message: Optional[str] = None
||||
|
||||
class PageInfoResponse(BaseModel):
    """Core-API page metadata: numeric id, canonical title, full URL."""

    pageid: int
    title: str
    fullurl: str
||||
|
||||
# Health-Check
# NOTE(review): the diff left this function truncated and followed by a stray
# duplicate PageContentResponse definition; restored to a working probe.
@router.get("/health")
def health_check():
    """Liveness probe: verify the upstream wiki Core API answers a siteinfo query.

    Raises:
        HTTPException 502: when the wiki cannot be reached.
    """
    try:
        r = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "siteinfo", "format": "json"},
            timeout=5,
        )
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
    return {"status": "ok"}
||||
|
||||
# =====================
|
||||
# Helpers
|
||||
# =====================
|
||||
# NEW: Title-Normalisierung (Unterstrich, Gedankenstrich)
|
||||
_dash_variants = ("-", "–", "—")
|
||||
|
||||
def _normalize_titles(title: str):
|
||||
yield title
|
||||
# Leerzeichen → Unterstrich
|
||||
if " " in title:
|
||||
yield title.replace(" ", "_")
|
||||
# Gedankenstriche ↔ Bindestrich
|
||||
for dv in _dash_variants:
|
||||
for dv2 in _dash_variants:
|
||||
if dv != dv2 and dv in title:
|
||||
yield title.replace(dv, dv2)
|
||||
|
||||
# NEW: robust pageinfo lookup (returns None when the page is not found)
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
    """Resolve a wiki page title to pageid/title/fullurl via the Core API.

    Tries every normalized candidate from _normalize_titles and follows
    redirects. Returns None when no candidate resolves to an existing page.

    Raises:
        HTTPException 502: only when every candidate failed with an
            upstream error (a single failing candidate no longer aborts
            the remaining lookups).
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "info",
        "inprop": "url",
        "redirects": 1,  # follow redirects
    }
    last_error: Optional[Exception] = None
    for candidate in _normalize_titles(title):
        # CHANGED: dropped the redundant per-candidate siteinfo probe —
        # it doubled the upstream round-trips without adding information.
        try:
            r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
            r.raise_for_status()
        except Exception as e:
            # Upstream hiccup: remember the error but keep trying the
            # remaining candidates instead of failing immediately.
            last_error = e
            continue
        pages = r.json().get("query", {}).get("pages", {}) or {}
        if not isinstance(pages, dict) or not pages:
            continue
        # MediaWiki returns a dict of {pageid(str): {...}}
        pid_str, page = next(iter(pages.items()))
        # Missing page? (pageid "-1" or explicit "missing" flag)
        if page.get("missing") is not None or str(pid_str) == "-1":
            continue
        title_out = page.get("title") or candidate
        fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
        try:
            pid = int(pid_str)
        except ValueError:
            pid = int(page.get("pageid", -1))
        return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
    if last_error is not None:
        # Every candidate failed upstream -> surface as 502.
        raise HTTPException(status_code=502, detail=f"Info-Error: {last_error}")
    return None
|
||||
|
||||
# =====================
|
||||
# Endpoints
|
||||
# =====================
|
||||
@router.get("/health")
def health() -> Dict[str, str]:
    """Ping the wiki Core API; respond 502 when the upstream is unreachable."""
    probe = {"action": "query", "meta": "siteinfo", "format": "json"}
    try:
        resp = wiki_session.get(WIKI_API_URL, params=probe, timeout=5)
        resp.raise_for_status()
    except Exception as err:
        raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {err}")
    return {"status": "ok"}
|
||||
|
||||
# Login Endpoint
# NOTE(review): the diff interleaved the old fallback-login flow with the new
# strict 401 flow (duplicate timeout kwargs, unreachable returns); reconstructed
# to the newer coherent version.
@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    """Perform MediaWiki clientlogin; session cookies persist in wiki_session.

    Raises:
        HTTPException 502: token fetch failed or upstream error.
        HTTPException 401: the wiki rejected the credentials.
    """
    # Fetch a login token first (required by action=clientlogin).
    try:
        token_resp = wiki_session.get(
            WIKI_API_URL,
            params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
            timeout=10,
        )
        token_resp.raise_for_status()
        token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
    if not token:
        raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")

    # clientlogin
    try:
        login_resp = wiki_session.post(
            WIKI_API_URL,
            data={
                "action": "clientlogin",
                "format": "json",
                "username": data.username,
                "password": data.password,
                "logintoken": token,
                "loginreturnurl": "http://localhost:8000",
            },
            timeout=15,
        )
        login_resp.raise_for_status()
        status = login_resp.json().get("clientlogin", {}).get("status")
        if status != "PASS":
            raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {status}")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
    return WikiLoginResponse(status="success")
|
||||
|
||||
# SMW ask: all exercises incl. subcategories
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
    """List all pages of *category* via a Semantic MediaWiki ask query.

    Raises:
        HTTPException 502: on any upstream/transport error.
    """
    smw_query = f"[[Category:{category}]]"
    ask_query = f"{smw_query}|limit=50000"
    # CHANGED: the request is issued exactly once, inside the guard — the diff
    # had left a second, unguarded duplicate call whose result was discarded.
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
    return r.json().get("query", {}).get("results", {})
|
||||
|
||||
# Fetch wikitext via the parse endpoint (by pageid)
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: str = Query(None)):
    """Return the raw wikitext of a page via action=parse.

    The optional *title* is echoed back in the response only; the lookup
    itself is by pageid.

    Raises:
        HTTPException 502: on any upstream/transport error.
    """
    # CHANGED: single guarded request — the diff had left a duplicate
    # unguarded call before the try block.
    try:
        r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
        r.raise_for_status()
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
    wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
    return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
|
||||
|
||||
# Pageinfo via Core API (resolves pageid + fullurl)
# CHANGED: robust /info with 404 instead of 500 for unknown titles; the old
# inline implementation (which crashed with StopIteration on empty results)
# is replaced by the delegating version.
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Seitentitel")):
    """Resolve *title* to pageid/fullurl metadata.

    Raises:
        HTTPException 404: when no title candidate resolves to a page.
        HTTPException 502: when the upstream wiki is unreachable.
    """
    result = _fetch_pageinfo_by_title(title)
    if not result:
        # clean 404 instead of StopIteration/500
        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
    return result
|
||||
|
||||
# Detail endpoint for one exercise: SMW printouts + wikitext & id via Core API
# CHANGED: /semantic/page propagates 404 cleanly.
# CHANGED: category is a defaulted query parameter again instead of the
# hard-coded "Übungen" — backward compatible (default unchanged), but callers
# can target other categories.
@router.get("/semantic/page")
def semantic_page(title: str = Query(...), category: str = Query("Übungen")) -> Dict[str, Any]:
    """Combine SMW printouts, Core-API page info, and raw wikitext for one page.

    Raises:
        HTTPException 404: title missing from the SMW result or unknown to the wiki.
        HTTPException 502: upstream errors from any of the delegated calls.
    """
    # SMW printouts
    entries = semantic_pages(category=category)
    entry = entries.get(title)
    if not entry:
        raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
    # Page info & wikitext
    info = page_info(title=title)  # raises 404 for unknown titles
    parsed = parse_page(pageid=info.pageid, title=title)
    return {
        "title": title,
        "pageid": info.pageid,
        "fullurl": info.fullurl,
        "printouts": entry.get("printouts", {}),
        "wikitext": parsed.wikitext,
    }
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user