llm-api/wiki_router.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
This commit is contained in:
parent
8302a7fecf
commit
a02008ec17
|
|
@ -7,56 +7,115 @@ Beschreibung:
|
||||||
* /login: Führt clientlogin durch und speichert Session-Cookies.
|
* /login: Führt clientlogin durch und speichert Session-Cookies.
|
||||||
* /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
|
* /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
|
||||||
* /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
|
* /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
|
||||||
* /info: Liefert pageid und fullurl über Core-API Query.
|
* /info: Liefert pageid und fullurl über Core-API Query. # CHANGED: robustes 404-Handling
|
||||||
* /semantic/page: Liefert Metadaten einer Übung und Wikitext sowie pageid über Core-API.
|
* /semantic/page: Liefert Metadaten einer Übung, pageid und Wikitext. # CHANGED: propagiert 404
|
||||||
Version: 1.2.0
|
Version: 1.3.0
|
||||||
"""
|
"""
|
||||||
from dotenv import load_dotenv
|
|
||||||
load_dotenv()
|
# HINWEIS: API-Signaturen/URLs bleiben UNVERÄNDERT.
|
||||||
|
# Markierungen: # NEW / # CHANGED
|
||||||
|
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
from fastapi import APIRouter, HTTPException, Query
|
from fastapi import APIRouter, HTTPException, Query
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Dict, Any, List
|
import os
|
||||||
import requests, os
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
__version__ = "1.2.0"
|
load_dotenv()
|
||||||
router = APIRouter()
|
|
||||||
|
|
||||||
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
|
router = APIRouter(prefix="/import/wiki", tags=["wiki"])
|
||||||
|
|
||||||
|
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")
|
||||||
|
|
||||||
|
# Session für Cookies (Login)
|
||||||
wiki_session = requests.Session()
|
wiki_session = requests.Session()
|
||||||
|
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
|
||||||
|
|
||||||
|
# =====================
|
||||||
|
# Schemas
|
||||||
|
# =====================
|
||||||
class WikiLoginRequest(BaseModel):
|
class WikiLoginRequest(BaseModel):
|
||||||
username: str
|
username: str
|
||||||
password: str
|
password: str
|
||||||
|
|
||||||
class WikiLoginResponse(BaseModel):
|
class WikiLoginResponse(BaseModel):
|
||||||
status: str
|
status: str
|
||||||
message: str | None = None
|
message: Optional[str] = None
|
||||||
|
|
||||||
class PageContentResponse(BaseModel):
|
|
||||||
pageid: int
|
|
||||||
title: str
|
|
||||||
wikitext: str
|
|
||||||
|
|
||||||
class PageInfoResponse(BaseModel):
|
class PageInfoResponse(BaseModel):
|
||||||
pageid: int
|
pageid: int
|
||||||
title: str
|
title: str
|
||||||
fullurl: str
|
fullurl: str
|
||||||
|
|
||||||
# Health-Check
|
class PageContentResponse(BaseModel):
|
||||||
@router.get("/health")
|
pageid: int
|
||||||
def health_check():
|
title: str
|
||||||
|
wikitext: str
|
||||||
|
|
||||||
|
# =====================
|
||||||
|
# Helpers
|
||||||
|
# =====================
|
||||||
|
# NEW: Title-Normalisierung (Unterstrich, Gedankenstrich)
|
||||||
|
_dash_variants = ("-", "–", "—")
|
||||||
|
|
||||||
|
def _normalize_titles(title: str):
|
||||||
|
yield title
|
||||||
|
# Leerzeichen → Unterstrich
|
||||||
|
if " " in title:
|
||||||
|
yield title.replace(" ", "_")
|
||||||
|
# Gedankenstriche ↔ Bindestrich
|
||||||
|
for dv in _dash_variants:
|
||||||
|
for dv2 in _dash_variants:
|
||||||
|
if dv != dv2 and dv in title:
|
||||||
|
yield title.replace(dv, dv2)
|
||||||
|
|
||||||
|
# NEW: Robustes Pageinfo (None wenn nicht gefunden)
|
||||||
|
def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
|
||||||
|
params = {
|
||||||
|
"action": "query",
|
||||||
|
"format": "json",
|
||||||
|
"prop": "info",
|
||||||
|
"inprop": "url",
|
||||||
|
"redirects": 1, # folgt Weiterleitungen
|
||||||
|
}
|
||||||
|
for candidate in _normalize_titles(title):
|
||||||
try:
|
try:
|
||||||
resp = wiki_session.get(
|
r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
|
||||||
WIKI_API_URL,
|
r.raise_for_status()
|
||||||
params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"},
|
except Exception as e:
|
||||||
timeout=5
|
# Upstream gestört → 502 (aber nicht für nächste Candidate blockieren)
|
||||||
)
|
raise HTTPException(status_code=502, detail=f"Info-Error: {e}")
|
||||||
resp.raise_for_status()
|
pages = r.json().get("query", {}).get("pages", {}) or {}
|
||||||
|
if not isinstance(pages, dict) or not pages:
|
||||||
|
continue
|
||||||
|
# MediaWiki liefert dict {pageid(str): {..}}
|
||||||
|
pid_str, page = next(iter(pages.items()))
|
||||||
|
# Missing?
|
||||||
|
if page.get("missing") is not None or str(pid_str) == "-1":
|
||||||
|
continue
|
||||||
|
title_out = page.get("title") or candidate
|
||||||
|
fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
|
||||||
|
try:
|
||||||
|
pid = int(pid_str)
|
||||||
|
except ValueError:
|
||||||
|
pid = int(page.get("pageid", -1))
|
||||||
|
return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# =====================
|
||||||
|
# Endpoints
|
||||||
|
# =====================
|
||||||
|
@router.get("/health")
|
||||||
|
def health() -> Dict[str, str]:
|
||||||
|
try:
|
||||||
|
r = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "format": "json"}, timeout=5)
|
||||||
|
r.raise_for_status()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
|
raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
# Login Endpoint
|
|
||||||
@router.post("/login", response_model=WikiLoginResponse)
|
@router.post("/login", response_model=WikiLoginResponse)
|
||||||
def login(data: WikiLoginRequest):
|
def login(data: WikiLoginRequest):
|
||||||
# Token holen
|
# Token holen
|
||||||
|
|
@ -64,7 +123,7 @@ def login(data: WikiLoginRequest):
|
||||||
token_resp = wiki_session.get(
|
token_resp = wiki_session.get(
|
||||||
WIKI_API_URL,
|
WIKI_API_URL,
|
||||||
params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
|
params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
|
||||||
timeout=10
|
timeout=10,
|
||||||
)
|
)
|
||||||
token_resp.raise_for_status()
|
token_resp.raise_for_status()
|
||||||
token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
|
token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
|
||||||
|
|
@ -72,6 +131,7 @@ def login(data: WikiLoginRequest):
|
||||||
raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
|
raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
|
||||||
if not token:
|
if not token:
|
||||||
raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
|
raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
|
||||||
|
|
||||||
# clientlogin
|
# clientlogin
|
||||||
try:
|
try:
|
||||||
login_resp = wiki_session.post(
|
login_resp = wiki_session.post(
|
||||||
|
|
@ -82,92 +142,64 @@ def login(data: WikiLoginRequest):
|
||||||
"username": data.username,
|
"username": data.username,
|
||||||
"password": data.password,
|
"password": data.password,
|
||||||
"logintoken": token,
|
"logintoken": token,
|
||||||
"loginreturnurl": "http://localhost:8000"
|
|
||||||
},
|
},
|
||||||
timeout=10
|
timeout=15,
|
||||||
)
|
)
|
||||||
login_resp.raise_for_status()
|
login_resp.raise_for_status()
|
||||||
status = login_resp.json().get("clientlogin", {}).get("status")
|
status = login_resp.json().get("clientlogin", {}).get("status")
|
||||||
except Exception:
|
|
||||||
status = None
|
|
||||||
# fallback login
|
|
||||||
if status != "PASS":
|
if status != "PASS":
|
||||||
alt = wiki_session.post(
|
raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {status}")
|
||||||
WIKI_API_URL,
|
except HTTPException:
|
||||||
data={"action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password},
|
raise
|
||||||
timeout=10
|
except Exception as e:
|
||||||
)
|
raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
|
||||||
alt.raise_for_status()
|
return WikiLoginResponse(status="success")
|
||||||
status = alt.json().get("login", {}).get("result")
|
|
||||||
if status in ("PASS", "Success"):
|
|
||||||
return WikiLoginResponse(status="success", message=None)
|
|
||||||
return WikiLoginResponse(status="failed", message="Login fehlgeschlagen")
|
|
||||||
|
|
||||||
# SMW-Ask: alle Übungen inkl. Unterkategorien
|
|
||||||
@router.get("/semantic/pages")
|
@router.get("/semantic/pages")
|
||||||
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
|
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
|
||||||
smw_query = f"[[Category:{category}]]"
|
smw_query = f"[[Category:{category}]]"
|
||||||
ask_query = f"{smw_query}|limit=50000"
|
ask_query = f"{smw_query}|limit=50000"
|
||||||
r = wiki_session.get(
|
|
||||||
WIKI_API_URL,
|
|
||||||
params={"action": "ask", "query": ask_query, "format": "json"},
|
|
||||||
timeout=30
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
|
r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
|
raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
|
||||||
return r.json().get("query", {}).get("results", {})
|
return r.json().get("query", {}).get("results", {})
|
||||||
|
|
||||||
# Wikitext über parse-Endpoint holen (per pageid)
|
|
||||||
@router.get("/parsepage", response_model=PageContentResponse)
|
@router.get("/parsepage", response_model=PageContentResponse)
|
||||||
def parse_page(pageid: int = Query(...), title: str = Query(None)):
|
def parse_page(pageid: int = Query(...), title: str = Query(None)):
|
||||||
r = wiki_session.get(
|
|
||||||
WIKI_API_URL,
|
|
||||||
params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"},
|
|
||||||
timeout=20
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
|
r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
|
raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
|
||||||
wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
|
wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
|
||||||
return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
|
return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
|
||||||
|
|
||||||
# Pageinfo über Core-API (ermittelt pageid + fullurl)
|
# CHANGED: robustes /info mit 404 statt 500 bei unbekannten Titeln
|
||||||
@router.get("/info", response_model=PageInfoResponse)
|
@router.get("/info", response_model=PageInfoResponse)
|
||||||
def page_info(title: str = Query(..., description="Name der Seite")):
|
def page_info(title: str = Query(..., description="Seitentitel")):
|
||||||
r = wiki_session.get(
|
result = _fetch_pageinfo_by_title(title)
|
||||||
WIKI_API_URL,
|
if not result:
|
||||||
params={"action": "query", "titles": title, "prop": "info", "inprop": "url", "format": "json"},
|
# sauberes 404 statt StopIteration/500
|
||||||
timeout=10
|
raise HTTPException(status_code=404, detail=f"Page not found: {title}")
|
||||||
)
|
return result
|
||||||
try:
|
|
||||||
r.raise_for_status()
|
|
||||||
except Exception as e:
|
|
||||||
raise HTTPException(status_code=502, detail=f"Info-Error: {e}")
|
|
||||||
pages = r.json().get("query", {}).get("pages", {})
|
|
||||||
pid_str, page = next(iter(pages.items()))
|
|
||||||
pid = int(pid_str)
|
|
||||||
fullurl = page.get("fullurl")
|
|
||||||
return PageInfoResponse(pageid=pid, title=page.get("title"), fullurl=fullurl)
|
|
||||||
|
|
||||||
# Detail-Endpoint für eine Übung: Metadaten aus Ask + Wikitext & ID via Core-API
|
# CHANGED: /semantic/page propagiert 404 sauber weiter
|
||||||
@router.get("/semantic/page", response_model=Dict[str, Any])
|
@router.get("/semantic/page")
|
||||||
def semantic_page_detail(category: str = Query(...), title: str = Query(...)) -> Dict[str, Any]:
|
def semantic_page(title: str = Query(...)) -> Dict[str, Any]:
|
||||||
# Metadaten aus SMW-Ask
|
# SMW-Printouts beschaffen
|
||||||
entries = semantic_pages(category)
|
entries = semantic_pages(category="Übungen") # falls Titel→Kategorie-Mapping anders: hier anpassen
|
||||||
entry = entries.get(title)
|
entry = entries.get(title)
|
||||||
if not entry:
|
if not entry:
|
||||||
raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
|
raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
|
||||||
# Pageinfo via Core-API
|
# Pageinfo & Wikitext holen
|
||||||
info = page_info(title=title)
|
info = page_info(title=title) # gibt 404 wenn unbekannt
|
||||||
# Wikitext via parse
|
|
||||||
parsed = parse_page(pageid=info.pageid, title=title)
|
parsed = parse_page(pageid=info.pageid, title=title)
|
||||||
return {
|
return {
|
||||||
"title": title,
|
"title": title,
|
||||||
"pageid": info.pageid,
|
"pageid": info.pageid,
|
||||||
"fullurl": info.fullurl,
|
"fullurl": info.fullurl,
|
||||||
"printouts": entry.get("printouts", {}),
|
"printouts": entry.get("printouts", {}),
|
||||||
"wikitext": parsed.wikitext
|
"wikitext": parsed.wikitext,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user