New feature to stabilize PageID

Yes, let's lock this down properly. I've put a hardened wiki_router.py (v1.4.0) in the canvas. It:
- enriches /semantic/pages with pageid + fullurl for every title (batching + redirects + converttitles),
- makes /info tolerant of title variants and 404-safe,
- adds retries + light throttling to all MediaWiki calls,
- keeps the same routes and parameters (no breaking changes),
- logs coverage so we can see where things go sideways.
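
Once the router is mounted, a sanity check along these lines should show the new behavior (a minimal sketch; the base URL, the /import/wiki prefix from the docstring, and the category/title values are assumptions, not fixed values):

import requests

BASE = "http://localhost:8000/import/wiki"  # assumed host/port; prefix as in main.py's include_router

# Fetch all pages of a category; each entry should now carry pageid + fullurl.
pages = requests.get(f"{BASE}/semantic/pages", params={"category": "Kata"}, timeout=30).json()  # example category
without_id = [t for t, p in pages.items() if not p.get("pageid")]
print(f"{len(pages)} pages, {len(without_id)} without pageid")

# Resolve a single title; the endpoint now returns 404 instead of 500 when nothing matches.
resp = requests.get(f"{BASE}/info", params={"title": "Heian Shodan"}, timeout=30)  # example title
print(resp.json()["fullurl"] if resp.ok else f"not found ({resp.status_code})")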
wiki_router.py (Python, 235 lines, 8.7 KiB):
"""
|
||
wiki_router.py - v1.4.0 (stabil & nachvollziehbar)
|
||
|
||
Ziele:
|
||
- Keine API-Signaturänderungen (bestehende Routen bleiben)
|
||
- /semantic/pages reichert pageid/fullurl für ALLE Titel batchweise an (redirects=1, converttitles=1)
|
||
- /info robust: 404 statt 500, mit Titel-Varianten
|
||
- Wiederholungen & Throttling gegen MediaWiki
|
||
- Optionale Diagnose-Ausgaben und Coverage-Kennzahlen
|
||
|
||
Annahme: Der Router wird in main.py mit Prefix eingebunden:
|
||
app.include_router(wiki_router, prefix="/import/wiki")
|
||
|
||
Wenn ihr stattdessen den Prefix im Router setzen wollt, einfach in der APIRouter-Zeile unten
|
||
prefix="/import/wiki" ergänzen und in main.py OHNE prefix einbinden.
|
||
"""
|
||
|
||
from typing import Dict, Any, Optional, List, Tuple
from fastapi import APIRouter, HTTPException, Query, Request
from pydantic import BaseModel
import os, time, logging
import requests
from dotenv import load_dotenv

load_dotenv()

logger = logging.getLogger("wiki_router")
logger.setLevel(logging.INFO)

router = APIRouter(tags=["wiki"])  # prefix comes from main.py via include_router(..., prefix="/import/wiki")

# -------- Configuration --------
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
WIKI_TIMEOUT = float(os.getenv("WIKI_TIMEOUT", "15"))
WIKI_BATCH = int(os.getenv("WIKI_BATCH", "50"))
WIKI_RETRIES = int(os.getenv("WIKI_RETRIES", "1"))  # additional attempts on upstream errors
WIKI_SLEEPMS = int(os.getenv("WIKI_SLEEP_MS", "0"))  # throttle between requests

# Single session (cookies for login)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.4"})

# -------- Schemas --------
class WikiLoginRequest(BaseModel):
    username: str
    password: str


class WikiLoginResponse(BaseModel):
    status: str
    message: Optional[str] = None


class PageInfoResponse(BaseModel):
    pageid: int
    title: str
    fullurl: str


class PageContentResponse(BaseModel):
    pageid: int
    title: str
    wikitext: str


# -------- Utils --------

def _sleep():
    if WIKI_SLEEPMS > 0:
        time.sleep(WIKI_SLEEPMS / 1000.0)


def _request_with_retry(method: str, params: Dict[str, Any], *, data: Dict[str, Any] | None = None) -> requests.Response:
    last_exc: Optional[Exception] = None
    for attempt in range(WIKI_RETRIES + 1):
        try:
            if method == "GET":
                resp = wiki_session.get(WIKI_API_URL, params=params, timeout=WIKI_TIMEOUT)
            else:
                resp = wiki_session.post(WIKI_API_URL, data=data or params, timeout=WIKI_TIMEOUT)
            resp.raise_for_status()
            return resp
        except Exception as e:
            last_exc = e
            logger.warning("Upstream error on %s (try %d/%d): %s", method, attempt + 1, WIKI_RETRIES + 1, e)
            _sleep()
    # all attempts exhausted
    raise HTTPException(status_code=502, detail=f"Upstream error: {last_exc}")


def _normalize_variants(title: str) -> List[str]:
    t = (title or "").strip()
    variants = {t}
    if " " in t:
        variants.add(t.replace(" ", "_"))
    # hyphen / dash variants
    for a, b in [("-", "–"), ("-", "—"), ("–", "-"), ("—", "-")]:
        if a in t:
            variants.add(t.replace(a, b))
    return list(variants)


def _fetch_pageinfo_batch(titles: List[str]) -> Dict[str, Dict[str, Any]]:
    if not titles:
        return {}
    out: Dict[str, Dict[str, Any]] = {}
    for i in range(0, len(titles), max(1, WIKI_BATCH)):
        chunk = titles[i:i + max(1, WIKI_BATCH)]
        params = {
            "action": "query",
            "format": "json",
            "prop": "info",
            "inprop": "url",
            "redirects": 1,
            "converttitles": 1,
            "titles": "|".join(chunk),
        }
        resp = _request_with_retry("GET", params)
        data = resp.json() or {}
        q = data.get("query", {})
        redirects = {d.get("from"): d.get("to") for d in (q.get("redirects") or [])}
        pages = q.get("pages", {}) or {}
        for pid_str, page in pages.items():
            if page.get("missing") is not None or str(pid_str) == "-1":
                continue
            try:
                pid = int(pid_str)
            except ValueError:
                pid = int(page.get("pageid", -1))
            title_out = page.get("title")
            fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
            if not title_out:
                continue
            out[title_out] = {"pageid": pid, "fullurl": fullurl}
            # also resolve the original titles of redirects
            for frm, to in redirects.items():
                if to == title_out and frm not in out:
                    out[frm] = {"pageid": pid, "fullurl": fullurl}
        _sleep()
    return out


# -------- Endpoints --------
@router.get("/health")
def health(verbose: Optional[int] = Query(default=0)) -> Dict[str, Any]:
    # simple ping
    resp = _request_with_retry("GET", {"action": "query", "meta": "siteinfo", "format": "json"})
    if verbose:
        info = resp.json().get("query", {}).get("general", {})
        return {"status": "ok", "wiki": {"sitename": info.get("sitename"), "generator": info.get("generator")}}
    return {"status": "ok"}


@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    # fetch login token
    tok = _request_with_retry("GET", {"action": "query", "meta": "tokens", "type": "login", "format": "json"})
    token = tok.json().get("query", {}).get("tokens", {}).get("logintoken")
    if not token:
        raise HTTPException(status_code=502, detail="No login token received")

    # clientlogin (with returnurl) + fallback to action=login
    try:
        cl = _request_with_retry("POST", {}, data={
            "action": "clientlogin",
            "format": "json",
            "username": data.username,
            "password": data.password,
            "logintoken": token,
            "loginreturnurl": "https://example.org/",
        })
        st = cl.json().get("clientlogin", {}).get("status")
        if st == "PASS":
            return WikiLoginResponse(status="success")
    except HTTPException:
        pass

    lg = _request_with_retry("POST", {}, data={
        "action": "login",
        "format": "json",
        "lgname": data.username,
        "lgpassword": data.password,
        "lgtoken": token,
    })
    res = lg.json().get("login", {}).get("result")
    if res == "Success":
        return WikiLoginResponse(status="success")
    raise HTTPException(status_code=401, detail=f"Login failed: {res}")


@router.get("/semantic/pages")
|
||
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
|
||
# Rohdaten aus SMW (Ask)
|
||
ask_query = f"[[Category:{category}]]|limit=50000"
|
||
r = _request_with_retry("GET", {"action": "ask", "query": ask_query, "format": "json"})
|
||
results = r.json().get("query", {}).get("results", {}) or {}
|
||
titles = list(results.keys())
|
||
|
||
# Batch-Anreicherung mit pageid/fullurl für ALLE Titel
|
||
info_map = _fetch_pageinfo_batch(titles)
|
||
|
||
enriched: Dict[str, Any] = {}
|
||
missing = 0
|
||
for title, entry in results.items():
|
||
base = entry if isinstance(entry, dict) else {}
|
||
extra = info_map.get(title, {})
|
||
if not extra:
|
||
missing += 1
|
||
enriched[title] = {
|
||
**base,
|
||
"pageid": extra.get("pageid", base.get("pageid")),
|
||
"fullurl": extra.get("fullurl", base.get("fullurl")),
|
||
}
|
||
logger.info("/semantic/pages: %d Titel, %d ohne pageid nach Enrichment", len(results), missing)
|
||
return enriched
|
||
|
||
@router.get("/parsepage", response_model=PageContentResponse)
|
||
def parse_page(pageid: int = Query(...), title: str = Query(None)):
|
||
resp = _request_with_retry("GET", {"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"})
|
||
wikitext = resp.json().get("parse", {}).get("wikitext", {}).get("*", "")
|
||
return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
|
||
|
||
@router.get("/info", response_model=PageInfoResponse)
|
||
def page_info(title: str = Query(..., description="Seitentitel"), request: Request | None = None):
|
||
# 1. Versuch: wie geliefert, mit redirects/converttitles
|
||
res = _fetch_pageinfo_batch([title])
|
||
if res.get(title):
|
||
d = res[title]
|
||
return PageInfoResponse(pageid=d["pageid"], title=title, fullurl=d.get("fullurl", ""))
|
||
|
||
# 2. Varianten probieren
|
||
for v in _normalize_variants(title):
|
||
if v == title:
|
||
continue
|
||
res2 = _fetch_pageinfo_batch([v])
|
||
if res2.get(v):
|
||
d = res2[v]
|
||
return PageInfoResponse(pageid=d["pageid"], title=v, fullurl=d.get("fullurl", ""))
|
||
|
||
# 3. sauber 404
|
||
raise HTTPException(status_code=404, detail=f"Page not found: {title}")
|
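
For completeness, a minimal wiring and configuration sketch. The include_router call mirrors the docstring, and the env var names match the config block above; the main.py module itself and the sample values are assumptions, not part of the canvas file.

# main.py (sketch): mount the router under the prefix the docstring assumes.
from fastapi import FastAPI
from wiki_router import router as wiki_router

app = FastAPI()
app.include_router(wiki_router, prefix="/import/wiki")

# .env (sketch): all keys are optional; the defaults are those in the config block above.
# WIKI_API_URL=https://karatetrainer.net/api.php
# WIKI_TIMEOUT=15
# WIKI_BATCH=50
# WIKI_RETRIES=1
# WIKI_SLEEP_MS=100   # example value: throttle MediaWiki calls by 100 ms between requests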