Trainer_LLM/llm-api/wiki_router.py
Lars a0d1b86b53
llm-api/wiki_router.py updated
New function to stabilize PageID

Yes, let's lock this down properly. I've put a hardened wiki_router.py (v1.4.0) in the canvas. It:

enriches /semantic/pages with pageid + fullurl for every title (batching + redirects + converttitles),

makes /info tolerant of title variants and 404-safe,

adds retry + light throttling to all MediaWiki calls,

keeps the same routes and parameters (no breaking changes),

logs coverage so we can see where things go sideways.
2025-08-11 13:27:05 +02:00
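
For reference, a minimal client sketch against the mounted routes (assuming the /import/wiki prefix from the module docstring, a hypothetical localhost base URL, and illustrative category/title values):

    import requests

    BASE = "http://localhost:8000/import/wiki"  # hypothetical host; prefix as mounted in main.py

    # upstream health check
    print(requests.get(f"{BASE}/health", params={"verbose": 1}).json())

    # all pages of a category, enriched with pageid/fullurl
    pages = requests.get(f"{BASE}/semantic/pages", params={"category": "Kata"}).json()

    # resolve a single title (404 instead of 500 if the page does not exist)
    info = requests.get(f"{BASE}/info", params={"title": "Mae-Geri"}).json()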

"""
wiki_router.py - v1.4.0 (stabil & nachvollziehbar)
Ziele:
- Keine API-Signaturänderungen (bestehende Routen bleiben)
- /semantic/pages reichert pageid/fullurl für ALLE Titel batchweise an (redirects=1, converttitles=1)
- /info robust: 404 statt 500, mit Titel-Varianten
- Wiederholungen & Throttling gegen MediaWiki
- Optionale Diagnose-Ausgaben und Coverage-Kennzahlen
Annahme: Der Router wird in main.py mit Prefix eingebunden:
app.include_router(wiki_router, prefix="/import/wiki")
Wenn ihr stattdessen den Prefix im Router setzen wollt, einfach in der APIRouter-Zeile unten
prefix="/import/wiki" ergänzen und in main.py OHNE prefix einbinden.
"""
from typing import Any, Dict, List, Optional

import logging
import os
import time

import requests
from dotenv import load_dotenv
from fastapi import APIRouter, HTTPException, Query, Request
from pydantic import BaseModel

load_dotenv()

logger = logging.getLogger("wiki_router")
logger.setLevel(logging.INFO)

router = APIRouter(tags=["wiki"])  # prefix comes from main.py via include_router(..., prefix="/import/wiki")
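
# Example wiring in main.py (a sketch of the assumption stated in the docstring;
# the module/import names are illustrative):
#
#   from fastapi import FastAPI
#   from wiki_router import router as wiki_router
#
#   app = FastAPI()
#   app.include_router(wiki_router, prefix="/import/wiki")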

# -------- Configuration --------
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
WIKI_TIMEOUT = float(os.getenv("WIKI_TIMEOUT", "15"))
WIKI_BATCH = int(os.getenv("WIKI_BATCH", "50"))
WIKI_RETRIES = int(os.getenv("WIKI_RETRIES", "1"))  # extra attempts on upstream errors
WIKI_SLEEPMS = int(os.getenv("WIKI_SLEEP_MS", "0"))  # throttle between requests, in milliseconds

# Single session (keeps login cookies across requests)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.4"})
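
# Illustrative .env entries (all optional; the defaults above apply otherwise):
#
#   WIKI_API_URL=https://karatetrainer.net/api.php
#   WIKI_TIMEOUT=15
#   WIKI_BATCH=50
#   WIKI_RETRIES=2
#   WIKI_SLEEP_MS=100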


# -------- Schemas --------
class WikiLoginRequest(BaseModel):
    username: str
    password: str


class WikiLoginResponse(BaseModel):
    status: str
    message: Optional[str] = None


class PageInfoResponse(BaseModel):
    pageid: int
    title: str
    fullurl: str


class PageContentResponse(BaseModel):
    pageid: int
    title: str
    wikitext: str


# -------- Utils --------
def _sleep():
    if WIKI_SLEEPMS > 0:
        time.sleep(WIKI_SLEEPMS / 1000.0)


def _request_with_retry(method: str, params: Dict[str, Any], *, data: Optional[Dict[str, Any]] = None) -> requests.Response:
    last_exc: Optional[Exception] = None
    for attempt in range(WIKI_RETRIES + 1):
        try:
            if method == "GET":
                resp = wiki_session.get(WIKI_API_URL, params=params, timeout=WIKI_TIMEOUT)
            else:
                resp = wiki_session.post(WIKI_API_URL, data=data or params, timeout=WIKI_TIMEOUT)
            resp.raise_for_status()
            return resp
        except Exception as e:
            last_exc = e
            logger.warning("Upstream error on %s (try %d/%d): %s", method, attempt + 1, WIKI_RETRIES + 1, e)
            _sleep()
    # all attempts exhausted
    raise HTTPException(status_code=502, detail=f"Upstream error: {last_exc}")


def _normalize_variants(title: str) -> List[str]:
    t = (title or "").strip()
    variants = {t}
    if " " in t:
        variants.add(t.replace(" ", "_"))
    # hyphen / dash variants (en dash and em dash)
    for a, b in [("-", "\u2013"), ("-", "\u2014"), ("\u2013", "-"), ("\u2014", "-")]:
        if a in t:
            variants.add(t.replace(a, b))
    return list(variants)
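
# Illustrative (hypothetical title): _normalize_variants("Mae-Geri") returns
# ["Mae-Geri", "Mae\u2013Geri", "Mae\u2014Geri"] in some order.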


def _fetch_pageinfo_batch(titles: List[str]) -> Dict[str, Dict[str, Any]]:
    if not titles:
        return {}
    out: Dict[str, Dict[str, Any]] = {}
    for i in range(0, len(titles), max(1, WIKI_BATCH)):
        chunk = titles[i:i + max(1, WIKI_BATCH)]
        params = {
            "action": "query",
            "format": "json",
            "prop": "info",
            "inprop": "url",
            "redirects": 1,
            "converttitles": 1,
            "titles": "|".join(chunk),
        }
        resp = _request_with_retry("GET", params)
        data = resp.json() or {}
        q = data.get("query", {})
        redirects = {d.get("from"): d.get("to") for d in (q.get("redirects") or [])}
        pages = q.get("pages", {}) or {}
        for pid_str, page in pages.items():
            if page.get("missing") is not None or str(pid_str) == "-1":
                continue
            try:
                pid = int(pid_str)
            except ValueError:
                pid = int(page.get("pageid", -1))
            title_out = page.get("title")
            fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
            if not title_out:
                continue
            out[title_out] = {"pageid": pid, "fullurl": fullurl}
            # also resolve the original titles of redirects
            for frm, to in redirects.items():
                if to == title_out and frm not in out:
                    out[frm] = {"pageid": pid, "fullurl": fullurl}
        _sleep()
    return out
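
# Expected upstream shape (sketch, per the MediaWiki action=query API with
# prop=info & inprop=url; values are illustrative):
#
#   {"query": {"redirects": [{"from": "Old title", "to": "New title"}],
#              "pages": {"123": {"pageid": 123, "title": "New title",
#                                "fullurl": "https://.../New_title"}}}}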


# -------- Endpoints --------
@router.get("/health")
def health(verbose: Optional[int] = Query(default=0)) -> Dict[str, Any]:
    # simple ping against the wiki API
    resp = _request_with_retry("GET", {"action": "query", "meta": "siteinfo", "format": "json"})
    if verbose:
        info = resp.json().get("query", {}).get("general", {})
        return {"status": "ok", "wiki": {"sitename": info.get("sitename"), "generator": info.get("generator")}}
    return {"status": "ok"}


@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    # fetch a login token
    tok = _request_with_retry("GET", {"action": "query", "meta": "tokens", "type": "login", "format": "json"})
    token = tok.json().get("query", {}).get("tokens", {}).get("logintoken")
    if not token:
        raise HTTPException(status_code=502, detail="No login token received")
    # clientlogin (with returnurl), falling back to action=login
    try:
        cl = _request_with_retry("POST", {}, data={
            "action": "clientlogin",
            "format": "json",
            "username": data.username,
            "password": data.password,
            "logintoken": token,
            "loginreturnurl": "https://example.org/",
        })
        st = cl.json().get("clientlogin", {}).get("status")
        if st == "PASS":
            return WikiLoginResponse(status="success")
    except HTTPException:
        pass
    lg = _request_with_retry("POST", {}, data={
        "action": "login",
        "format": "json",
        "lgname": data.username,
        "lgpassword": data.password,
        "lgtoken": token,
    })
    res = lg.json().get("login", {}).get("result")
    if res == "Success":
        return WikiLoginResponse(status="success")
    raise HTTPException(status_code=401, detail=f"Login failed: {res}")
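
# Note on the fallback above: on current MediaWiki versions, action=login is
# intended for bot passwords; main-account credentials may be rejected there,
# which is why clientlogin is tried first.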
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
# Rohdaten aus SMW (Ask)
ask_query = f"[[Category:{category}]]|limit=50000"
r = _request_with_retry("GET", {"action": "ask", "query": ask_query, "format": "json"})
results = r.json().get("query", {}).get("results", {}) or {}
titles = list(results.keys())
# Batch-Anreicherung mit pageid/fullurl für ALLE Titel
info_map = _fetch_pageinfo_batch(titles)
enriched: Dict[str, Any] = {}
missing = 0
for title, entry in results.items():
base = entry if isinstance(entry, dict) else {}
extra = info_map.get(title, {})
if not extra:
missing += 1
enriched[title] = {
**base,
"pageid": extra.get("pageid", base.get("pageid")),
"fullurl": extra.get("fullurl", base.get("fullurl")),
}
logger.info("/semantic/pages: %d Titel, %d ohne pageid nach Enrichment", len(results), missing)
return enriched
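
# Result shape (sketch): each SMW Ask entry augmented with pageid/fullurl;
# the title and values are illustrative:
#
#   {"Mae-Geri": {"fulltext": "Mae-Geri", "fullurl": "https://.../Mae-Geri",
#                 "pageid": 123}}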
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: str = Query(None)):
resp = _request_with_retry("GET", {"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"})
wikitext = resp.json().get("parse", {}).get("wikitext", {}).get("*", "")
return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Seitentitel"), request: Request | None = None):
# 1. Versuch: wie geliefert, mit redirects/converttitles
res = _fetch_pageinfo_batch([title])
if res.get(title):
d = res[title]
return PageInfoResponse(pageid=d["pageid"], title=title, fullurl=d.get("fullurl", ""))
# 2. Varianten probieren
for v in _normalize_variants(title):
if v == title:
continue
res2 = _fetch_pageinfo_batch([v])
if res2.get(v):
d = res2[v]
return PageInfoResponse(pageid=d["pageid"], title=v, fullurl=d.get("fullurl", ""))
# 3. sauber 404
raise HTTPException(status_code=404, detail=f"Page not found: {title}")