Trainer_LLM/llm-api/wiki_router.py
Lars d8d12e0b6b
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
llm-api/wiki_router.py aktualisiert
2025-08-11 13:36:02 +02:00

236 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
wiki_router.py v1.4.1 (stable & traceable)
Changes compared to v1.4.0:
- /info: removed the optional request parameter (fixes a FastAPI/Pydantic type error)
- No API signature changes to the routes
Goals:
- /semantic/pages enriches ALL titles with pageid/fullurl in batches (redirects=1, converttitles=1)
- /info is robust: 404 instead of 500, with title variants
- Retries & throttling towards MediaWiki
- Optional diagnostic output and coverage metrics
Note: the APIRouter line below already sets prefix="/import/wiki", so this router
must be included in main.py WITHOUT an additional prefix.
"""
from typing import Dict, Any, Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
import os, time, logging
import requests
from dotenv import load_dotenv
# Load variables from a .env file before the configuration below reads os.getenv.
load_dotenv()
logger = logging.getLogger("wiki_router")
logger.setLevel(logging.INFO)
# Every route defined in this module is served under /import/wiki.
router = APIRouter(prefix="/import/wiki", tags=["wiki"])
# -------- Configuration --------
# MediaWiki api.php endpoint that all requests of this module are sent to.
WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
# Passed as `timeout=` to requests, i.e. seconds.
WIKI_TIMEOUT = float(os.getenv("WIKI_TIMEOUT", "15"))
# Number of titles sent per page-info request.
WIKI_BATCH = int(os.getenv("WIKI_BATCH", "50"))
WIKI_RETRIES = int(os.getenv("WIKI_RETRIES", "1")) # additional attempts on upstream errors
WIKI_SLEEPMS = int(os.getenv("WIKI_SLEEP_MS", "0")) # throttle between requests, in milliseconds
# Single session (keeps the cookies obtained by the login)
wiki_session = requests.Session()
wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.4.1"})
# -------- Schemas --------
class WikiLoginRequest(BaseModel):
    """Request body of the /login endpoint: the wiki account credentials."""

    username: str
    password: str
class WikiLoginResponse(BaseModel):
    """Response body of the /login endpoint."""

    # The login endpoint sets this to "success" when the wiki accepted the login.
    status: str
    # Optional free-text detail; not populated by the endpoint code in this file.
    message: Optional[str] = None
class PageInfoResponse(BaseModel):
    """Response body of the /info endpoint: id, title and URL of a wiki page."""

    pageid: int
    title: str
    fullurl: str
class PageContentResponse(BaseModel):
    """Response body of the /parsepage endpoint: a page and its raw wikitext."""

    pageid: int
    title: str
    wikitext: str
# -------- Utils --------
def _sleep():
    """Pause for WIKI_SLEEPMS milliseconds; do nothing when throttling is off."""
    if WIKI_SLEEPMS <= 0:
        return
    time.sleep(WIKI_SLEEPMS / 1000.0)
def _request_with_retry(method: str, params: Dict[str, Any], *, data: Dict[str, Any] | None = None) -> requests.Response:
    """Send one request to the MediaWiki API, retrying on upstream failures.

    Args:
        method: "GET" sends ``params`` as the query string; any other value
            issues a POST.
        params: Query parameters for GET. For POST they are used as the form
            body when ``data`` is empty or ``None``.
        data: Form body for POST requests.

    Returns:
        The first response whose status passes ``raise_for_status()``.

    Raises:
        HTTPException: 502 once every attempt has failed.
    """
    # A negative WIKI_RETRIES must still yield one real attempt; otherwise the
    # loop is skipped and a 502 is raised without ever contacting the wiki.
    attempts = max(0, WIKI_RETRIES) + 1
    last_exc: Optional[Exception] = None
    for attempt in range(1, attempts + 1):
        try:
            if method == "GET":
                resp = wiki_session.get(WIKI_API_URL, params=params, timeout=WIKI_TIMEOUT)
            else:
                resp = wiki_session.post(WIKI_API_URL, data=data or params, timeout=WIKI_TIMEOUT)
            resp.raise_for_status()
            return resp
        except Exception as e:
            # Deliberately broad: this is the upstream boundary, every failure
            # is logged and finally reported to the client as a 502.
            last_exc = e
            logger.warning("Upstream error on %s (try %d/%d): %s", method, attempt, attempts, e)
            # Throttle only when another attempt follows; sleeping after the
            # final failure would just delay the error response.
            if attempt < attempts:
                _sleep()
    # all attempts exhausted
    raise HTTPException(status_code=502, detail=f"Upstream error: {last_exc}") from last_exc
def _normalize_variants(title: str) -> List[str]:
t = (title or "").strip()
variants = {t}
if " " in t:
variants.add(t.replace(" ", "_"))
# Bindestrich / Gedankenstrich Varianten
for a, b in [("-", ""), ("-", ""), ("", "-"), ("", "-")]:
if a in t:
variants.add(t.replace(a, b))
return list(variants)
def _fetch_pageinfo_batch(titles: List[str]) -> Dict[str, Dict[str, Any]]:
if not titles:
return {}
out: Dict[str, Dict[str, Any]] = {}
for i in range(0, len(titles), max(1, WIKI_BATCH)):
chunk = titles[i:i + max(1, WIKI_BATCH)]
params = {
"action": "query",
"format": "json",
"prop": "info",
"inprop": "url",
"redirects": 1,
"converttitles": 1,
"titles": "|".join(chunk),
}
resp = _request_with_retry("GET", params)
data = resp.json() or {}
q = data.get("query", {})
redirects = {d.get("from"): d.get("to") for d in (q.get("redirects") or [])}
pages = q.get("pages", {}) or {}
for pid_str, page in pages.items():
if page.get("missing") is not None or str(pid_str) == "-1":
continue
try:
pid = int(pid_str)
except ValueError:
pid = int(page.get("pageid", -1))
title_out = page.get("title")
fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
if not title_out:
continue
out[title_out] = {"pageid": pid, "fullurl": fullurl}
# auch Originaltitel der Redirects auflösen
for frm, to in redirects.items():
if to == title_out and frm not in out:
out[frm] = {"pageid": pid, "fullurl": fullurl}
_sleep()
return out
# -------- Endpoints --------
@router.get("/health")
def health(verbose: Optional[int] = Query(default=0)) -> Dict[str, Any]:
    """Ping the wiki with a siteinfo query and report the result.

    With a truthy ``verbose`` the site name and generator reported by the wiki
    are included; otherwise only the status is returned. An unreachable wiki
    surfaces as the 502 raised by the request helper.
    """
    ping = _request_with_retry("GET", {"action": "query", "meta": "siteinfo", "format": "json"})
    if not verbose:
        return {"status": "ok"}
    general = ping.json().get("query", {}).get("general", {})
    wiki = {"sitename": general.get("sitename"), "generator": general.get("generator")}
    return {"status": "ok", "wiki": wiki}
@router.post("/login", response_model=WikiLoginResponse)
def login(data: WikiLoginRequest):
    """Log the module's shared wiki session in to MediaWiki.

    Fetches a login token, tries ``action=clientlogin`` first and falls back
    to ``action=login`` when clientlogin does not report ``PASS`` or its
    request fails upstream.

    Args:
        data: Username and password of the wiki account.

    Returns:
        ``WikiLoginResponse(status="success")`` when either login variant
        succeeds.

    Raises:
        HTTPException: 502 when no login token is returned or the fallback
            request fails upstream; 401 when the wiki rejects the login.
    """
    # fetch the login token
    tok = _request_with_retry("GET", {"action": "query", "meta": "tokens", "type": "login", "format": "json"})
    token = tok.json().get("query", {}).get("tokens", {}).get("logintoken")
    if not token:
        raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
    # clientlogin (with return URL) + fallback to action=login
    try:
        cl = _request_with_retry("POST", {}, data={
            "action": "clientlogin",
            "format": "json",
            "username": data.username,
            "password": data.password,
            "logintoken": token,
            "loginreturnurl": "https://example.org/",
        })
        st = cl.json().get("clientlogin", {}).get("status")
        if st == "PASS":
            return WikiLoginResponse(status="success")
        # Leave a trace of why the fallback is used instead of dropping the
        # clientlogin outcome silently (credentials are never logged).
        logger.info("clientlogin returned status %s; falling back to action=login", st)
    except HTTPException as exc:
        logger.warning("clientlogin request failed (%s); falling back to action=login", exc.detail)
    lg = _request_with_retry("POST", {}, data={
        "action": "login",
        "format": "json",
        "lgname": data.username,
        "lgpassword": data.password,
        "lgtoken": token,
    })
    res = lg.json().get("login", {}).get("result")
    if res == "Success":
        return WikiLoginResponse(status="success")
    raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {res}")
@router.get("/semantic/pages")
def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
    """List the pages of a category via an ``action=ask`` query, enriched with pageid/fullurl.

    Every title returned by the ask query is looked up in one batched
    page-info call. Values found there replace the ``pageid``/``fullurl`` of
    the raw entry; otherwise the raw entry's own values (or ``None``) are kept.
    The number of titles without a page-info hit is logged.
    """
    # raw data from the ask query
    ask = _request_with_retry(
        "GET",
        {"action": "ask", "query": f"[[Category:{category}]]|limit=50000", "format": "json"},
    )
    raw = ask.json().get("query", {}).get("results", {}) or {}

    # batch enrichment with pageid/fullurl for ALL titles
    lookup = _fetch_pageinfo_batch(list(raw.keys()))

    payload: Dict[str, Any] = {}
    unresolved = 0
    for name, record in raw.items():
        fields = record if isinstance(record, dict) else {}
        info = lookup.get(name, {})
        if not info:
            unresolved += 1
        merged = dict(fields)
        merged["pageid"] = info.get("pageid", fields.get("pageid"))
        merged["fullurl"] = info.get("fullurl", fields.get("fullurl"))
        payload[name] = merged

    logger.info("/semantic/pages: %d Titel, %d ohne pageid nach Enrichment", len(raw), unresolved)
    return payload
@router.get("/parsepage", response_model=PageContentResponse)
def parse_page(pageid: int = Query(...), title: Optional[str] = Query(None)):
    """Return the raw wikitext of a page identified by its page id.

    Args:
        pageid: MediaWiki page id to fetch.
        title: Optional title echoed back in the response; when omitted, the
            title from the wiki's parse result is used if present.

    Returns:
        ``PageContentResponse`` with the page id, title and wikitext.

    Raises:
        HTTPException: 404 when the wiki reports that the page does not
            exist, 502 for any other API error or an upstream failure.
    """
    resp = _request_with_retry("GET", {"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"})
    payload = resp.json() or {}

    # The wiki can answer HTTP 200 with an "error" object in the body; report
    # that instead of returning an empty page as if it were real content.
    error = payload.get("error")
    if error:
        code = error.get("code") if isinstance(error, dict) else None
        info = error.get("info") if isinstance(error, dict) else error
        status = 404 if code in ("nosuchpageid", "missingtitle") else 502
        raise HTTPException(status_code=status, detail=f"Parse failed for pageid {pageid}: {info or code}")

    parsed = payload.get("parse") or {}
    wikitext = (parsed.get("wikitext") or {}).get("*", "")
    return PageContentResponse(pageid=pageid, title=title or parsed.get("title") or "", wikitext=wikitext)
@router.get("/info", response_model=PageInfoResponse)
def page_info(title: str = Query(..., description="Seitentitel")):
    """Resolve a page title to its page id and full URL.

    The title is looked up as given first. If that fails, all spelling
    variants from ``_normalize_variants`` are resolved in a single batched
    lookup and the first variant that matches is returned.

    Args:
        title: Page title to look up.

    Returns:
        ``PageInfoResponse`` whose ``title`` is the spelling that matched.

    Raises:
        HTTPException: 404 when neither the title nor any variant resolves to
            a page; 502 propagated from the upstream request helper.
    """
    # 1st attempt: as supplied, with redirects/converttitles
    direct = _fetch_pageinfo_batch([title]).get(title)
    if direct:
        return PageInfoResponse(pageid=direct["pageid"], title=title, fullurl=direct.get("fullurl", ""))

    # 2nd attempt: all variants in one batch instead of one request per variant
    variants = [v for v in _normalize_variants(title) if v != title]
    found = _fetch_pageinfo_batch(variants)
    for v in variants:
        hit = found.get(v)
        if hit:
            return PageInfoResponse(pageid=hit["pageid"], title=v, fullurl=hit.get("fullurl", ""))

    # 3rd: clean 404
    raise HTTPException(status_code=404, detail=f"Page not found: {title}")