""" wiki_router.py – v1.4.1 (stabil & nachvollziehbar) Änderungen ggü. v1.4.0: - /info: Optionalen Request-Parameter entfernt (FastAPI/Pydantic Typfehler behoben) - Keine API-Signaturänderungen der Routen Ziele: - /semantic/pages reichert pageid/fullurl für ALLE Titel batchweise an (redirects=1, converttitles=1) - /info robust: 404 statt 500, mit Titel-Varianten - Wiederholungen & Throttling gegen MediaWiki - Optionale Diagnose-Ausgaben und Coverage-Kennzahlen Wenn ihr stattdessen den Prefix im Router setzen wollt, einfach in der APIRouter-Zeile unten prefix="/import/wiki" ergänzen und in main.py OHNE prefix einbinden. """ from typing import Dict, Any, Optional, List from fastapi import APIRouter, HTTPException, Query from pydantic import BaseModel import os, time, logging import requests from dotenv import load_dotenv load_dotenv() logger = logging.getLogger("wiki_router") logger.setLevel(logging.INFO) router = APIRouter(prefix="/import/wiki", tags=["wiki"]) # -------- Konfiguration -------- WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php") WIKI_TIMEOUT = float(os.getenv("WIKI_TIMEOUT", "15")) WIKI_BATCH = int(os.getenv("WIKI_BATCH", "50")) WIKI_RETRIES = int(os.getenv("WIKI_RETRIES", "1")) # zusätzliche Versuche bei Upstream-Fehlern WIKI_SLEEPMS = int(os.getenv("WIKI_SLEEP_MS", "0")) # Throttle zwischen Requests # Single Session (Cookies für Login) wiki_session = requests.Session() wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.4.1"}) # -------- Schemas -------- class WikiLoginRequest(BaseModel): username: str password: str class WikiLoginResponse(BaseModel): status: str message: Optional[str] = None class PageInfoResponse(BaseModel): pageid: int title: str fullurl: str class PageContentResponse(BaseModel): pageid: int title: str wikitext: str # -------- Utils -------- def _sleep(): if WIKI_SLEEPMS > 0: time.sleep(WIKI_SLEEPMS / 1000.0) def _request_with_retry(method: str, params: Dict[str, Any], *, data: Dict[str, Any] | None = None) -> requests.Response: last_exc: Optional[Exception] = None for attempt in range(WIKI_RETRIES + 1): try: if method == "GET": resp = wiki_session.get(WIKI_API_URL, params=params, timeout=WIKI_TIMEOUT) else: resp = wiki_session.post(WIKI_API_URL, data=data or params, timeout=WIKI_TIMEOUT) resp.raise_for_status() return resp except Exception as e: last_exc = e logger.warning("Upstream error on %s (try %d/%d): %s", method, attempt + 1, WIKI_RETRIES + 1, e) _sleep() # alle Versuche erschöpft raise HTTPException(status_code=502, detail=f"Upstream error: {last_exc}") def _normalize_variants(title: str) -> List[str]: t = (title or "").strip() variants = {t} if " " in t: variants.add(t.replace(" ", "_")) # Bindestrich / Gedankenstrich Varianten for a, b in [("-", "–"), ("-", "—"), ("–", "-"), ("—", "-")]: if a in t: variants.add(t.replace(a, b)) return list(variants) def _fetch_pageinfo_batch(titles: List[str]) -> Dict[str, Dict[str, Any]]: if not titles: return {} out: Dict[str, Dict[str, Any]] = {} for i in range(0, len(titles), max(1, WIKI_BATCH)): chunk = titles[i:i + max(1, WIKI_BATCH)] params = { "action": "query", "format": "json", "prop": "info", "inprop": "url", "redirects": 1, "converttitles": 1, "titles": "|".join(chunk), } resp = _request_with_retry("GET", params) data = resp.json() or {} q = data.get("query", {}) redirects = {d.get("from"): d.get("to") for d in (q.get("redirects") or [])} pages = q.get("pages", {}) or {} for pid_str, page in pages.items(): if page.get("missing") is not None or str(pid_str) == "-1": continue try: pid = int(pid_str) except ValueError: pid = int(page.get("pageid", -1)) title_out = page.get("title") fullurl = page.get("fullurl") or page.get("canonicalurl") or "" if not title_out: continue out[title_out] = {"pageid": pid, "fullurl": fullurl} # auch Originaltitel der Redirects auflösen for frm, to in redirects.items(): if to == title_out and frm not in out: out[frm] = {"pageid": pid, "fullurl": fullurl} _sleep() return out # -------- Endpoints -------- @router.get("/health") def health(verbose: Optional[int] = Query(default=0)) -> Dict[str, Any]: # einfacher Ping resp = _request_with_retry("GET", {"action": "query", "meta": "siteinfo", "format": "json"}) if verbose: info = resp.json().get("query", {}).get("general", {}) return {"status": "ok", "wiki": {"sitename": info.get("sitename"), "generator": info.get("generator")}} return {"status": "ok"} @router.post("/login", response_model=WikiLoginResponse) def login(data: WikiLoginRequest): # Token holen tok = _request_with_retry("GET", {"action": "query", "meta": "tokens", "type": "login", "format": "json"}) token = tok.json().get("query", {}).get("tokens", {}).get("logintoken") if not token: raise HTTPException(status_code=502, detail="Kein Login-Token erhalten") # clientlogin (mit returnurl) + Fallback action=login try: cl = _request_with_retry("POST", {}, data={ "action": "clientlogin", "format": "json", "username": data.username, "password": data.password, "logintoken": token, "loginreturnurl": "https://example.org/", }) st = cl.json().get("clientlogin", {}).get("status") if st == "PASS": return WikiLoginResponse(status="success") except HTTPException: pass lg = _request_with_retry("POST", {}, data={ "action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password, "lgtoken": token, }) res = lg.json().get("login", {}).get("result") if res == "Success": return WikiLoginResponse(status="success") raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {res}") @router.get("/semantic/pages") def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]: # Rohdaten aus SMW (Ask) ask_query = f"[[Category:{category}]]|limit=50000" r = _request_with_retry("GET", {"action": "ask", "query": ask_query, "format": "json"}) results = r.json().get("query", {}).get("results", {}) or {} titles = list(results.keys()) # Batch-Anreicherung mit pageid/fullurl für ALLE Titel info_map = _fetch_pageinfo_batch(titles) enriched: Dict[str, Any] = {} missing = 0 for title, entry in results.items(): base = entry if isinstance(entry, dict) else {} extra = info_map.get(title, {}) if not extra: missing += 1 enriched[title] = { **base, "pageid": extra.get("pageid", base.get("pageid")), "fullurl": extra.get("fullurl", base.get("fullurl")), } logger.info("/semantic/pages: %d Titel, %d ohne pageid nach Enrichment", len(results), missing) return enriched @router.get("/parsepage", response_model=PageContentResponse) def parse_page(pageid: int = Query(...), title: str = Query(None)): resp = _request_with_retry("GET", {"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}) wikitext = resp.json().get("parse", {}).get("wikitext", {}).get("*", "") return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext) @router.get("/info", response_model=PageInfoResponse) def page_info(title: str = Query(..., description="Seitentitel")): # 1. Versuch: wie geliefert, mit redirects/converttitles res = _fetch_pageinfo_batch([title]) if res.get(title): d = res[title] return PageInfoResponse(pageid=d["pageid"], title=title, fullurl=d.get("fullurl", "")) # 2. Varianten probieren for v in _normalize_variants(title): if v == title: continue res2 = _fetch_pageinfo_batch([v]) if res2.get(v): d = res2[v] return PageInfoResponse(pageid=d["pageid"], title=v, fullurl=d.get("fullurl", "")) # 3. sauber 404 raise HTTPException(status_code=404, detail=f"Page not found: {title}")