From a02008ec173c9c9fd077f5d4a355a054d0b65a09 Mon Sep 17 00:00:00 2001
From: Lars
Date: Mon, 11 Aug 2025 06:58:20 +0200
Subject: [PATCH] llm-api/wiki_router.py aktualisiert

---
 llm-api/wiki_router.py | 190 ++++++++++++++++++++++++-----------------
 1 file changed, 111 insertions(+), 79 deletions(-)

diff --git a/llm-api/wiki_router.py b/llm-api/wiki_router.py
index 8f224fc..8d46d15 100644
--- a/llm-api/wiki_router.py
+++ b/llm-api/wiki_router.py
@@ -7,56 +7,115 @@ Beschreibung:
  * /login: Führt clientlogin durch und speichert Session-Cookies.
  * /semantic/pages: Listet alle Übungen inkl. Unterkategorien via SMW-Ask.
  * /parsepage: Ruft Roh-Wikitext über action=parse für eine Seite ab.
- * /info: Liefert pageid und fullurl über Core-API Query.
- * /semantic/page: Liefert Metadaten einer Übung und Wikitext sowie pageid über Core-API.
-Version: 1.2.0
+ * /info: Liefert pageid und fullurl über Core-API Query.  # CHANGED: robustes 404-Handling
+ * /semantic/page: Liefert Metadaten einer Übung, pageid und Wikitext.  # CHANGED: propagiert 404
+Version: 1.3.0
 """
-from dotenv import load_dotenv
-load_dotenv()
+
+# HINWEIS: API-Signaturen/URLs bleiben UNVERÄNDERT.
+# Markierungen: # NEW / # CHANGED
+
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
 from fastapi import APIRouter, HTTPException, Query
 from pydantic import BaseModel
-from typing import Dict, Any, List
-import requests, os
+import os
+import requests
+from dotenv import load_dotenv
 
-__version__ = "1.2.0"
-router = APIRouter()
+load_dotenv()
 
-WIKI_API_URL = os.getenv("WIKI_API_URL", "https://karatetrainer.net/api.php")
+router = APIRouter(prefix="/import/wiki", tags=["wiki"])
+
+WIKI_API_URL = os.getenv("WIKI_API_URL", "https://www.karatetrainer.de/api.php")
+
+# Session für Cookies (Login)
 wiki_session = requests.Session()
+wiki_session.headers.update({"User-Agent": "local-llm-wiki-proxy/1.3"})
 
+# =====================
+# Schemas
+# =====================
 class WikiLoginRequest(BaseModel):
     username: str
     password: str
 
 class WikiLoginResponse(BaseModel):
     status: str
-    message: str | None = None
-
-class PageContentResponse(BaseModel):
-    pageid: int
-    title: str
-    wikitext: str
+    message: Optional[str] = None
 
 class PageInfoResponse(BaseModel):
     pageid: int
     title: str
     fullurl: str
 
-# Health-Check
+class PageContentResponse(BaseModel):
+    pageid: int
+    title: str
+    wikitext: str
+
+# =====================
+# Helpers
+# =====================
+# NEW: Title-Normalisierung (Unterstrich, Gedankenstrich)
+_dash_variants = ("-", "–", "—")
+
+def _normalize_titles(title: str):
+    yield title
+    # Leerzeichen → Unterstrich
+    if " " in title:
+        yield title.replace(" ", "_")
+    # Gedankenstriche ↔ Bindestrich
+    for dv in _dash_variants:
+        for dv2 in _dash_variants:
+            if dv != dv2 and dv in title:
+                yield title.replace(dv, dv2)
+
+# NEW: Robustes Pageinfo (None wenn nicht gefunden)
+def _fetch_pageinfo_by_title(title: str) -> Optional[PageInfoResponse]:
+    params = {
+        "action": "query",
+        "format": "json",
+        "prop": "info",
+        "inprop": "url",
+        "redirects": 1,  # folgt Weiterleitungen
+    }
+    for candidate in _normalize_titles(title):
+        try:
+            r = wiki_session.get(WIKI_API_URL, params={**params, "titles": candidate}, timeout=10)
+            r.raise_for_status()
+        except Exception as e:
+            # Upstream gestört → sofort 502 (weitere Kandidaten werden nicht mehr probiert)
+            raise HTTPException(status_code=502, detail=f"Info-Error: {e}")
+        pages = r.json().get("query", {}).get("pages", {}) or {}
+        if not isinstance(pages, dict) or not pages:
+            continue
+        # MediaWiki liefert dict {pageid(str): {..}}
+        pid_str, page = next(iter(pages.items()))
+        # Missing?
+        if page.get("missing") is not None or str(pid_str) == "-1":
+            continue
+        title_out = page.get("title") or candidate
+        fullurl = page.get("fullurl") or page.get("canonicalurl") or ""
+        try:
+            pid = int(pid_str)
+        except ValueError:
+            pid = int(page.get("pageid", -1))
+        return PageInfoResponse(pageid=pid, title=title_out, fullurl=fullurl)
+    return None
+
+# =====================
+# Endpoints
+# =====================
 @router.get("/health")
-def health_check():
+def health() -> Dict[str, str]:
     try:
-        resp = wiki_session.get(
-            WIKI_API_URL,
-            params={"action": "query", "meta": "siteinfo", "siprop": "general", "format": "json"},
-            timeout=5
-        )
-        resp.raise_for_status()
+        r = wiki_session.get(WIKI_API_URL, params={"action": "query", "meta": "siteinfo", "format": "json"}, timeout=5)
+        r.raise_for_status()
     except Exception as e:
         raise HTTPException(status_code=502, detail=f"Wiki nicht erreichbar: {e}")
     return {"status": "ok"}
 
-# Login Endpoint
 @router.post("/login", response_model=WikiLoginResponse)
 def login(data: WikiLoginRequest):
     # Token holen
@@ -64,7 +123,7 @@ def login(data: WikiLoginRequest):
         token_resp = wiki_session.get(
             WIKI_API_URL,
             params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
-            timeout=10
+            timeout=10,
         )
         token_resp.raise_for_status()
         token = token_resp.json().get("query", {}).get("tokens", {}).get("logintoken")
@@ -72,6 +131,7 @@
         raise HTTPException(status_code=502, detail=f"Token-Error: {e}")
     if not token:
         raise HTTPException(status_code=502, detail="Kein Login-Token erhalten")
+
     # clientlogin
     try:
         login_resp = wiki_session.post(
@@ -82,92 +142,64 @@ def login(data: WikiLoginRequest):
                 "username": data.username,
                 "password": data.password,
                 "logintoken": token,
-                "loginreturnurl": "http://localhost:8000"
             },
-            timeout=10
+            timeout=15,
         )
         login_resp.raise_for_status()
         status = login_resp.json().get("clientlogin", {}).get("status")
-    except Exception:
-        status = None
-    # fallback login
-    if status != "PASS":
-        alt = wiki_session.post(
-            WIKI_API_URL,
-            data={"action": "login", "format": "json", "lgname": data.username, "lgpassword": data.password},
-            timeout=10
-        )
-        alt.raise_for_status()
-        status = alt.json().get("login", {}).get("result")
-    if status in ("PASS", "Success"):
-        return WikiLoginResponse(status="success", message=None)
-    return WikiLoginResponse(status="failed", message="Login fehlgeschlagen")
+        if status != "PASS":
+            raise HTTPException(status_code=401, detail=f"Login fehlgeschlagen: {status}")
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Login-Error: {e}")
+    return WikiLoginResponse(status="success")
 
-# SMW-Ask: alle Übungen inkl. Unterkategorien
 @router.get("/semantic/pages")
 def semantic_pages(category: str = Query(..., description="Kategorie ohne 'Category:'")) -> Dict[str, Any]:
     smw_query = f"[[Category:{category}]]"
     ask_query = f"{smw_query}|limit=50000"
-    r = wiki_session.get(
-        WIKI_API_URL,
-        params={"action": "ask", "query": ask_query, "format": "json"},
-        timeout=30
-    )
     try:
+        r = wiki_session.get(WIKI_API_URL, params={"action": "ask", "query": ask_query, "format": "json"}, timeout=30)
         r.raise_for_status()
     except Exception as e:
         raise HTTPException(status_code=502, detail=f"SMW-Ask-Error: {e}")
     return r.json().get("query", {}).get("results", {})
 
-# Wikitext über parse-Endpoint holen (per pageid)
 @router.get("/parsepage", response_model=PageContentResponse)
 def parse_page(pageid: int = Query(...), title: str = Query(None)):
-    r = wiki_session.get(
-        WIKI_API_URL,
-        params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"},
-        timeout=20
-    )
     try:
+        r = wiki_session.get(WIKI_API_URL, params={"action": "parse", "pageid": pageid, "prop": "wikitext", "format": "json"}, timeout=20)
         r.raise_for_status()
     except Exception as e:
         raise HTTPException(status_code=502, detail=f"Parse-Error: {e}")
     wikitext = r.json().get("parse", {}).get("wikitext", {}).get("*", "")
     return PageContentResponse(pageid=pageid, title=title or "", wikitext=wikitext)
 
-# Pageinfo über Core-API (ermittelt pageid + fullurl)
+# CHANGED: robustes /info mit 404 statt 500 bei unbekannten Titeln
 @router.get("/info", response_model=PageInfoResponse)
-def page_info(title: str = Query(..., description="Name der Seite")):
-    r = wiki_session.get(
-        WIKI_API_URL,
-        params={"action": "query", "titles": title, "prop": "info", "inprop": "url", "format": "json"},
-        timeout=10
-    )
-    try:
-        r.raise_for_status()
-    except Exception as e:
-        raise HTTPException(status_code=502, detail=f"Info-Error: {e}")
-    pages = r.json().get("query", {}).get("pages", {})
-    pid_str, page = next(iter(pages.items()))
-    pid = int(pid_str)
-    fullurl = page.get("fullurl")
-    return PageInfoResponse(pageid=pid, title=page.get("title"), fullurl=fullurl)
+def page_info(title: str = Query(..., description="Seitentitel")):
+    result = _fetch_pageinfo_by_title(title)
+    if not result:
+        # sauberes 404 statt StopIteration/500
+        raise HTTPException(status_code=404, detail=f"Page not found: {title}")
+    return result
 
-# Detail-Endpoint für eine Übung: Metadaten aus Ask + Wikitext & ID via Core-API
-@router.get("/semantic/page", response_model=Dict[str, Any])
-def semantic_page_detail(category: str = Query(...), title: str = Query(...)) -> Dict[str, Any]:
-    # Metadaten aus SMW-Ask
-    entries = semantic_pages(category)
+# CHANGED: /semantic/page propagiert 404 sauber weiter
+@router.get("/semantic/page")
+def semantic_page(title: str = Query(...)) -> Dict[str, Any]:
+    # SMW-Printouts beschaffen
+    entries = semantic_pages(category="Übungen")  # falls Titel→Kategorie-Mapping anders: hier anpassen
    entry = entries.get(title)
     if not entry:
         raise HTTPException(status_code=404, detail="Übung nicht gefunden im SMW-Ask-Ergebnis.")
-    # Pageinfo via Core-API
-    info = page_info(title=title)
-    # Wikitext via parse
+    # Pageinfo & Wikitext holen
+    info = page_info(title=title)  # gibt 404 wenn unbekannt
     parsed = parse_page(pageid=info.pageid, title=title)
     return {
         "title": title,
         "pageid": info.pageid,
         "fullurl": info.fullurl,
         "printouts": entry.get("printouts", {}),
-        "wikitext": parsed.wikitext
+        "wikitext": parsed.wikitext,
     }
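
Usage sketch (not part of the patch): a minimal client that exercises the changed /info error handling. It assumes the FastAPI app mounting this router runs on http://localhost:8000 and that the two example titles are placeholders to be replaced with real page names.

# Sketch only; BASE URL and titles are assumptions, adjust to your environment.
import requests

BASE = "http://localhost:8000/import/wiki"  # router prefix from this patch

def show_info(title: str) -> None:
    # /info should now answer 404 for unknown titles instead of a 500 from StopIteration.
    resp = requests.get(f"{BASE}/info", params={"title": title}, timeout=10)
    print(title, "->", resp.status_code)
    if resp.ok:
        data = resp.json()
        print("  pageid:", data["pageid"], "fullurl:", data["fullurl"])
    else:
        print("  detail:", resp.json().get("detail"))

if __name__ == "__main__":
    show_info("Mae-Geri")             # placeholder for an existing exercise page
    show_info("Gibt-es-nicht-123")    # placeholder for a missing page: expect 404

A known title prints pageid and fullurl; an unknown title should yield HTTP 404 with the detail message, and /semantic/page passes the same 404 through instead of failing with a 500.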