From 2567d8c7869113fded1be60e8e77b579cf63c0ba Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 11 Aug 2025 13:50:21 +0200 Subject: [PATCH] scripts/wiki_importer.py aktualisiert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vorherige Version war lauffähig, brachte aber unsinnige Updates (Vergleich schlug fehl) Diese Version (v2.3.3) mit stabilem Fingerprint, Backcompat beim Vergleich und besserer Diagnostik --- scripts/wiki_importer.py | 172 ++++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 56 deletions(-) diff --git a/scripts/wiki_importer.py b/scripts/wiki_importer.py index 1d56b40..1e5eb7e 100644 --- a/scripts/wiki_importer.py +++ b/scripts/wiki_importer.py @@ -2,34 +2,24 @@ # -*- coding: utf-8 -*- """ Module: wiki_importer.py -Status: stable -Kurzbeschreibung: - - Import von einzeln Übungen funktioniert - - Import von allen Übungen funktioniert - - Änderungsvergleich gegen Änderungen im Wiki funktioniert - - Neue Übungen gegen die in qdrant gespeicherten werden identifiziert und angelegt - Beschreibung: - Importiert Übungen aus dem MediaWiki via FastAPI wiki_router -- Führt vor dem Import einen Login gegen /import/wiki/login durch (falls nicht via --skip-login deaktiviert) -- Holt Liste aller Übungs-Titel (SMW-Ask) via `/semantic/pages` -- Für jede Übung: - * Fetch pageinfo (pageid, fullurl) via `/info` (nur wenn nicht bereits geliefert) - * Parse Wikitext (Templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment) via `/parsepage` - * Baut Payload entsprechend Exercise-Datenmodell - * Idempotentes Upsert: external_id="mw:{pageid}", Fingerprint (sha256) über Kernfelder, - Lookup via `/exercise/by-external-id`, dann create/update/skip inkl. Zählern. -- Unterstützt Single-Import via `--title` (oder ENV `WIKI_EXERCISE_TITLE`) und Full-Import via `--all` -- Optional: Credentials via CLI (--username/--password) oder `.env` (WIKI_BOT_USER / WIKI_BOT_PASSWORD) -- Smoke-Test (`--smoke-test`): 3 Läufe nacheinander (create → skip → update), ohne API-Signaturen zu ändern. - -Version: 2.3.2 -Änderung: Regressionsfix in `process_all()` statt `isinstance(entry, dict)` wird nun generisch über - `getattr(entry, "get", None)` auf `pageid/fullurl` zugegriffen (unterstützt Mappingähnliche Typen - wie pydantic/OrderedDict/Mapping). So werden vorhandene pageids aus `/semantic/pages` wieder zuverlässig - genutzt und unnötige `/info`Aufrufe vermieden. - +- Login gegen /import/wiki/login (abschaltbar via --skip-login) +- Titel-Liste via /semantic/pages, Parsing via /parsepage, Info via /info (nur wenn nötig) +- Idempotentes Upsert: external_id="mw:{pageid}", Fingerprint (sha256) über Kernfelder +- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern +- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update) +v2.3.3 – Änderungen ggü. 2.3.2: +- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung): + • Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich + • summary/execution/notes: Whitespace kollabieren + • keywords: dedupliziert (case-insensitiv) & sortiert + • duration_minutes: sicher als int +- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload + (verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint) +- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update +- Kein API-/CLI-Bruch """ import os @@ -48,7 +38,7 @@ import time load_dotenv() # .env laden, falls vorhanden API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") # FastAPI-Wiki-Proxy -EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint (Basis, ohne Slash am Ende) +EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen") DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen") REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60")) @@ -62,10 +52,6 @@ def wiki_health() -> None: def wiki_login(username: str, password: str) -> None: - """ - Führt einen Login gegen den wiki_router durch. - Erwartet: {"status":"success"} bei Erfolg. - """ payload = {"username": username, "password": password} r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30) try: @@ -73,7 +59,6 @@ def wiki_login(username: str, password: str) -> None: except Exception: print(f"[Login] HTTP {r.status_code}: {r.text}") r.raise_for_status() - status = (data or {}).get("status") if status != "success": msg = (data or {}).get("message", "Login fehlgeschlagen") @@ -130,7 +115,7 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]: return raw -# ---- Fingerprint-Unterstützung ---- +# ---- Fingerprint-Unterstützung (stabil) ---- def _normalize(v: Any) -> str: if v is None: @@ -142,16 +127,41 @@ def _normalize(v: Any) -> str: return str(v).strip() +def _norm_text(s: str) -> str: + if s is None: + return "" + s = str(s).replace("\u00a0", " ") # NBSP → Space + s = s.strip() + s = " ".join(s.split()) # Collapse whitespace + return s + + +def _canon_title(t: str) -> str: + t = (t or "").strip().replace("_", " ") + # Gedankenstriche vereinheitlichen + return t.replace("–", "-").replace("—", "-") + + def compute_fingerprint(payload: Dict[str, Any]) -> str: - """sha256 über Kernfelder: title, summary, execution, notes, duration_minutes, capabilities, keywords""" + # keywords stabilisieren: trim, dedupe (case-insensitiv), sort + kws = payload.get("keywords") or [] + kws = sorted({(k or "").strip() for k in kws if (k or "").strip()}, key=str.casefold) + + # dauer als int + dur = payload.get("duration_minutes") or 0 + try: + dur = int(round(float(dur))) + except Exception: + dur = 0 + fields = [ - payload.get("title", ""), - payload.get("summary", ""), - payload.get("execution", ""), - payload.get("notes", ""), - payload.get("duration_minutes", 0), + _canon_title(payload.get("title", "")), + _norm_text(payload.get("summary", "")), + _norm_text(payload.get("execution", "")), + _norm_text(payload.get("notes", "")), + dur, payload.get("capabilities", {}), - payload.get("keywords", []), + kws, ] base = "|".join(_normalize(f) for f in fields) return hashlib.sha256(base.encode("utf-8")).hexdigest() @@ -171,7 +181,6 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b pass # Defaults/Fallbacks - duration = 0.0 try: duration = float(raw.get("Dauer", 0) or 0) except Exception: @@ -191,7 +200,6 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b notes = raw.get("Hinweise", "") or "" if mutate: - # Für Smoke-Test (3. Lauf) geringfügige Änderung erzeugen notes = (notes + " [auto-update]").strip() payload: Dict[str, Any] = { @@ -236,26 +244,45 @@ def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], O return {"error": str(e)}, None +def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]: + return { + "title": p.get("title"), + "summary": p.get("summary"), + "execution": p.get("execution"), + "notes": p.get("notes"), + "duration_minutes": p.get("duration_minutes"), + "capabilities": p.get("capabilities") or {}, + "keywords": p.get("keywords") or [], + } + + def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: title = payload.get("title", "") ext_id = payload.get("external_id") fp_new = payload.get("fingerprint") + found, status = lookup_by_external_id(ext_id) action = "create" - if status == 404 or found is None: - action = "create" - elif isinstance(found, dict): - fp_old = found.get("fingerprint") or found.get("payload", {}).get("fingerprint") - if fp_old == fp_new: - action = "skip" + reason = "not found (lookup 404)" + found_payload = {} + + if not (status == 404 or found is None): + if isinstance(found, dict): + found_payload = found.get("payload", found) + fp_old_stored = found.get("fingerprint") or found_payload.get("fingerprint") + fp_old_recalc = compute_fingerprint(_payload_subset_for_fp(found_payload)) + if fp_new == fp_old_stored or fp_new == fp_old_recalc: + action, reason = "skip", "fingerprint unchanged" + else: + action, reason = "update", "fingerprint changed" else: - action = "update" - else: - action = "create" + action, reason = "create", "unexpected lookup type" if dry_run: - print(f"[DryRun] {action.upper():6} '{title}' ({ext_id})") + print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}") + if action == "update": + _print_diff(found_payload, payload) return action if action == "create": @@ -268,7 +295,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: pass else: resp.raise_for_status() - print(f"[Create] '{title}' -> OK") + print(f"[Create] '{title}' – {reason} -> OK") elif action == "update": payload2 = dict(payload) payload2["imported_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) @@ -281,12 +308,46 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str: pass else: resp.raise_for_status() - print(f"[Update] '{title}' -> OK") + print(f"[Update] '{title}' – {reason} -> OK") + _print_diff(found_payload, payload) else: - print(f"[Skip] '{title}' (unverändert)") + print(f"[Skip] '{title}' – {reason}") return action +def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None: + """Kleines Feld-Diff für die Hash-Felder (Diagnose).""" + keys = ["title","summary","execution","notes","duration_minutes","capabilities","keywords"] + b = {k: before.get(k) for k in keys} + a = {k: after.get(k) for k in keys} + # für bessere Lesbarkeit normalisieren wir die Textfelder + b_norm = { + "title": _canon_title(b.get("title")), + "summary": _norm_text(b.get("summary")), + "execution": _norm_text(b.get("execution")), + "notes": _norm_text(b.get("notes")), + "duration_minutes": b.get("duration_minutes"), + "capabilities": b.get("capabilities"), + "keywords": sorted({(k or "").strip() for k in (b.get("keywords") or [])}, key=str.casefold), + } + a_norm = { + "title": _canon_title(a.get("title")), + "summary": _norm_text(a.get("summary")), + "execution": _norm_text(a.get("execution")), + "notes": _norm_text(a.get("notes")), + "duration_minutes": a.get("duration_minutes"), + "capabilities": a.get("capabilities"), + "keywords": sorted({(k or "").strip() for k in (a.get("keywords") or [])}, key=str.casefold), + } + diff = {k: (b_norm[k], a_norm[k]) for k in keys if b_norm.get(k) != a_norm.get(k)} + if diff: + print("[Diff] changes:", json.dumps(diff, ensure_ascii=False)) + else: + print("[Diff] (none in hash fields)") + + +# ----- Orchestrierung ----- + def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str: info = fetch_page_info(title) pid = info.get("pageid") @@ -307,7 +368,6 @@ def process_all(category: str, *, dry_run: bool = False) -> Dict[str, int]: for title, entry in pages.items(): try: - # Regressionsfix: generischer Zugriff auf Mapping‑ähnliche Einträge getter = getattr(entry, "get", None) if callable(getter): pid = getter("pageid") @@ -374,7 +434,7 @@ def main() -> None: parser.add_argument("--username", type=str, default=os.getenv("WIKI_BOT_USER"), help="Wiki-Login Benutzer (überschreibt .env)") parser.add_argument("--password", type=str, default=os.getenv("WIKI_BOT_PASSWORD"), help="Wiki-Login Passwort (überschreibt .env)") parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)") - parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) loggen") + parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen") parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title") args = parser.parse_args()