scripts/wiki_importer.py aktualisiert
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
All checks were successful
Deploy Trainer_LLM to llm-node / deploy (push) Successful in 2s
Die vorherige Version war lauffähig, brachte aber unsinnige Updates (der Vergleich schlug fehl). Diese Version (v2.3.3) bringt einen stabilen Fingerprint, Backcompat beim Vergleich und bessere Diagnostik.
This commit is contained in:
parent
d8d12e0b6b
commit
2567d8c786
|
|
@ -2,34 +2,24 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Module: wiki_importer.py
|
||||
Status: stable
|
||||
Kurzbeschreibung:
|
||||
- Import von einzelnen Übungen funktioniert
|
||||
- Import von allen Übungen funktioniert
|
||||
- Änderungsvergleich gegen Änderungen im Wiki funktioniert
|
||||
- Neue Übungen gegen die in qdrant gespeicherten werden identifiziert und angelegt
|
||||
|
||||
Beschreibung:
|
||||
- Importiert Übungen aus dem MediaWiki via FastAPI wiki_router
|
||||
- Führt vor dem Import einen Login gegen /import/wiki/login durch (falls nicht via --skip-login deaktiviert)
|
||||
- Holt Liste aller Übungs-Titel (SMW-Ask) via `/semantic/pages`
|
||||
- Für jede Übung:
|
||||
* Fetch pageinfo (pageid, fullurl) via `/info` (nur wenn nicht bereits geliefert)
|
||||
* Parse Wikitext (Templates: ÜbungInfoBox, Übungsbeschreibung, SkillDevelopment) via `/parsepage`
|
||||
* Baut Payload entsprechend Exercise-Datenmodell
|
||||
* Idempotentes Upsert: external_id="mw:{pageid}", Fingerprint (sha256) über Kernfelder,
|
||||
Lookup via `/exercise/by-external-id`, dann create/update/skip inkl. Zählern.
|
||||
- Unterstützt Single-Import via `--title` (oder ENV `WIKI_EXERCISE_TITLE`) und Full-Import via `--all`
|
||||
- Optional: Credentials via CLI (--username/--password) oder `.env` (WIKI_BOT_USER / WIKI_BOT_PASSWORD)
|
||||
- Smoke-Test (`--smoke-test`): 3 Läufe nacheinander (create → skip → update), ohne API-Signaturen zu ändern.
|
||||
|
||||
Version: 2.3.2
|
||||
Änderung: Regressionsfix in `process_all()`: statt `isinstance(entry, dict)` wird nun generisch über
|
||||
`getattr(entry, "get", None)` auf `pageid/fullurl` zugegriffen (unterstützt Mapping-ähnliche Typen
|
||||
wie pydantic/OrderedDict/Mapping). So werden vorhandene pageids aus `/semantic/pages` wieder zuverlässig
|
||||
genutzt und unnötige `/info`-Aufrufe vermieden.
|
||||
|
||||
- Login gegen /import/wiki/login (abschaltbar via --skip-login)
|
||||
- Titel-Liste via /semantic/pages, Parsing via /parsepage, Info via /info (nur wenn nötig)
|
||||
- Idempotentes Upsert: external_id="mw:{pageid}", Fingerprint (sha256) über Kernfelder
|
||||
- Lookup via /exercise/by-external-id, dann create/update/skip inkl. Zählern
|
||||
- Smoke-Test (--smoke-test): 3 Läufe (create → skip → update)
|
||||
|
||||
v2.3.3 – Änderungen ggü. 2.3.2:
|
||||
- Stabilerer Fingerprint (Kanonisierung & Whitespace-Normalisierung):
|
||||
• Titel: _ zu Leerzeichen, Gedankenstriche → Bindestrich
|
||||
• summary/execution/notes: Whitespace kollabieren
|
||||
• keywords: dedupliziert (case-insensitiv) & sortiert
|
||||
• duration_minutes: sicher als int
|
||||
- Backcompat beim Update-Entscheid: zusätzlich Neu-Berechnung des Fingerprints aus dem gefundenen Payload
|
||||
(verhindert False-Positives bei Altbeständen ohne/mit abweichendem Fingerprint)
|
||||
- Diagnostik: Gründe im Log (not found / unchanged / changed) und Feld-Diff bei Update
|
||||
- Kein API-/CLI-Bruch
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -48,7 +38,7 @@ import time
|
|||
load_dotenv() # .env laden, falls vorhanden
|
||||
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000/import/wiki") # FastAPI-Wiki-Proxy
|
||||
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint (Basis, ohne Slash am Ende)
|
||||
EXERCISE_API = os.getenv("EXERCISE_API_URL", "http://localhost:8000/exercise") # Exercise-Endpoint
|
||||
DEFAULT_CAT = os.getenv("WIKI_CATEGORY", "Übungen")
|
||||
DEFAULT_TITLE = os.getenv("WIKI_EXERCISE_TITLE", "Affenklatschen")
|
||||
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "60"))
|
||||
|
|
@ -62,10 +52,6 @@ def wiki_health() -> None:
|
|||
|
||||
|
||||
def wiki_login(username: str, password: str) -> None:
|
||||
"""
|
||||
Führt einen Login gegen den wiki_router durch.
|
||||
Erwartet: {"status":"success"} bei Erfolg.
|
||||
"""
|
||||
payload = {"username": username, "password": password}
|
||||
r = requests.post(f"{API_BASE_URL}/login", json=payload, timeout=30)
|
||||
try:
|
||||
|
|
@ -73,7 +59,6 @@ def wiki_login(username: str, password: str) -> None:
|
|||
except Exception:
|
||||
print(f"[Login] HTTP {r.status_code}: {r.text}")
|
||||
r.raise_for_status()
|
||||
|
||||
status = (data or {}).get("status")
|
||||
if status != "success":
|
||||
msg = (data or {}).get("message", "Login fehlgeschlagen")
|
||||
|
|
@ -130,7 +115,7 @@ def parse_exercise(title: str, pageid: int) -> Dict[str, Any]:
|
|||
return raw
|
||||
|
||||
|
||||
# ---- Fingerprint-Unterstützung ----
|
||||
# ---- Fingerprint-Unterstützung (stabil) ----
|
||||
|
||||
def _normalize(v: Any) -> str:
|
||||
if v is None:
|
||||
|
|
@ -142,16 +127,41 @@ def _normalize(v: Any) -> str:
|
|||
return str(v).strip()
|
||||
|
||||
|
||||
def _norm_text(s: str) -> str:
|
||||
if s is None:
|
||||
return ""
|
||||
s = str(s).replace("\u00a0", " ") # NBSP → Space
|
||||
s = s.strip()
|
||||
s = " ".join(s.split()) # Collapse whitespace
|
||||
return s
|
||||
|
||||
|
||||
def _canon_title(t: str) -> str:
|
||||
t = (t or "").strip().replace("_", " ")
|
||||
# Gedankenstriche vereinheitlichen
|
||||
return t.replace("–", "-").replace("—", "-")
|
||||
|
||||
|
||||
def compute_fingerprint(payload: Dict[str, Any]) -> str:
    """Return the sha256 hex digest over the canonicalised core fields.

    Hashed fields, in this order: title, summary, execution, notes,
    duration_minutes, capabilities, keywords. Each field is rendered with
    ``_normalize`` and the results are joined with "|" before hashing.

    Canonicalisation applied before hashing:
      * title via ``_canon_title``
      * summary / execution / notes via ``_norm_text``
      * duration_minutes coerced to ``int`` (0 when missing or not numeric)
      * keywords trimmed, empties dropped, exact duplicates removed, sorted
    """
    # Keywords: the raw string is the secondary sort key. Sorting a set by
    # casefold alone leaves entries that differ only in case in set-iteration
    # order, which is not stable between interpreter runs.
    trimmed = {(k or "").strip() for k in payload.get("keywords") or []}
    kws = sorted((k for k in trimmed if k), key=lambda k: (k.casefold(), k))

    # Duration as a whole number of minutes; anything unusable counts as 0.
    try:
        dur = int(round(float(payload.get("duration_minutes") or 0)))
    except (TypeError, ValueError, OverflowError):
        dur = 0

    fields = [
        _canon_title(payload.get("title", "")),
        _norm_text(payload.get("summary", "")),
        _norm_text(payload.get("execution", "")),
        _norm_text(payload.get("notes", "")),
        dur,
        payload.get("capabilities", {}),
        kws,
    ]
    base = "|".join(_normalize(f) for f in fields)
    return hashlib.sha256(base.encode("utf-8")).hexdigest()
|
||||
|
|
@ -171,7 +181,6 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
|
|||
pass
|
||||
|
||||
# Defaults/Fallbacks
|
||||
duration = 0.0
|
||||
try:
|
||||
duration = float(raw.get("Dauer", 0) or 0)
|
||||
except Exception:
|
||||
|
|
@ -191,7 +200,6 @@ def build_payload(raw: Dict[str, Any], fullurl: str, category: str, *, mutate: b
|
|||
|
||||
notes = raw.get("Hinweise", "") or ""
|
||||
if mutate:
|
||||
# Für Smoke-Test (3. Lauf) geringfügige Änderung erzeugen
|
||||
notes = (notes + " [auto-update]").strip()
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
|
|
@ -236,26 +244,45 @@ def lookup_by_external_id(external_id: str) -> Tuple[Optional[Dict[str, Any]], O
|
|||
return {"error": str(e)}, None
|
||||
|
||||
|
||||
def _payload_subset_for_fp(p: Dict[str, Any]) -> Dict[str, Any]:
|
||||
return {
|
||||
"title": p.get("title"),
|
||||
"summary": p.get("summary"),
|
||||
"execution": p.get("execution"),
|
||||
"notes": p.get("notes"),
|
||||
"duration_minutes": p.get("duration_minutes"),
|
||||
"capabilities": p.get("capabilities") or {},
|
||||
"keywords": p.get("keywords") or [],
|
||||
}
|
||||
|
||||
|
||||
def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
||||
title = payload.get("title", "<ohne Titel>")
|
||||
ext_id = payload.get("external_id")
|
||||
fp_new = payload.get("fingerprint")
|
||||
|
||||
found, status = lookup_by_external_id(ext_id)
|
||||
|
||||
action = "create"
|
||||
if status == 404 or found is None:
|
||||
action = "create"
|
||||
elif isinstance(found, dict):
|
||||
fp_old = found.get("fingerprint") or found.get("payload", {}).get("fingerprint")
|
||||
if fp_old == fp_new:
|
||||
action = "skip"
|
||||
reason = "not found (lookup 404)"
|
||||
found_payload = {}
|
||||
|
||||
if not (status == 404 or found is None):
|
||||
if isinstance(found, dict):
|
||||
found_payload = found.get("payload", found)
|
||||
fp_old_stored = found.get("fingerprint") or found_payload.get("fingerprint")
|
||||
fp_old_recalc = compute_fingerprint(_payload_subset_for_fp(found_payload))
|
||||
if fp_new == fp_old_stored or fp_new == fp_old_recalc:
|
||||
action, reason = "skip", "fingerprint unchanged"
|
||||
else:
|
||||
action = "update"
|
||||
action, reason = "update", "fingerprint changed"
|
||||
else:
|
||||
action = "create"
|
||||
action, reason = "create", "unexpected lookup type"
|
||||
|
||||
if dry_run:
|
||||
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id})")
|
||||
print(f"[DryRun] {action.upper():6} '{title}' ({ext_id}) – {reason}")
|
||||
if action == "update":
|
||||
_print_diff(found_payload, payload)
|
||||
return action
|
||||
|
||||
if action == "create":
|
||||
|
|
@ -268,7 +295,7 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
|||
pass
|
||||
else:
|
||||
resp.raise_for_status()
|
||||
print(f"[Create] '{title}' -> OK")
|
||||
print(f"[Create] '{title}' – {reason} -> OK")
|
||||
elif action == "update":
|
||||
payload2 = dict(payload)
|
||||
payload2["imported_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
|
||||
|
|
@ -281,12 +308,46 @@ def upsert_exercise(payload: Dict[str, Any], *, dry_run: bool = False) -> str:
|
|||
pass
|
||||
else:
|
||||
resp.raise_for_status()
|
||||
print(f"[Update] '{title}' -> OK")
|
||||
print(f"[Update] '{title}' – {reason} -> OK")
|
||||
_print_diff(found_payload, payload)
|
||||
else:
|
||||
print(f"[Skip] '{title}' (unverändert)")
|
||||
print(f"[Skip] '{title}' – {reason}")
|
||||
return action
|
||||
|
||||
|
||||
def _print_diff(before: Dict[str, Any], after: Dict[str, Any]) -> None:
    """Print a small per-field diff of the hashed fields (diagnostics only).

    Both payloads are normalised the same way before comparison: canonical
    title, whitespace-collapsed text fields, integer duration, and trimmed,
    de-duplicated, sorted keywords without empty entries. Differences that
    are only representational (extra whitespace, ``None`` vs. empty, an empty
    keyword) are therefore not reported.

    Args:
        before: Previously stored payload; may be empty.
        after: Newly built payload.
    """

    def _hash_view(p: Dict[str, Any]) -> Dict[str, Any]:
        # Normalised view of the fields that are reported in the diff.
        p = p or {}
        try:
            minutes = int(round(float(p.get("duration_minutes") or 0)))
        except (TypeError, ValueError, OverflowError):
            minutes = 0
        trimmed = {(k or "").strip() for k in p.get("keywords") or []}
        return {
            "title": _canon_title(p.get("title")),
            "summary": _norm_text(p.get("summary")),
            "execution": _norm_text(p.get("execution")),
            "notes": _norm_text(p.get("notes")),
            "duration_minutes": minutes,
            "capabilities": p.get("capabilities") or {},
            # Raw string as tie-breaker keeps the order deterministic for
            # keywords that differ only in case.
            "keywords": sorted(
                (k for k in trimmed if k),
                key=lambda k: (k.casefold(), k),
            ),
        }

    b_norm = _hash_view(before)
    a_norm = _hash_view(after)
    diff = {k: (b_norm[k], a_norm[k]) for k in b_norm if b_norm[k] != a_norm[k]}
    if diff:
        print("[Diff] changes:", json.dumps(diff, ensure_ascii=False))
    else:
        print("[Diff] (none in hash fields)")
|
||||
|
||||
|
||||
# ----- Orchestrierung -----
|
||||
|
||||
def process_one(title: str, category: str, *, mutate: bool = False, dry_run: bool = False) -> str:
|
||||
info = fetch_page_info(title)
|
||||
pid = info.get("pageid")
|
||||
|
|
@ -307,7 +368,6 @@ def process_all(category: str, *, dry_run: bool = False) -> Dict[str, int]:
|
|||
|
||||
for title, entry in pages.items():
|
||||
try:
|
||||
# Regressionsfix: generischer Zugriff auf Mapping‑ähnliche Einträge
|
||||
getter = getattr(entry, "get", None)
|
||||
if callable(getter):
|
||||
pid = getter("pageid")
|
||||
|
|
@ -374,7 +434,7 @@ def main() -> None:
|
|||
parser.add_argument("--username", type=str, default=os.getenv("WIKI_BOT_USER"), help="Wiki-Login Benutzer (überschreibt .env)")
|
||||
parser.add_argument("--password", type=str, default=os.getenv("WIKI_BOT_PASSWORD"), help="Wiki-Login Passwort (überschreibt .env)")
|
||||
parser.add_argument("--skip-login", action="store_true", help="Login-Schritt überspringen (falls Session schon aktiv)")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) loggen")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Kein Schreiben; nur Entscheidungen (create/update/skip) + Gründe loggen")
|
||||
parser.add_argument("--smoke-test", action="store_true", help="3 Durchläufe (create→skip→update) für --title")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user