shinkan-jinkendo/backend/smw_mapper.py
Lars e6ce7e241c
Some checks failed
Deploy Development / deploy (push) Successful in 54s
Test Suite / lint-backend (push) Successful in 0s
Test Suite / build-frontend (push) Successful in 5s
Test Suite / playwright-tests (push) Failing after 1m55s
fix: convert skill target_level to INTEGER instead of string
- build_skill_assignments() now returns INTEGER (1-5) for target_level
- Previously returned string names ('einsteiger', 'grundlagen', etc.)
- Caused 91/95 import failures with 'invalid input syntax for type integer'
- Remaining 4 failures are wiki pages missing both goal AND execution (invalid data)

Issue: Wiki import - dict-cursor fixes phase 2
2026-04-27 09:30:29 +02:00

374 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Semantic MediaWiki → Shinkan Field Mapper
Wandelt SMW-Properties von karatetrainer.net in lokale DB-Felder um.
Property-Namen wurden via discover_properties() auf echten Wiki-Seiten ermittelt.
Entdeckte Kategorien:
Übungen: Kategorie: Übungen (auch "Übungen Karate", "Übungen allgemein")
Fähigkeiten: Fähigkeitsbeschreibung
Methoden: Methodenbeschreibung
"""
import re
import logging
from typing import Optional
logger = logging.getLogger(__name__)
# ------------------------------------------------------------------ #
#  CapabilityLevel integer → named levels                             #
# ------------------------------------------------------------------ #
# Mapping: SMW level (integer written as a string) → Shinkan level name
CAPABILITY_LEVEL_MAP = {
    "1": "einsteiger",
    "2": "grundlagen",
    "3": "aufbau",
    "4": "fortgeschritten",
    "5": "experte",
}
# ------------------------------------------------------------------ #
#  SMW property → local field                                         #
#  Real names from karatetrainer.net (via discover_properties)        #
# ------------------------------------------------------------------ #
# Exercises ("Übungen")
EXERCISE_PROPERTY_MAP = {
    # Core fields
    "Übungsbezeichnung": "title_override",  # exercise name (preferred over the page title)
    "Ziel": "goal",
    "Durchführung": "execution",
    "Summary": "summary",
    "Hinweise": "trainer_notes",
    "Plandauer": "duration_raw",  # number of minutes, e.g. "10"
    "Gruppengröße": "group_size_raw",  # number, e.g. "2"
    "Hilfsmittel": "equipment_raw",  # comma-separated list / single value
    "Schlüsselworte": "keywords_raw",  # keywords (not stored in the DB directly; kept for future tags)
    # Catalog fields (name → ID lookup)
    "Übungstyp": "focus_area_names",  # "Karate" → focus_area
    "Zielgruppe": "target_group_names",
    "Altersgruppe": "age_group_names",
    "Trainingsmethode": "method_names",  # wiki page name, e.g. "Plyometrisches_Training"
    # Skills (as names + level)
    "PrimaryCapability": "skill_names",  # skill names (there may be several)
    "CapabilityLevel": "skill_levels_raw",  # integer levels ["3", "2"] → aufbau, grundlagen
    # Further fields (optional)
    "Graduierung": "graduierung",  # "0 - Anfänger" (reserved for future use)
    "Lernstufe": "lernstufe",  # "Lernstufe_1_-_Erlernen_und_Festigen"
}
# Skills ("Fähigkeiten") — wiki category: Fähigkeitsbeschreibung
SKILL_PROPERTY_MAP = {
    "Summary": "description",
    "KarateRelevanz": "karate_relevance",  # appended to the description
    "RelevanzLevel": "relevance_level",  # 1-3, not stored directly in the skills DB
}
# Training methods ("Trainingsmethoden") — wiki category: Methodenbeschreibung
METHOD_PROPERTY_MAP = {
    "Summary": "description",
    "Kurzbezeichnung": "code",  # abbreviation, e.g. "DM"
    "KarateRelevanz": "karate_relevance",
    "PrimaryCapability": "skill_names",  # linked skills
}
# ------------------------------------------------------------------ #
# Wikitext → Plaintext #
# ------------------------------------------------------------------ #
def wikitext_to_plaintext(wikitext: str) -> str:
    """Strip wikitext markup and return readable plain text.

    The substitutions below are applied strictly in the listed order; later
    rules rely on earlier ones having already removed their markup (e.g. the
    generic HTML-tag rule runs after ``<br>`` has been turned into newlines).

    Args:
        wikitext: Raw wikitext of a property value.

    Returns:
        The text without link/template/bold/italic/HTML/heading markup, with
        list markers normalised to ``- `` and surrounding whitespace stripped.
    """

    def _page_name(match):
        # [[Some_Page]] → "Some Page"
        return match.group(1).replace('_', ' ')

    # (pattern, replacement, flags) — order matters.
    rules = (
        # External links: [https://example.com Text] → Text
        (r'\[https?://\S+\s+([^\]]+)\]', r'\1', 0),
        # Internal links with alias: [[Link|Text]] → Text
        (r'\[\[([^|\]]+)\|([^\]]+)\]\]', r'\2', 0),
        # Internal links without alias: [[Link]] → Link (underscores → spaces)
        (r'\[\[([^\]]+)\]\]', _page_name, 0),
        # Drop templates (single-level only)
        (r'\{\{[^}]+\}\}', '', 0),
        # Bold, then italic
        (r"'''(.+?)'''", r'\1', 0),
        (r"''(.+?)''", r'\1', 0),
        # <br> becomes a newline, every other HTML tag is removed
        (r'<br\s*/?>', '\n', re.IGNORECASE),
        (r'<[^>]+>', '', 0),
        # Headings: == Title == → Title on its own line
        (r'={2,6}\s*(.+?)\s*={2,6}', r'\n\1\n', 0),
        # Normalise list markers at line starts
        (r'^[*#:;]+\s*', '- ', re.MULTILINE),
        # Collapse runs of blank lines
        (r'\n{3,}', '\n\n', 0),
    )

    plain = wikitext
    for pattern, replacement, flags in rules:
        plain = re.sub(pattern, replacement, plain, flags=flags)
    return plain.strip()
def wiki_name_to_label(wiki_name: str) -> str:
    """Turn a wiki page name into a readable label.

    Underscores become spaces and surrounding whitespace is removed, e.g.
    ``Plyometrisches_Training`` → ``Plyometrisches Training``.
    """
    spaced = wiki_name.replace('_', ' ')
    return spaced.strip()
# ------------------------------------------------------------------ #
# Parsing-Hilfsfunktionen #
# ------------------------------------------------------------------ #
def parse_duration(raw: str) -> tuple[Optional[int], Optional[int]]:
    """Parse a planned duration (minutes) into a ``(min, max)`` pair.

    Examples:
        "10"    → (10, 10)   single value: min and max are the same
        "10-15" → (10, 15)   range: first two numbers found
        10      → (10, 10)   numeric values are accepted as well
        ""      → (None, None)

    Args:
        raw: The property value. Normally a string, but ``None`` and plain
            numbers are tolerated because numeric properties may not arrive
            as strings.

    Returns:
        ``(min, max)`` in minutes, or ``(None, None)`` if no number is found.
    """
    if raw is None:
        return None, None
    # Coerce to text so int/float values do not crash re.findall. Only the
    # integer part of a decimal is captured, so "10.0" is read as 10 rather
    # than as the two numbers 10 and 0.
    numbers = [int(digits) for digits in re.findall(r'(\d+)(?:\.\d+)?', str(raw))]
    if not numbers:
        return None, None
    lower = numbers[0]
    upper = numbers[1] if len(numbers) > 1 else lower
    return lower, upper
def parse_group_size(raw: str) -> tuple[Optional[int], Optional[int]]:
    """Parse a group size into a ``(min, max)`` pair.

    Examples:
        "2"   → (2, None)   single value: a minimum, no maximum given
        "2-4" → (2, 4)      range: first two numbers found
        2     → (2, None)   numeric values are accepted as well
        ""    → (None, None)

    Args:
        raw: The property value. Normally a string, but ``None`` and plain
            numbers are tolerated because numeric properties may not arrive
            as strings.

    Returns:
        ``(min, max)``; ``max`` is ``None`` when only one number is present,
        and both are ``None`` when no number is found.
    """
    if raw is None:
        return None, None
    # Coerce to text so int/float values do not crash re.findall. Only the
    # integer part of a decimal is captured, so "2.0" is read as 2 rather
    # than as the two numbers 2 and 0.
    numbers = [int(digits) for digits in re.findall(r'(\d+)(?:\.\d+)?', str(raw))]
    if not numbers:
        return None, None
    if len(numbers) == 1:
        return numbers[0], None  # minimum only, no maximum specified
    return numbers[0], numbers[1]
def parse_equipment(raw: list[str]) -> list[str]:
    """Normalise an equipment list.

    Each entry is split on ``,``, ``;`` and ``/``; every fragment is trimmed
    and converted with ``wiki_name_to_label``. Fragments that end up empty are
    dropped. Order is preserved.

    Args:
        raw: Equipment values, e.g. ``["Ausdruck"]`` or ``["Gewicht"]``.

    Returns:
        The cleaned, flattened list of equipment labels.
    """
    fragments = (piece for entry in raw for piece in re.split(r'[,;/]', entry))
    labels = (wiki_name_to_label(piece.strip()) for piece in fragments)
    return [label for label in labels if label]
def map_capability_level(level_str: str) -> str:
    """Translate a level given as an integer string into its named level.

    Example: ``"3"`` → ``"aufbau"``. Surrounding whitespace is ignored and any
    value not present in ``CAPABILITY_LEVEL_MAP`` yields ``"einsteiger"``.
    """
    key = level_str.strip()
    if key in CAPABILITY_LEVEL_MAP:
        return CAPABILITY_LEVEL_MAP[key]
    return "einsteiger"
# ------------------------------------------------------------------ #
# Haupt-Mapping-Funktion #
# ------------------------------------------------------------------ #
def map_wiki_to_exercise(
    page_title: str,
    wiki_page_id: Optional[int],
    smw_props: dict,
) -> dict:
    """Convert the SMW properties of a wiki page into an exercise dict.

    Args:
        page_title: Title of the wiki page (fallback for ``title``).
        wiki_page_id: Internal MediaWiki page ID.
        smw_props: ``{property_name: [value, ...]}`` as returned by
            ``SmwClient.browse_subject()``. A bare (non-list) value is treated
            like a one-element list.

    Returns:
        Dict with the mapped fields plus the catalog name lists that are
        resolved to IDs later. Properties that are empty, unknown to
        ``EXERCISE_PROPERTY_MAP`` or have no handling here are skipped.
    """
    # Targets whose value is plain wikitext → plaintext.
    text_fields = ("goal", "execution", "summary", "trainer_notes")
    # Targets whose values are all converted to labels: target → result key.
    label_list_fields = {
        "keywords_raw": "keywords",  # kept for a later tag implementation
        "focus_area_names": "focus_area_names",
        "target_group_names": "target_group_names",
        "age_group_names": "age_group_names",
        "method_names": "method_names",
        "skill_names": "skill_names",
    }

    exercise: dict = {
        "title": page_title,
        "wiki_page_id": wiki_page_id,
        # Tracking
        "import_source": "mediawiki",
        "import_id": page_title,
        # Defaults
        "visibility": "private",
        "status": "draft",
        # Catalog references (name → ID lookup happens in the router)
        "focus_area_names": [],
        "target_group_names": [],
        "age_group_names": [],
        "skill_names": [],
        "skill_levels_raw": [],  # integer strings ["3", "2"]
        "method_names": [],
        # Equipment
        "equipment": [],
        # Warnings for unknown catalog values
        "warnings": [],
    }

    for smw_name, raw in smw_props.items():
        if not raw:
            continue
        field = EXERCISE_PROPERTY_MAP.get(smw_name)
        if not field:
            continue

        # Normalise once: always work with a list and its first element.
        items = raw if isinstance(raw, list) else [raw]
        head = items[0]

        if field in label_list_fields:
            exercise[label_list_fields[field]] = [
                wiki_name_to_label(item) for item in items
            ]
        elif field in text_fields:
            exercise[field] = wikitext_to_plaintext(head)
        elif field == "title_override":
            exercise["title"] = wiki_name_to_label(head)
        elif field == "duration_raw":
            exercise["duration_min"], exercise["duration_max"] = parse_duration(head)
        elif field == "group_size_raw":
            exercise["group_size_min"], exercise["group_size_max"] = parse_group_size(head)
        elif field == "equipment_raw":
            exercise["equipment"] = parse_equipment(items)
        elif field == "skill_levels_raw":
            exercise["skill_levels_raw"] = list(items)

    return exercise
def _coerce_target_level(raw_level) -> int:
    """Convert one raw CapabilityLevel value to an int, falling back to 1."""
    if isinstance(raw_level, bool):
        return 1  # bool is an int subclass but never a meaningful level
    if isinstance(raw_level, int):
        return raw_level
    text = str(raw_level).strip()
    try:
        return int(text)
    except ValueError:
        pass
    # Accept integral floats such as 3.0 / "3.0"; everything else → fallback.
    try:
        number = float(text)
    except ValueError:
        return 1
    return int(number) if number.is_integer() else 1


def build_skill_assignments(mapped: dict) -> list[dict]:
    """Build skill assignments from PrimaryCapability + CapabilityLevel.

    The two lists are matched by position: CapabilityLevel ``[3, 2]`` with
    PrimaryCapability ``[Schnellkraft, Schnelligkeitsausdauer]`` yields
    ``[{skill_name: Schnellkraft, target_level: 3},
    {skill_name: Schnelligkeitsausdauer, target_level: 2}]``.

    IMPORTANT: ``target_level`` is an INTEGER, not a level name.

    Levels may arrive as strings (``"3"``) or as numbers (``3``); both are
    honoured. A missing or unparsable level falls back to 1. Values are not
    clamped to a range here.

    Args:
        mapped: Exercise dict holding ``skill_names`` and ``skill_levels_raw``.
            Missing keys or ``None`` values are treated as empty lists.

    Returns:
        One assignment dict per skill name; the first skill is marked primary.
    """
    skills = mapped.get("skill_names") or []
    levels = mapped.get("skill_levels_raw") or []
    assignments = []
    for idx, skill_name in enumerate(skills):
        # Skills without a corresponding level get the fallback level 1.
        raw_level = levels[idx] if idx < len(levels) else 1
        assignments.append({
            "skill_name": skill_name,
            "target_level": _coerce_target_level(raw_level),  # INTEGER
            "required_level": None,  # not specified in the wiki
            "intensity": None,  # not specified in the wiki
            "is_primary": idx == 0,
        })
    return assignments
def map_wiki_to_skill(
    page_title: str,
    wiki_page_id: Optional[int],
    smw_props: dict,
) -> dict:
    """Convert the SMW properties of a skill description page into a skill dict.

    The summary (if any) is placed first in the description; the karate
    relevance (if any) is appended under a "Karate-Relevanz:" label. The
    ``description`` key is only set when at least one of them is present.
    Other mapped targets (e.g. ``relevance_level``) are not used here.

    Args:
        page_title: Title of the wiki page; used as name and import ID.
        wiki_page_id: Internal MediaWiki page ID.
        smw_props: ``{property_name: [value, ...]}``; a bare value is accepted
            in place of a list.

    Returns:
        The skill dict.
    """
    skill = {
        "name": page_title,
        "wiki_page_id": wiki_page_id,
        "import_source": "mediawiki",
        "import_id": page_title,
        "warnings": [],
    }

    sections = []
    for smw_name, raw in smw_props.items():
        if not raw:
            continue
        field = SKILL_PROPERTY_MAP.get(smw_name)
        if not field:
            continue

        head = raw[0] if isinstance(raw, list) else raw
        if field == "description":
            # The summary always leads, regardless of property order.
            sections.insert(0, wikitext_to_plaintext(head))
        elif field == "karate_relevance":
            relevance = wikitext_to_plaintext(head)
            sections.append(f"\nKarate-Relevanz: {relevance}")

    if sections:
        skill["description"] = "\n".join(sections).strip()
    return skill
def map_wiki_to_method(
    page_title: str,
    wiki_page_id: Optional[int],
    smw_props: dict,
) -> dict:
    """Convert the SMW properties of a method description page into a method dict.

    The summary (if any) is placed first in the description; the karate
    relevance (if any) is appended under a "Karate-Relevanz:" label. The
    ``description`` key is only set when at least one of them is present, and
    ``code`` only when the abbreviation property is present.

    NOTE(review): ``METHOD_PROPERTY_MAP`` also maps a ``skill_names`` target,
    but it is not consumed here — confirm whether that is intended.

    Args:
        page_title: Title of the wiki page; used as name and import ID.
        wiki_page_id: Internal MediaWiki page ID.
        smw_props: ``{property_name: [value, ...]}``; a bare value is accepted
            in place of a list.

    Returns:
        The method dict.
    """
    method = {
        "name": page_title,
        "wiki_page_id": wiki_page_id,
        "import_source": "mediawiki",
        "import_id": page_title,
        "warnings": [],
    }

    sections = []
    for smw_name, raw in smw_props.items():
        if not raw:
            continue
        field = METHOD_PROPERTY_MAP.get(smw_name)
        if not field:
            continue

        head = raw[0] if isinstance(raw, list) else raw
        if field == "code":
            method["code"] = head.strip()
        elif field == "description":
            # The summary always leads, regardless of property order.
            sections.insert(0, wikitext_to_plaintext(head))
        elif field == "karate_relevance":
            relevance = wikitext_to_plaintext(head)
            sections.append(f"\nKarate-Relevanz: {relevance}")

    if sections:
        method["description"] = "\n".join(sections).strip()
    return method