"""
Heuristic suggestions for CSV field_mappings / type_conversions
(admin editor, issue #21).
"""

from __future__ import annotations

from copy import deepcopy
from typing import Any, Mapping

from csv_parser.core import normalize_header_for_signature
from csv_parser.module_registry import get_module_definition

# Mapping targets that mean "this CSV column is not assigned to a DB field".
# Kept as a tuple (not a set) so the membership test also works for
# unhashable values coming from loosely-typed seed data.
_UNMAPPED_TARGETS: tuple[str, ...] = ("-", "_skip")

# Alias tokens shorter than this are ignored by the substring heuristic.
_MIN_ALIAS_TOKEN_LEN = 2

# Normalized header fragments -> DB field (substring or exact normalized
# comparison). Outer key: module name; inner key: DB field name.
_MODULE_HEADER_ALIASES: dict[str, dict[str, frozenset[str]]] = {
    "nutrition": {
        "date": frozenset(
            {"datum", "date", "tag", "day", "zeit", "timestamp", "uhrzeit", "monat", "jahr"}
        ),
        "kcal": frozenset({"kcal", "kalorie", "calorie", "energie", "energy", "kj", "joule"}),
        "protein_g": frozenset({"protein", "eiwei", "eiweiss"}),
        "fat_g": frozenset({"fett", "fat", "lipid"}),
        "carbs_g": frozenset({"kh", "carb", "kohlenhydr", "carbs", "sugar", "zucker"}),
    },
    "weight": {
        "date": frozenset({"datum", "date", "tag", "day", "zeit"}),
        "weight": frozenset({"gewicht", "weight", "masse", "kg", "kilo"}),
        "note": frozenset({"notiz", "note", "comment", "kommentar"}),
    },
    "blood_pressure": {
        "measured_date": frozenset({"datum", "date", "tag", "day", "messdatum"}),
        "measured_time": frozenset({"zeit", "time", "uhr", "uhrzeit"}),
        "systolic": frozenset({"systol", "sys", "sbp", "oberdruck"}),
        "diastolic": frozenset({"diastol", "dia", "dbp", "unterdruck"}),
        "pulse": frozenset({"puls", "pulse", "hr", "herz", "bpm"}),
    },
    "activity": {
        "date": frozenset({"datum", "date", "tag", "day"}),
        "start_time": frozenset({"start", "beginn", "von"}),
        "end_time": frozenset({"end", "ende", "bis", "stop"}),
        "activity_type": frozenset({"workout", "training", "typ", "type", "art", "aktiv"}),
        "duration_min": frozenset({"dauer", "duration", "min"}),
        "distance_km": frozenset({"strecke", "distance", "km", "distanz"}),
        "kcal_active": frozenset({"kcal", "kalorie", "energie", "active"}),
        "kcal_resting": frozenset({"ruhe", "resting"}),
        "hr_avg": frozenset({"puls", "heart", "hr", "bpm", "herzfrequenz", "durchschn"}),
        "hr_max": frozenset({"max", "peak"}),
    },
    "vitals_baseline": {
        "date": frozenset({"datum", "date", "tag", "start", "zeit"}),
        "resting_hr": frozenset({"ruhepuls", "resting", "rhr"}),
        "hrv": frozenset({"hrv", "variabilit", "vfc"}),
        "vo2_max": frozenset({"vo2"}),
        "spo2": frozenset({"sauerstoff", "spo2", "oxygen"}),
        "respiratory_rate": frozenset({"atem", "respiratory"}),
    },
}

# Default type_conversions spec per module and DB field; used when the seed
# template does not provide a spec for an assigned target field.
_DEFAULT_TYPE_CONVERSIONS: dict[str, dict[str, dict[str, Any]]] = {
    "nutrition": {
        "date": {
            "type": "date",
            "format": "dd.mm.yyyy HH:MM",
            "extract": "date_only",
            "flexible": True,
        },
        "kcal": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "protein_g": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "fat_g": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "carbs_g": {"type": "float", "decimal_separator": "auto", "flexible": True},
    },
    "weight": {
        "date": {"type": "date", "format": "dd.mm.yyyy", "flexible": True},
        "weight": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "note": {"type": "string"},
    },
    "blood_pressure": {
        "measured_date": {"type": "date", "format": "dd.mm.yyyy", "flexible": True},
        "measured_time": {"type": "time", "format": "HH:MM", "flexible": True},
        "systolic": {"type": "int", "flexible": True},
        "diastolic": {"type": "int", "flexible": True},
        "pulse": {"type": "int", "flexible": True},
    },
    "activity": {
        "date": {"type": "date", "format": "yyyy-mm-dd", "flexible": True},
        "start_time": {"type": "datetime", "format": "yyyy-mm-dd HH:MM:SS", "flexible": True},
        "end_time": {"type": "datetime", "format": "yyyy-mm-dd HH:MM:SS", "flexible": True},
        "activity_type": {"type": "string"},
        "duration_min": {
            "type": "duration",
            "format": "HH:MM:SS",
            "target_unit": "minutes",
            "flexible": True,
        },
        "distance_km": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "kcal_active": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "kcal_resting": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "hr_avg": {"type": "int", "flexible": True},
        "hr_max": {"type": "int", "flexible": True},
    },
    "vitals_baseline": {
        "date": {
            "type": "datetime",
            "format": "yyyy-mm-dd HH:MM:SS",
            "extract": "date_only",
            "flexible": True,
        },
        "resting_hr": {"type": "int", "flexible": True},
        "hrv": {"type": "int", "flexible": True},
        "vo2_max": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "spo2": {"type": "int", "flexible": True},
        "respiratory_rate": {"type": "float", "decimal_separator": "auto", "flexible": True},
    },
}

# DB fields holding an energy value; only these can receive a kJ source-unit hint.
_ENERGY_FIELDS = frozenset({"kcal", "kcal_active", "kcal_resting"})


def _norm_key(header: str) -> str:
    """Return *header* normalized via ``normalize_header_for_signature``."""
    return normalize_header_for_signature(header)


def _is_mapped_target(value: Any) -> bool:
    """Return True if *value* names a real target field (non-empty, not '-' / '_skip')."""
    return bool(value) and value not in _UNMAPPED_TARGETS


def _match_seed_to_db_field(header: str, seed_fm: Mapping[str, str]) -> str | None:
    """Find the target field if a seed key matches this header.

    Lookup order: the raw header as key, then the normalized header as key,
    then any seed key whose own normalized form equals the normalized header.
    Seed entries whose target is empty, '-' or '_skip' are ignored.

    Returns the target DB field, or None when no usable seed entry matches.
    """
    if header in seed_fm:
        target = seed_fm[header]
        if _is_mapped_target(target):
            return target

    normalized = _norm_key(header)
    if normalized in seed_fm:
        target = seed_fm[normalized]
        if _is_mapped_target(target):
            return target

    for seed_key, target in seed_fm.items():
        if not _is_mapped_target(target):
            continue
        if _norm_key(str(seed_key)) == normalized:
            return target
    return None


def _alias_suggest(norm: str, module: str, used: set[str]) -> str | None:
    """Suggest a DB field of *module* for an already-normalized header.

    Fields listed in *used* are never suggested. Candidates are taken from the
    module definition's ``fields`` mapping, in that mapping's key order.

    Returns the suggested DB field, or None if the module is unknown or
    nothing matches.
    """
    mod = get_module_definition(module)
    if not mod:
        return None

    aliases = _MODULE_HEADER_ALIASES.get(module, {})
    candidates = [name for name in mod["fields"] if name not in used]
    lowered = norm.lower()
    compact = lowered.replace("_", "")

    # Pass 1: a header that *is* the field name (ignoring underscores) wins
    # over any substring alias. Otherwise e.g. "kcal_resting" would be claimed
    # by an earlier field whose alias set contains "kcal".
    for db_field in candidates:
        if lowered == db_field or compact == db_field.replace("_", ""):
            return db_field

    # Pass 2: substring match against the alias tokens, in field order.
    # (A former second check on the non-lowercased header was unreachable:
    # every token is lowercase, so it could never match where this one fails.)
    for db_field in candidates:
        tokens = aliases.get(db_field, frozenset())
        if any(len(tok) >= _MIN_ALIAS_TOKEN_LEN and tok in lowered for tok in tokens):
            return db_field
    return None


def suggest_field_mappings(
    headers: list[str],
    module: str,
    seed_fm: Mapping[str, str] | None = None,
) -> dict[str, str]:
    """Map every CSV column (raw header as key) to a DB field or '-'.

    A matching seed template is applied first, then the alias heuristic fills
    the remaining columns. Each DB field is assigned to at most one column.
    For module "sleep" or an unknown module every column maps to '-'.
    """
    mappings: dict[str, str] = {h: "-" for h in headers}
    # Short-circuit keeps the registry lookup from running for "sleep".
    if module == "sleep" or not get_module_definition(module):
        return mappings

    used: set[str] = set()

    if seed_fm:
        for header in headers:
            target = _match_seed_to_db_field(header, seed_fm)
            if target and target not in used:
                mappings[header] = target
                used.add(target)

    for header in headers:
        if mappings[header] != "-":
            continue
        target = _alias_suggest(_norm_key(header), module, used)
        if target:
            mappings[header] = target
            used.add(target)
    return mappings


def build_type_conversions_for_mapping(
    module: str,
    field_mappings: Mapping[str, str],
    seed_tc: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build type_conversions for assigned target fields only; seed overrides defaults.

    Seed specs (dict values only) are copied for fields that are actual
    targets of *field_mappings*; remaining targets fall back to the module
    defaults, added in the order the targets appear in *field_mappings*.
    All specs in the result are deep copies. Module "sleep" yields ``{}``.
    """
    if module == "sleep":
        return {}

    defaults = _DEFAULT_TYPE_CONVERSIONS.get(module, {})
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # resulting key order is deterministic (a plain set is not).
    targets = dict.fromkeys(
        target for target in field_mappings.values() if _is_mapped_target(target)
    )

    out: dict[str, Any] = {}
    if seed_tc:
        for field, spec in seed_tc.items():
            if field in targets and isinstance(spec, dict):
                out[field] = deepcopy(spec)

    for field in targets:
        if field not in out and field in defaults:
            out[field] = deepcopy(defaults[field])

    _apply_energy_kj_hint_from_headers(module, field_mappings, out)
    return out


def _apply_energy_kj_hint_from_headers(
    module: str,
    field_mappings: Mapping[str, str],
    out: dict[str, Any],
) -> None:
    """Set ``source_unit`` to "kj" when a header suggests kJ/kilojoule (not kcal).

    Only applies to modules "nutrition" and "activity" and to energy fields
    whose spec in *out* is a dict without a truthy ``source_unit``.
    Mutates *out* in place by replacing the affected spec with an updated copy.
    """
    if module not in ("nutrition", "activity"):
        return

    for csv_col, db_field in field_mappings.items():
        if db_field not in _ENERGY_FIELDS:
            continue
        spec = out.get(db_field)
        if not isinstance(spec, dict) or spec.get("source_unit"):
            continue
        header = _norm_key(str(csv_col)).lower()
        # A header mentioning kcal is taken at face value, even if it also
        # mentions kJ.
        if "kcal" in header:
            continue
        if "kj" in header or "kilojoule" in header:
            updated = deepcopy(spec)
            updated["source_unit"] = "kj"
            out[db_field] = updated