mitai-jinkendo/backend/csv_parser/mapping_suggest.py
Lars c10da55ec6
All checks were successful
Deploy Development / deploy (push) Successful in 56s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 15s
feat(csv-templates): Introduce CSV template analysis and validation features
- Added a new endpoint for analyzing uploaded CSV files, providing suggestions for field mappings and type conversions.
- Implemented validation for required field targets to ensure all mandatory fields are mapped correctly.
- Enhanced the admin CSV templates interface with new routes and navigation options in the frontend.
- Updated API utility functions to support the new CSV analysis functionality.
- Improved error handling for CSV uploads, including file size and row count checks.
2026-04-10 06:39:41 +02:00

180 lines
6.8 KiB
Python

"""
Heuristic suggestions for CSV field_mappings / type_conversions (admin editor, issue #21).
"""
from __future__ import annotations
from copy import deepcopy
from typing import Any, Mapping
from csv_parser.core import normalize_header_for_signature
from csv_parser.module_registry import get_module_definition
# Per module: DB field -> normalized header fragments that hint at that field.
# The fragments are compared against normalized CSV headers (substring match or
# exact comparison of the normalized form).
_MODULE_HEADER_ALIASES: dict[str, dict[str, frozenset[str]]] = {
    # --- nutrition -----------------------------------------------------------
    "nutrition": {
        "date": frozenset(
            {
                "datum",
                "date",
                "tag",
                "day",
                "zeit",
                "timestamp",
                "uhrzeit",
                "monat",
                "jahr",
            }
        ),
        "kcal": frozenset(
            {"kcal", "kalorie", "calorie", "energie", "energy", "kj", "joule"}
        ),
        "protein_g": frozenset({"protein", "eiwei", "eiweiss"}),
        "fat_g": frozenset({"fett", "fat", "lipid"}),
        "carbs_g": frozenset(
            {"kh", "carb", "kohlenhydr", "carbs", "sugar", "zucker"}
        ),
    },
    # --- weight --------------------------------------------------------------
    "weight": {
        "date": frozenset({"datum", "date", "tag", "day", "zeit"}),
        "weight": frozenset({"gewicht", "weight", "masse", "kg", "kilo"}),
        "note": frozenset({"notiz", "note", "comment", "kommentar"}),
    },
    # --- blood pressure ------------------------------------------------------
    "blood_pressure": {
        "measured_date": frozenset({"datum", "date", "tag", "day", "messdatum"}),
        "measured_time": frozenset({"zeit", "time", "uhr", "uhrzeit"}),
        "systolic": frozenset({"systol", "sys", "sbp", "oberdruck"}),
        "diastolic": frozenset({"diastol", "dia", "dbp", "unterdruck"}),
        "pulse": frozenset({"puls", "pulse", "hr", "herz", "bpm"}),
    },
    # --- activity ------------------------------------------------------------
    "activity": {
        "date": frozenset({"datum", "date", "tag", "day"}),
        "start_time": frozenset({"start", "beginn", "von"}),
        "end_time": frozenset({"end", "ende", "bis", "stop"}),
        "activity_type": frozenset(
            {"workout", "training", "typ", "type", "art", "aktiv"}
        ),
        "duration_min": frozenset({"dauer", "duration", "min"}),
        "distance_km": frozenset({"strecke", "distance", "km", "distanz"}),
        "kcal_active": frozenset({"kcal", "kalorie", "energie", "active"}),
        "hr_avg": frozenset({"puls", "heart", "hr", "bpm", "herzfrequenz"}),
    },
}
# Per module: DB field -> default type-conversion spec.
# build_type_conversions_for_mapping falls back to these entries for mapped
# target fields that the seed template does not cover.
_DEFAULT_TYPE_CONVERSIONS: dict[str, dict[str, dict[str, Any]]] = {
    # --- nutrition -----------------------------------------------------------
    "nutrition": {
        "date": {
            "type": "date",
            "format": "dd.mm.yyyy HH:MM",
            "extract": "date_only",
            "flexible": True,
        },
        "kcal": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "protein_g": {
            "type": "float",
            "decimal_separator": "auto",
            "flexible": True,
        },
        "fat_g": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "carbs_g": {
            "type": "float",
            "decimal_separator": "auto",
            "flexible": True,
        },
    },
    # --- weight --------------------------------------------------------------
    "weight": {
        "date": {"type": "date", "format": "dd.mm.yyyy", "flexible": True},
        "weight": {"type": "float", "decimal_separator": "auto", "flexible": True},
        "note": {"type": "string"},
    },
    # --- blood pressure ------------------------------------------------------
    "blood_pressure": {
        "measured_date": {
            "type": "date",
            "format": "dd.mm.yyyy",
            "flexible": True,
        },
        "measured_time": {"type": "time", "format": "HH:MM", "flexible": True},
        "systolic": {"type": "int", "flexible": True},
        "diastolic": {"type": "int", "flexible": True},
        "pulse": {"type": "int", "flexible": True},
    },
    # --- activity ------------------------------------------------------------
    "activity": {
        "date": {"type": "date", "format": "yyyy-mm-dd", "flexible": True},
        "start_time": {
            "type": "datetime",
            "format": "yyyy-mm-dd HH:MM:SS",
            "flexible": True,
        },
        "end_time": {
            "type": "datetime",
            "format": "yyyy-mm-dd HH:MM:SS",
            "flexible": True,
        },
        "activity_type": {"type": "string"},
        "duration_min": {
            "type": "duration",
            "format": "HH:MM:SS",
            "target_unit": "minutes",
            "flexible": True,
        },
        "distance_km": {
            "type": "float",
            "decimal_separator": "auto",
            "flexible": True,
        },
        "kcal_active": {
            "type": "float",
            "decimal_separator": "auto",
            "flexible": True,
        },
        "hr_avg": {"type": "int", "flexible": True},
    },
}
def _norm_key(header: str) -> str:
    """Return *header* in the normalized form used for header comparisons.

    Thin wrapper around ``normalize_header_for_signature`` so that every
    lookup in this module normalizes headers the same way.
    """
    normalized = normalize_header_for_signature(header)
    return normalized
def _match_seed_to_db_field(header: str, seed_fm: Mapping[str, str]) -> str | None:
    """Look up the target field a seed mapping assigns to *header*.

    Three lookups are tried in order; the first usable target wins:

    1. the raw header as a seed key,
    2. the normalized header as a seed key,
    3. any seed key whose normalized form equals the normalized header.

    A target is usable when it is non-empty and is neither ``"-"`` nor
    ``"_skip"``.

    Args:
        header: Raw CSV column header.
        seed_fm: Seed field mappings (header -> target field).

    Returns:
        The matching target field, or ``None`` when no lookup succeeds.
    """
    skip_markers = ("-", "_skip")

    # 1) Raw header used verbatim as seed key.
    exact_target = seed_fm.get(header)
    if exact_target and exact_target not in skip_markers:
        return exact_target

    # 2) Normalized header used as seed key.
    normalized = _norm_key(header)
    normalized_target = seed_fm.get(normalized)
    if normalized_target and normalized_target not in skip_markers:
        return normalized_target

    # 3) Compare against the normalized form of every seed key.
    for seed_key, seed_target in seed_fm.items():
        usable = bool(seed_target) and seed_target not in skip_markers
        if usable and _norm_key(str(seed_key)) == normalized:
            return seed_target

    return None
def _alias_suggest(norm: str, module: str, used: set[str]) -> str | None:
    """Suggest a not-yet-used DB field of *module* for a normalized header.

    Candidate fields are tried in the order of the module definition's
    ``fields`` mapping and the first hit wins. A field matches when

    * the lowercased header equals the field name (underscores ignored), or
    * the lowercased header contains one of the field's alias fragments from
      ``_MODULE_HEADER_ALIASES`` (fragments shorter than two characters are
      ignored).

    Args:
        norm: Normalized CSV header.
        module: Module name used to look up the field list and the aliases.
        used: Target fields that are already assigned; these are skipped.
            The set is only read, never modified.

    Returns:
        The suggested DB field, or ``None`` if the module is unknown or no
        field matches.
    """
    mod = get_module_definition(module)
    if not mod:
        return None

    aliases = _MODULE_HEADER_ALIASES.get(module, {})

    # Loop-invariant forms of the header, computed once instead of per field.
    nlow = norm.lower()
    nlow_compact = nlow.replace("_", "")

    for db_field in mod["fields"]:
        if db_field in used:
            continue

        if nlow == db_field or nlow_compact == db_field.replace("_", ""):
            return db_field

        # Plain substring test, so short fragments can also hit inside longer,
        # unrelated words. The former extra check against the non-lowercased
        # header was unreachable: alias fragments are lowercase, so any hit in
        # ``norm`` is also a hit in ``nlow``.
        tokens = aliases.get(db_field, frozenset())
        if any(len(tok) >= 2 and tok in nlow for tok in tokens):
            return db_field

    return None
def suggest_field_mappings(
    headers: list[str],
    module: str,
    seed_fm: Mapping[str, str] | None = None,
) -> dict[str, str]:
    """Map every CSV column (raw header as key) to a DB field or ``"-"``.

    Two passes are made over *headers*: first targets from a matching seed
    template are applied, then the alias heuristic fills the columns that are
    still unmapped. Each DB field is assigned to at most one column.

    Args:
        headers: Raw CSV column headers.
        module: Module name; for an unknown module every column maps to ``"-"``.
        seed_fm: Optional seed field mappings that take precedence over the
            alias heuristic.

    Returns:
        Dict from raw header to the suggested DB field, or ``"-"`` when no
        suggestion was found.
    """
    module_known = bool(get_module_definition(module))
    suggestions: dict[str, str] = dict.fromkeys(headers, "-")
    if not module_known:
        return suggestions

    taken: set[str] = set()

    # Pass 1: targets dictated by the seed template.
    if seed_fm:
        for header in headers:
            seed_target = _match_seed_to_db_field(header, seed_fm)
            if not seed_target or seed_target in taken:
                continue
            suggestions[header] = seed_target
            taken.add(seed_target)

    # Pass 2: alias heuristic for everything the seed left unmapped.
    for header in headers:
        if suggestions[header] == "-":
            guessed = _alias_suggest(_norm_key(header), module, taken)
            if guessed:
                suggestions[header] = guessed
                taken.add(guessed)

    return suggestions
def build_type_conversions_for_mapping(
    module: str,
    field_mappings: Mapping[str, str],
    seed_tc: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build type_conversions for the target fields that are actually mapped.

    Seed entries take precedence; module defaults from
    ``_DEFAULT_TYPE_CONVERSIONS`` only fill targets the seed did not provide.
    Seed entries that are not dicts are ignored, so such a target falls back
    to its default if one exists. All returned specs are deep copies, so
    callers may modify them without touching the seed or the defaults.

    Args:
        module: Module name used to select the default conversions.
        field_mappings: Header -> target field; empty, ``"-"`` and ``"_skip"``
            targets are not considered mapped.
        seed_tc: Optional seed type conversions (target field -> spec dict).

    Returns:
        Dict from target field to its conversion spec. Seed-provided entries
        come first in seed order, followed by defaults in the order the
        targets first appear in *field_mappings*.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set of
    # strings would iterate in a hash-dependent order that can differ between
    # processes, making the key order of the result non-deterministic.
    targets = dict.fromkeys(
        v for v in field_mappings.values() if v and v not in ("-", "_skip")
    )

    out: dict[str, Any] = {}
    if seed_tc:
        for field, spec in seed_tc.items():
            if field in targets and isinstance(spec, dict):
                out[field] = deepcopy(spec)

    defaults = _DEFAULT_TYPE_CONVERSIONS.get(module, {})
    for target in targets:
        if target not in out and target in defaults:
            out[target] = deepcopy(defaults[target])
    return out