# mitai-jinkendo/backend/tests/test_csv_parser_core.py

"""Tests for the CSV parser foundation (Issue #21)."""

import pytest

from csv_parser.core import (
    decode_raw_bytes,
    sniff_delimiter,
    parse_csv_sample,
    column_signature,
    headers_signature_match_score,
    headers_signature_rank_metrics,
    get_csv_import_limits,
    iter_csv_dict_rows,
)
from csv_parser.field_units import source_unit_choices_for_field
from csv_parser.mapping_suggest import build_type_conversions_for_mapping
from csv_parser.type_converter import convert_value, build_row_after_mapping


def test_decode_bom_utf8():
    """Decoded text must not start with a BOM and must keep the header line."""
    payload = "\ufeffa;b;c\n1;2;3".encode("utf-8-sig")
    decoded = decode_raw_bytes(payload)
    starts_with_bom = decoded.startswith("\ufeff")
    assert not starts_with_bom
    assert "a;b;c" in decoded


def test_sniff_delimiter():
    """Semicolon- and comma-separated lines each yield their own separator."""
    semicolon_guess = sniff_delimiter("a;b;c;d")
    assert semicolon_guess == ";"

    comma_guess = sniff_delimiter("a,b,c")
    assert comma_guess == ","


def test_parse_csv_sample_header():
    """A small sample yields the header list, the delimiter and the first row."""
    sample = "Date;kcal\n2024-01-01;2000\n"
    parsed = parse_csv_sample(sample, delimiter=";", max_data_rows=3)
    column_names, data_rows, used_delimiter = parsed

    assert column_names == ["Date", "kcal"]
    assert used_delimiter == ";"

    first_row = data_rows[0]
    assert first_row["Date"] == "2024-01-01"
    assert first_row["kcal"] == "2000"


def test_parse_csv_sample_nbsp_in_header_matches_normal_space_key():
    """Excel/Apple: NBSP (U+00A0) in a column name gives the same key as a normal space."""
    sample = "Aufgestiegene\u00a0Höhe (m);Wert\n12;3\n"
    column_names, data_rows, _ = parse_csv_sample(
        sample, delimiter=";", max_data_rows=3
    )

    assert column_names == ["Aufgestiegene Höhe (m)", "Wert"]
    first_row = data_rows[0]
    assert first_row["Aufgestiegene Höhe (m)"] == "12"


def test_iter_csv_dict_rows_nbsp_header_canonical():
    """Row dicts use the header key with a normal space instead of the NBSP."""
    sample = "col\u00a0one;b\n1;2\n"
    expected = [{"col one": "1", "b": "2"}]
    collected = [*iter_csv_dict_rows(sample, ";", has_header=True)]
    assert collected == expected


def test_build_row_field_mapping_space_vs_nbsp_in_csv_header():
    """Template (dialog) uses a normal space, the CSV an NBSP — the mapping must still apply."""
    source_row = {"Aufgestiegene\u00a0Höhe (m)": "10"}
    field_mapping = {"Aufgestiegene Höhe (m)": "stola"}
    conversions = {
        "stola": {"type": "float", "decimal_separator": ".", "flexible": True},
    }

    mapped = build_row_after_mapping(
        source_row, field_mapping, conversions, module="activity"
    )

    assert mapped.get("stola") == 10.0


def test_column_signature_sorted_unique():
    """Duplicates collapse and the result is lower-cased and sorted for this input."""
    headers = ["B", "a", "a"]
    assert column_signature(headers) == ["a", "b"]


def test_jaccard():
    """Two shared columns out of three distinct ones score 2/3."""
    smaller = column_signature(["Date", "Calories"])
    larger = column_signature(["Date", "Calories", "Fat"])
    score = headers_signature_match_score(smaller, larger)
    assert score == pytest.approx(2 / 3)


def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns present in the CSV → recall 1.0; Jaccard drops with many extra columns."""
    csv_headers = ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    template_headers = ["d", "e", "f"]

    metrics = headers_signature_rank_metrics(
        column_signature(csv_headers),
        column_signature(template_headers),
    )

    # Exact-valued metrics are checked in the same order as listed here.
    exact_expectations = {
        "confidence": 1.0,
        "template_recall": 1.0,
        "columns_matched": 3,
        "columns_in_template": 3,
    }
    for metric_name, wanted in exact_expectations.items():
        assert metrics[metric_name] == wanted

    assert metrics["jaccard"] == pytest.approx(3 / 8)


def test_get_csv_import_limits_default():
    """Passing ``None`` gives a row limit of 50,000 per file."""
    limits = get_csv_import_limits(None)
    assert limits["max_rows_per_file"] == 50_000


def test_convert_date_and_kcal_factor():
    """A dd.mm.yyyy date is parsed and a float is scaled by ``conversion_factor``."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy"}
    parsed = convert_value("15.01.2024", "date", date_spec)
    assert parsed.year == 2024
    assert parsed.month == 1
    assert parsed.day == 15

    kcal_spec = {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."}
    converted = convert_value("8000", "kcal", kcal_spec)
    expected = 8000 * 0.239
    assert abs(converted - expected) < 0.01


def test_convert_kcal_via_source_unit_kj():
    """With ``source_unit`` "kj" in the nutrition module, 4184 converts to about 1000."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}
    converted = convert_value("4184", "kcal", kj_spec, module="nutrition")
    deviation = abs(converted - 1000.0)
    assert deviation < 0.05


def test_apple_health_long_decimal_dot_preserved():
    """Apple: average HR and energy with many decimals; the dot is the decimal separator."""
    heart_rate = convert_value(
        "96.8749374730629",
        "hr_avg",
        {"type": "int", "flexible": True},
        module="activity",
    )
    assert heart_rate == 97

    resting_spec = {
        "type": "float",
        "decimal_separator": ".",
        "flexible": True,
        "source_unit": "kj",
    }
    resting_kcal = convert_value(
        "596.6689047323086", "kcal_resting", resting_spec, module="activity"
    )
    assert 140.0 < resting_kcal < 145.0


def test_parse_float_auto_us_thousands_comma():
    """"12,345" without a decimal point is still treated as a thousands group."""
    plain = convert_value(
        "12345", "x", {"type": "float", "decimal_separator": "auto"}
    )
    assert plain == 12345.0

    grouped = convert_value(
        "12,345", "x", {"type": "float", "decimal_separator": "auto"}
    )
    deviation = abs(grouped - 12345.0)
    assert deviation < 0.01


def test_convert_protein_kg_to_g():
    """With ``source_unit`` "kg", 0.1 converts to 100 for the ``protein_g`` field."""
    kg_spec = {"type": "float", "source_unit": "kg", "decimal_separator": "."}
    grams = convert_value("0.1", "protein_g", kg_spec, module="nutrition")
    deviation = abs(grams - 100.0)
    assert deviation < 0.001


def test_convert_custom_source_unit_only_conversion_factor():
    """Non-predefined conversion: ``conversion_factor`` (optionally with ``source_unit: custom``)."""
    custom_spec = {
        "type": "float",
        "source_unit": "custom",
        "conversion_factor": 2.5,
        "decimal_separator": ".",
    }
    converted = convert_value("100", "kcal", custom_spec, module="nutrition")
    deviation = abs(converted - 250.0)
    assert deviation < 0.001


def test_convert_unknown_source_unit_uses_conversion_factor_only():
    """An unrecognised ``source_unit`` leaves only ``conversion_factor`` in effect."""
    exotic_spec = {
        "type": "float",
        "source_unit": "exotic_unit",
        "conversion_factor": 0.5,
        "decimal_separator": ".",
    }
    converted = convert_value("200", "kcal", exotic_spec, module="nutrition")
    deviation = abs(converted - 100.0)
    assert deviation < 0.001


def test_build_row_source_unit_without_module_no_factor():
    """Without a module, ``source_unit`` has no effect (backward compatibility)."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}
    unchanged = convert_value("4184", "kcal", kj_spec, module=None)
    deviation = abs(unchanged - 4184.0)
    assert deviation < 0.01


def test_iter_csv_dict_rows_full_file():
    """Every data line of the text becomes one dict keyed by the header names."""
    sample = "a;b\n1;2\n3;4\n"
    expected = [
        {"a": "1", "b": "2"},
        {"a": "3", "b": "4"},
    ]
    collected = [*iter_csv_dict_rows(sample, ";", has_header=True)]
    assert collected == expected


def test_build_row_after_mapping():
    """Mapped columns are renamed to target fields and converted per their spec."""
    source_row = {"Datum": "01.01.2024", "kj": "4200"}
    field_mapping = {"Datum": "date", "kj": "kcal"}
    conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy"},
        "kcal": {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."},
    }

    mapped = build_row_after_mapping(
        source_row, field_mapping, conversions, module="nutrition"
    )

    assert mapped["date"].month == 1
    assert mapped["kcal"] is not None
    expected_kcal = 4200 * 0.239
    assert abs(float(mapped["kcal"]) - expected_kcal) < 0.02


def test_build_type_conversions_kj_header_sets_source_unit():
    """A "kJ" column mapped to ``kcal`` gets ``source_unit`` "kj" in its spec."""
    field_mapping = {"kJ": "kcal", "Datum": "date"}
    conversions = build_type_conversions_for_mapping("nutrition", field_mapping, None)
    kcal_spec = conversions["kcal"]
    assert kcal_spec.get("source_unit") == "kj"


def test_build_row_fddb_raw_header_keys_match_normalized_template():
    """FDDB: the row carries the German headings while the mapping uses normalised keys."""
    source_row = {
        "Datum Tag Monat Jahr Stunde Minute": "01.01.2024 8:30",
        "kJ": "42000",
        "Fett (g)": "50",
        "KH (g)": "200",
        "Protein (g)": "100",
    }
    field_mapping = {
        "datum_tag_monat_jahr_stunde_minute": "date",
        "kj": "kcal",
        "fett_g": "fat_g",
        "kh_g": "carbs_g",
        "protein_g": "protein_g",
    }
    conversions = {
        "date": {
            "type": "date",
            "format": "dd.mm.yyyy HH:MM",
            "extract": "date_only",
        },
        "kcal": {"type": "float", "source_unit": "kj", "decimal_separator": ","},
        "fat_g": {"type": "float", "decimal_separator": ","},
        "carbs_g": {"type": "float", "decimal_separator": ","},
        "protein_g": {"type": "float", "decimal_separator": ","},
    }

    mapped = build_row_after_mapping(
        source_row, field_mapping, conversions, module="nutrition"
    )

    assert mapped["date"].year == 2024
    assert mapped["date"].month == 1
    assert mapped["date"].day == 1

    assert mapped["kcal"] is not None
    expected_kcal = 42000 / 4.184
    assert abs(float(mapped["kcal"]) - expected_kcal) < 0.1


def test_build_row_apple_workout_elevation_header_prefix_matches_shorter_mapping_key():
    """Apple Workouts: "Aufgestiegene Höhe (m)" normalises differently from the manual key "aufgestiegene Höhe"."""
    source_row = {
        "Aufgestiegene Höhe (m)": "47.13",
        "Workout Type": "Outdoor Spaziergang",
    }
    field_mapping = {
        "aufgestiegene Höhe": "stola",
        "Workout Type": "activity_type",
    }
    conversions = {"stola": {"type": "string"}}

    mapped = build_row_after_mapping(
        source_row, field_mapping, conversions, module="activity"
    )

    assert mapped.get("stola") == "47.13"
    assert mapped.get("activity_type") == "Outdoor Spaziergang"


def test_convert_date_ddmm_with_seconds():
    """A value carrying seconds still parses although the format names only HH:MM."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"}
    parsed = convert_value("15.01.2024 14:30:00", "date", date_spec)
    assert parsed.month == 1
    assert parsed.day == 15


def test_float_decimal_separator_auto_eu_us():
    """"auto" handles EU and US grouping as well as a lone decimal comma."""
    cases = (
        ("1.234,56", 1234.56),
        ("1,234.56", 1234.56),
        ("1234,5", 1234.5),
    )
    for raw, expected in cases:
        # A fresh spec dict per call, so no state is shared between conversions.
        value = convert_value(raw, "x", {"type": "float", "decimal_separator": "auto"})
        assert abs(value - expected) < 1e-9


def test_float_flexible_falls_back_to_auto():
    """With ``flexible`` set, a dot-decimal value parses despite the comma separator setting."""
    comma_spec = {"type": "float", "decimal_separator": ",", "flexible": True}
    value = convert_value("1234.56", "x", comma_spec)
    deviation = abs(value - 1234.56)
    assert deviation < 1e-9


def test_date_flexible_iso_while_primary_ddmm():
    """With ``flexible`` set, ISO and dd.mm.yyyy inputs for the same day compare equal."""
    date_spec = {
        "type": "date",
        "format": "dd.mm.yyyy",
        "flexible": True,
        "extract": "date_only",
    }
    from_iso = convert_value("2024-03-15", "d", date_spec)
    from_primary = convert_value("15.03.2024", "d", date_spec)
    assert from_iso == from_primary


def test_date_extra_formats_without_days_in_name():
    """An extra pattern listed under ``formats`` parses a value the primary format does not match."""
    date_spec = {
        "type": "date",
        "format": "yyyy-mm-dd",
        "formats": ["%d.%m.%Y"],
        "extract": "date_only",
    }
    parsed = convert_value("08.04.2026", "d", date_spec)
    assert parsed.day == 8


def test_int_flexible_thousands():
    """A flexible int reads "1.234" as 1234."""
    int_spec = {"type": "int", "flexible": True}
    value = convert_value("1.234", "n", int_spec)
    assert value == 1234


def test_build_row_after_mapping_column_order_independent():
    """The mapped result does not depend on the column order of the input row."""
    field_mapping = {"Spalte B": "resting_hr", "Spalte A": "date"}
    conversions = {
        "date": {"type": "date", "format": "yyyy-mm-dd", "flexible": True},
        "resting_hr": {"type": "int", "flexible": True},
    }
    # Same content, opposite key insertion order.
    row_a_first = {"Spalte A": "2026-01-15", "Spalte B": "58"}
    row_b_first = {"Spalte B": "58", "Spalte A": "2026-01-15"}

    mapped_a_first = build_row_after_mapping(
        row_a_first, field_mapping, conversions, module="vitals_baseline"
    )
    mapped_b_first = build_row_after_mapping(
        row_b_first, field_mapping, conversions, module="vitals_baseline"
    )

    assert mapped_a_first == mapped_b_first
    assert mapped_a_first["resting_hr"] == 58


def test_omron_report_date_formats_without_flexible_flag():
    """Omron "Bericht" export: English month abbreviation plus German month name; the template often only has dd.mm.yyyy."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy"}
    cases = (
        ("10 Apr. 2026", "2026-04-10"),
        ("31 März 2026", "2026-03-31"),
        ("11 März 2026", "2026-03-11"),
    )
    for raw, expected_iso in cases:
        parsed = convert_value(raw, "measured_date", date_spec)
        assert parsed.isoformat() == expected_iso


def test_int_flexible_german_decimal_rounds():
    """Apple-DE: HRV/SpO2 as "37,26" / "95,22" — must not become 3726 by joining the digits."""
    int_spec = {"type": "int", "flexible": True}
    cases = (
        ("37,26", "hrv", 37),
        ("95,22", "spo2", 95),
    )
    for raw, field_name, expected in cases:
        assert convert_value(raw, field_name, int_spec) == expected


def test_datetime_flexible():
    """A flexible datetime spec with an ISO primary format parses a dd.mm.yyyy timestamp."""
    datetime_spec = {
        "type": "datetime",
        "format": "yyyy-mm-dd HH:MM:SS",
        "flexible": True,
    }
    parsed = convert_value("15.01.2024 14:30:00", "t", datetime_spec)
    assert parsed.month == 1
    assert parsed.day == 15
    assert parsed.hour == 14


def test_apple_workout_datetime_without_seconds_iso_not_swapped():
    """Apple export: 2026-04-09 16:48 has no :SS; day and month must not be swapped (09↔04)."""
    datetime_spec = {
        "type": "datetime",
        "format": "yyyy-mm-dd HH:MM:SS",
        "extract": "date_and_time",
    }
    parsed = convert_value(
        "2026-04-09 16:48", "start_time", datetime_spec, module="activity"
    )

    assert parsed.year == 2026
    assert parsed.month == 4
    assert parsed.day == 9

    assert parsed.hour == 16
    assert parsed.minute == 48


def test_iso_yyyy_mm_dd_dateutil_fallback_not_dayfirst_swapped():
    """dateutil only: an ISO date must not get day and month swapped by the default dayfirst."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy", "flexible": True}
    parsed = convert_value("2026-04-09", "d", date_spec)
    assert parsed.month == 4
    assert parsed.day == 9


def test_activity_kcal_active_kj_source_unit_to_kcal():
    """Apple-DE: active energy arrives as kJ; the DB field is kcal."""
    kj_spec = {
        "type": "float",
        "decimal_separator": ".",
        "flexible": True,
        "source_unit": "kj",
    }
    converted = convert_value("4184", "kcal_active", kj_spec, module="activity")
    deviation = abs(converted - 1000.0)
    assert deviation < 0.02


def test_source_unit_choices_include_custom_at_end():
    """The last choice for ``protein_g`` is "custom" and "mg" is among the choices."""
    choices = source_unit_choices_for_field("nutrition", "protein_g")
    last_choice = choices[-1]
    assert last_choice["id"] == "custom"
    assert any(choice["id"] == "mg" for choice in choices)