- Introduced a new utility function `canonical_csv_header_label` to standardize CSV header labels, improving consistency in field mapping.
- Updated the `_lookup_db_field` function to support prefix matching for longer manual keys, enhancing the accuracy of field resolution.
- Added tests to validate handling of non-breaking space characters in CSV headers and ensure correct mapping to normalized keys, improving robustness of CSV parsing.
337 lines
13 KiB
Python
337 lines
13 KiB
Python
"""Tests für CSV-Parser Foundation (Issue #21)."""
|
|
|
|
import pytest
|
|
|
|
from csv_parser.core import (
|
|
decode_raw_bytes,
|
|
sniff_delimiter,
|
|
parse_csv_sample,
|
|
column_signature,
|
|
headers_signature_match_score,
|
|
headers_signature_rank_metrics,
|
|
get_csv_import_limits,
|
|
iter_csv_dict_rows,
|
|
)
|
|
from csv_parser.field_units import source_unit_choices_for_field
|
|
from csv_parser.mapping_suggest import build_type_conversions_for_mapping
|
|
from csv_parser.type_converter import convert_value, build_row_after_mapping
|
|
|
|
|
|
def test_decode_bom_utf8():
    """Decoded text must not begin with a BOM and must keep the header row."""
    # The literal U+FEFF combined with the "utf-8-sig" codec (which prepends
    # its own BOM when encoding) means the payload starts with two BOM
    # sequences; the startswith check only passes if neither one survives
    # at the beginning of the decoded text.
    payload = "\ufeffa;b;c\n1;2;3".encode("utf-8-sig")
    decoded = decode_raw_bytes(payload)
    assert not decoded.startswith("\ufeff")
    assert "a;b;c" in decoded
|
|
|
|
|
|
def test_sniff_delimiter():
    """The delimiter is detected for a semicolon line and for a comma line."""
    samples = (
        ("a;b;c;d", ";"),
        ("a,b,c", ","),
    )
    for line, expected in samples:
        assert sniff_delimiter(line) == expected
|
|
|
|
|
|
def test_parse_csv_sample_header():
    """The first line becomes the header list; data cells are keyed by header."""
    sample = "Date;kcal\n2024-01-01;2000\n"
    columns, data_rows, used_delimiter = parse_csv_sample(
        sample, delimiter=";", max_data_rows=3
    )
    assert columns == ["Date", "kcal"]
    assert used_delimiter == ";"
    first_row = data_rows[0]
    assert first_row["Date"] == "2024-01-01"
    assert first_row["kcal"] == "2000"
|
|
|
|
|
|
def test_parse_csv_sample_nbsp_in_header_matches_normal_space_key():
    """Excel/Apple: an NBSP (U+00A0) in a column name gives the same key as a regular space."""
    sample = "Aufgestiegene\u00a0Höhe (m);Wert\n12;3\n"
    # The delimiter is returned as well but is irrelevant for this check.
    columns, data_rows, _ = parse_csv_sample(sample, delimiter=";", max_data_rows=3)
    assert columns == ["Aufgestiegene Höhe (m)", "Wert"]
    first_row = data_rows[0]
    assert first_row["Aufgestiegene Höhe (m)"] == "12"
|
|
|
|
|
|
def test_iter_csv_dict_rows_nbsp_header_canonical():
    """An NBSP inside a header cell shows up as a regular space in the row keys."""
    source = "col\u00a0one;b\n1;2\n"
    row_iter = iter_csv_dict_rows(source, ";", has_header=True)
    assert list(row_iter) == [{"col one": "1", "b": "2"}]
|
|
|
|
|
|
def test_build_row_field_mapping_space_vs_nbsp_in_csv_header():
    """Template (dialog) uses a regular space, the CSV an NBSP — the mapping must still apply."""
    mapped = build_row_after_mapping(
        {"Aufgestiegene\u00a0Höhe (m)": "10"},
        {"Aufgestiegene Höhe (m)": "stola"},
        {"stola": {"type": "float", "decimal_separator": ".", "flexible": True}},
        module="activity",
    )
    assert mapped.get("stola") == 10.0
|
|
|
|
|
|
def test_column_signature_sorted_unique():
    """Mixed-case duplicates collapse to a sorted, lower-case, duplicate-free list."""
    assert column_signature(["B", "a", "a"]) == ["a", "b"]
|
|
|
|
|
|
def test_jaccard():
    """Two shared names out of three distinct ones give a match score of 2/3."""
    smaller = column_signature(["Date", "Calories"])
    larger = column_signature(["Date", "Calories", "Fat"])
    score = headers_signature_match_score(smaller, larger)
    assert score == pytest.approx(2 / 3)
|
|
|
|
|
|
def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns present in the CSV → recall 1.0; Jaccard is lower with many extra columns."""
    csv_columns = ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    template_columns = ["d", "e", "f"]
    metrics = headers_signature_rank_metrics(
        column_signature(csv_columns),
        column_signature(template_columns),
    )

    exact_expectations = (
        ("confidence", 1.0),
        ("template_recall", 1.0),
        ("columns_matched", 3),
        ("columns_in_template", 3),
    )
    for key, expected in exact_expectations:
        assert metrics[key] == expected

    # 3 shared names out of 8 distinct ones overall.
    assert metrics["jaccard"] == pytest.approx(3 / 8)
|
|
|
|
|
|
def test_get_csv_import_limits_default():
    """Passing None yields limits whose max_rows_per_file is 50,000."""
    limits = get_csv_import_limits(None)
    assert limits["max_rows_per_file"] == 50_000
|
|
|
|
|
|
def test_convert_date_and_kcal_factor():
    """A dd.mm.yyyy date is parsed and a float value is scaled by conversion_factor."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy"}
    parsed = convert_value("15.01.2024", "date", date_spec)
    assert (parsed.year, parsed.month, parsed.day) == (2024, 1, 15)

    kcal_spec = {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."}
    scaled = convert_value("8000", "kcal", kcal_spec)
    expected = 8000 * 0.239
    assert abs(scaled - expected) < 0.01
|
|
|
|
|
|
def test_convert_kcal_via_source_unit_kj():
    """4184 with source_unit "kj" comes out as roughly 1000 for the nutrition kcal field."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}
    converted = convert_value("4184", "kcal", kj_spec, module="nutrition")
    deviation = abs(converted - 1000.0)
    assert deviation < 0.05
|
|
|
|
|
|
def test_apple_health_long_decimal_dot_preserved():
    """Apple: average HR and energy with many decimal places; the dot is the decimal separator."""
    avg_hr = convert_value(
        "96.8749374730629",
        "hr_avg",
        {"type": "int", "flexible": True},
        module="activity",
    )
    assert avg_hr == 97

    resting_spec = {
        "type": "float",
        "decimal_separator": ".",
        "flexible": True,
        "source_unit": "kj",
    }
    resting_kcal = convert_value(
        "596.6689047323086", "kcal_resting", resting_spec, module="activity"
    )
    assert 140.0 < resting_kcal < 145.0
|
|
|
|
|
|
def test_parse_float_auto_us_thousands_comma():
    """«12,345» without a decimal point can still be read as a thousands group."""

    def parse_auto(raw):
        # A fresh spec dict per call, so the two conversions share no state.
        return convert_value(raw, "x", {"type": "float", "decimal_separator": "auto"})

    plain = parse_auto("12345")
    assert plain == 12345.0

    grouped = parse_auto("12,345")
    assert abs(grouped - 12345.0) < 0.01
|
|
|
|
|
|
def test_convert_protein_kg_to_g():
    """0.1 with source_unit "kg" comes out as 100 for the nutrition protein_g field."""
    kg_spec = {"type": "float", "source_unit": "kg", "decimal_separator": "."}
    grams = convert_value("0.1", "protein_g", kg_spec, module="nutrition")
    deviation = abs(grams - 100.0)
    assert deviation < 0.001
|
|
|
|
|
|
def test_convert_custom_source_unit_only_conversion_factor():
    """Non-predefined conversion: conversion_factor (optionally with source_unit: custom)."""
    custom_spec = {
        "type": "float",
        "source_unit": "custom",
        "conversion_factor": 2.5,
        "decimal_separator": ".",
    }
    converted = convert_value("100", "kcal", custom_spec, module="nutrition")
    # 100 * 2.5
    assert abs(converted - 250.0) < 0.001
|
|
|
|
|
|
def test_convert_unknown_source_unit_uses_conversion_factor_only():
    """With an unrecognised source_unit the result equals value * conversion_factor."""
    exotic_spec = {
        "type": "float",
        "source_unit": "exotic_unit",
        "conversion_factor": 0.5,
        "decimal_separator": ".",
    }
    converted = convert_value("200", "kcal", exotic_spec, module="nutrition")
    # 200 * 0.5
    assert abs(converted - 100.0) < 0.001
|
|
|
|
|
|
def test_build_row_source_unit_without_module_no_factor():
    """Without a module, source_unit has no effect (backward compatibility)."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}
    unchanged = convert_value("4184", "kcal", kj_spec, module=None)
    deviation = abs(unchanged - 4184.0)
    assert deviation < 0.01
|
|
|
|
|
|
def test_iter_csv_dict_rows_full_file():
    """Every data line after the header is yielded as one dict keyed by header."""
    source = "a;b\n1;2\n3;4\n"
    expected = [
        {"a": "1", "b": "2"},
        {"a": "3", "b": "4"},
    ]
    row_iter = iter_csv_dict_rows(source, ";", has_header=True)
    assert list(row_iter) == expected
|
|
|
|
|
|
def test_build_row_after_mapping():
    """CSV columns land under their mapped field names, converted per type spec."""
    source_row = {"Datum": "01.01.2024", "kj": "4200"}
    field_map = {"Datum": "date", "kj": "kcal"}
    conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy"},
        "kcal": {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."},
    }

    result = build_row_after_mapping(source_row, field_map, conversions, module="nutrition")

    assert result["date"].month == 1
    assert result["kcal"] is not None
    expected_kcal = 4200 * 0.239
    assert abs(float(result["kcal"]) - expected_kcal) < 0.02
|
|
|
|
|
|
def test_build_type_conversions_kj_header_sets_source_unit():
    """A "kJ" header mapped to kcal produces a kcal spec with source_unit "kj"."""
    field_map = {"kJ": "kcal", "Datum": "date"}
    conversions = build_type_conversions_for_mapping("nutrition", field_map, None)
    kcal_spec = conversions["kcal"]
    assert kcal_spec.get("source_unit") == "kj"
|
|
|
|
|
|
def test_build_row_fddb_raw_header_keys_match_normalized_template():
    """FDDB: DictReader yields the German header text, the seed uses the normalized key."""
    comma_float = {"type": "float", "decimal_separator": ","}
    conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"},
        "kcal": {
            "type": "float",
            "source_unit": "kj",
            "decimal_separator": ",",
        },
        # Separate copies so the three fields do not share one spec object.
        "fat_g": dict(comma_float),
        "carbs_g": dict(comma_float),
        "protein_g": dict(comma_float),
    }
    field_map = {
        "datum_tag_monat_jahr_stunde_minute": "date",
        "kj": "kcal",
        "fett_g": "fat_g",
        "kh_g": "carbs_g",
        "protein_g": "protein_g",
    }
    source_row = {
        "Datum Tag Monat Jahr Stunde Minute": "01.01.2024 8:30",
        "kJ": "42000",
        "Fett (g)": "50",
        "KH (g)": "200",
        "Protein (g)": "100",
    }

    result = build_row_after_mapping(source_row, field_map, conversions, module="nutrition")

    assert result["date"].year == 2024
    assert result["date"].month == 1
    assert result["date"].day == 1
    assert result["kcal"] is not None
    expected_kcal = 42000 / 4.184
    assert abs(float(result["kcal"]) - expected_kcal) < 0.1
|
|
|
|
|
|
def test_build_row_apple_workout_elevation_header_prefix_matches_shorter_mapping_key():
    """Apple Workouts: "Aufgestiegene Höhe (m)" normalizes differently from a manually entered "aufgestiegene Höhe"."""
    source_row = {
        "Aufgestiegene Höhe (m)": "47.13",
        "Workout Type": "Outdoor Spaziergang",
    }
    field_map = {
        "aufgestiegene Höhe": "stola",
        "Workout Type": "activity_type",
    }
    conversions = {"stola": {"type": "string"}}

    result = build_row_after_mapping(source_row, field_map, conversions, module="activity")

    assert result.get("stola") == "47.13"
    assert result.get("activity_type") == "Outdoor Spaziergang"
|
|
|
|
|
|
def test_convert_date_ddmm_with_seconds():
    """A value carrying seconds still parses although the format only names HH:MM."""
    spec = {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"}
    parsed = convert_value("15.01.2024 14:30:00", "date", spec)
    assert (parsed.month, parsed.day) == (1, 15)
|
|
|
|
|
|
def test_float_decimal_separator_auto_eu_us():
    """With decimal_separator "auto", EU and US notations resolve to the same number."""
    cases = (
        ("1.234,56", 1234.56),
        ("1,234.56", 1234.56),
        ("1234,5", 1234.5),
    )
    for raw, expected in cases:
        # Build a new spec for each case so the conversions stay independent.
        parsed = convert_value(raw, "x", {"type": "float", "decimal_separator": "auto"})
        assert abs(parsed - expected) < 1e-9
|
|
|
|
|
|
def test_float_flexible_falls_back_to_auto():
    """A dot-decimal value still parses when the spec names "," but is flexible."""
    flexible_spec = {"type": "float", "decimal_separator": ",", "flexible": True}
    parsed = convert_value("1234.56", "x", flexible_spec)
    assert abs(parsed - 1234.56) < 1e-9
|
|
|
|
|
|
def test_date_flexible_iso_while_primary_ddmm():
    """With flexible set, an ISO date and its dd.mm.yyyy form convert to equal values."""
    spec = {
        "type": "date",
        "format": "dd.mm.yyyy",
        "flexible": True,
        "extract": "date_only",
    }
    from_iso = convert_value("2024-03-15", "d", spec)
    from_primary = convert_value("15.03.2024", "d", spec)
    assert from_iso == from_primary
|
|
|
|
|
|
def test_date_extra_formats_without_days_in_name():
    """A value matching only the extra "formats" entry is still parsed."""
    spec = {
        "type": "date",
        "format": "yyyy-mm-dd",
        "formats": ["%d.%m.%Y"],
        "extract": "date_only",
    }
    parsed = convert_value("08.04.2026", "d", spec)
    assert parsed.day == 8
|
|
|
|
|
|
def test_int_flexible_thousands():
    """For a flexible int, "1.234" is read as 1234."""
    parsed = convert_value("1.234", "n", {"type": "int", "flexible": True})
    assert parsed == 1234
|
|
|
|
|
|
def test_build_row_after_mapping_column_order_independent():
    """The mapped result does not depend on the key order of the CSV row dict."""
    field_map = {"Spalte B": "resting_hr", "Spalte A": "date"}
    conversions = {
        "date": {"type": "date", "format": "yyyy-mm-dd", "flexible": True},
        "resting_hr": {"type": "int", "flexible": True},
    }
    a_first = {"Spalte A": "2026-01-15", "Spalte B": "58"}
    b_first = {"Spalte B": "58", "Spalte A": "2026-01-15"}

    from_a_first, from_b_first = [
        build_row_after_mapping(row, field_map, conversions, module="vitals_baseline")
        for row in (a_first, b_first)
    ]

    assert from_a_first == from_b_first
    assert from_a_first["resting_hr"] == 58
|
|
|
|
|
|
def test_omron_report_date_formats_without_flexible_flag():
    """Omron "Bericht" (report) export: English month abbreviation + German month name; the template often only has dd.mm.yyyy."""
    spec = {"type": "date", "format": "dd.mm.yyyy"}
    cases = (
        ("10 Apr. 2026", "2026-04-10"),
        ("31 März 2026", "2026-03-31"),
        ("11 März 2026", "2026-03-11"),
    )
    for raw, expected_iso in cases:
        parsed = convert_value(raw, "measured_date", spec)
        assert parsed.isoformat() == expected_iso
|
|
|
|
|
|
def test_int_flexible_german_decimal_rounds():
    """Apple-DE: HRV/SpO2 as «37,26» / «95,22» — must not turn into 3726 by concatenating digits."""
    spec = {"type": "int", "flexible": True}
    cases = (
        ("37,26", "hrv", 37),
        ("95,22", "spo2", 95),
    )
    for raw, field, expected in cases:
        assert convert_value(raw, field, spec) == expected
|
|
|
|
|
|
def test_datetime_flexible():
    """A dd.mm.yyyy timestamp parses under a flexible ISO-style datetime spec."""
    spec = {"type": "datetime", "format": "yyyy-mm-dd HH:MM:SS", "flexible": True}
    parsed = convert_value("15.01.2024 14:30:00", "t", spec)
    assert (parsed.month, parsed.day, parsed.hour) == (1, 15, 14)
|
|
|
|
|
|
def test_apple_workout_datetime_without_seconds_iso_not_swapped():
    """Apple export: 2026-04-09 16:48 — no :SS part; no day-first misparse (09↔04)."""
    spec = {
        "type": "datetime",
        "format": "yyyy-mm-dd HH:MM:SS",
        "extract": "date_and_time",
    }
    parsed = convert_value("2026-04-09 16:48", "start_time", spec, module="activity")
    assert (parsed.year, parsed.month, parsed.day) == (2026, 4, 9)
    assert (parsed.hour, parsed.minute) == (16, 48)
|
|
|
|
|
|
def test_iso_yyyy_mm_dd_dateutil_fallback_not_dayfirst_swapped():
    """dateutil only: an ISO date must not get day and month swapped by a default dayfirst."""
    spec = {"type": "date", "format": "dd.mm.yyyy", "flexible": True}
    parsed = convert_value("2026-04-09", "d", spec)
    assert (parsed.month, parsed.day) == (4, 9)
|
|
|
|
|
|
def test_activity_kcal_active_kj_source_unit_to_kcal():
    """Apple-DE: active energy arrives as kJ; the DB field is kcal."""
    kj_spec = {
        "type": "float",
        "decimal_separator": ".",
        "flexible": True,
        "source_unit": "kj",
    }
    converted = convert_value("4184", "kcal_active", kj_spec, module="activity")
    deviation = abs(converted - 1000.0)
    assert deviation < 0.02
|
|
|
|
|
|
def test_source_unit_choices_include_custom_at_end():
    """The unit choices for nutrition/protein_g end with "custom" and contain "mg"."""
    choices = source_unit_choices_for_field("nutrition", "protein_g")
    last_choice = choices[-1]
    assert last_choice["id"] == "custom"
    # Generator membership stops at the first match, just like any() would.
    assert "mg" in (choice["id"] for choice in choices)
|