- Updated the CSV import architecture to clarify the distinction between import and data layer responsibilities, as outlined in the new section of ARCHITECTURE.md. - Enhanced the build_row_after_mapping function to include module-specific context for improved data processing. - Introduced source unit options in the admin CSV template editor to facilitate user-defined conversions, improving flexibility in handling various data formats. - Added new tests to validate the handling of source units and ensure accurate conversions during CSV imports. - Updated module definitions to include unit specifications for nutritional and activity data fields, enhancing data integrity.
203 lines
6.8 KiB
Python
203 lines
6.8 KiB
Python
"""Tests für CSV-Parser Foundation (Issue #21)."""
|
|
|
|
import pytest
|
|
|
|
from csv_parser.core import (
|
|
decode_raw_bytes,
|
|
sniff_delimiter,
|
|
parse_csv_sample,
|
|
column_signature,
|
|
headers_signature_match_score,
|
|
headers_signature_rank_metrics,
|
|
get_csv_import_limits,
|
|
iter_csv_dict_rows,
|
|
)
|
|
from csv_parser.mapping_suggest import build_type_conversions_for_mapping
|
|
from csv_parser.type_converter import convert_value, build_row_after_mapping
|
|
|
|
|
|
def test_decode_bom_utf8():
    """Check that decoding BOM-prefixed UTF-8 bytes yields text without a leading BOM."""
    # The literal already starts with U+FEFF and the "utf-8-sig" codec
    # prepends one more BOM on encode, so the payload carries two of them.
    payload = "\ufeffa;b;c\n1;2;3".encode("utf-8-sig")

    decoded = decode_raw_bytes(payload)

    has_leading_bom = decoded.startswith("\ufeff")
    assert not has_leading_bom
    assert "a;b;c" in decoded
|
|
|
|
|
|
def test_sniff_delimiter():
    """Check that semicolon and comma samples each report their own delimiter."""
    semicolon_guess = sniff_delimiter("a;b;c;d")
    assert semicolon_guess == ";"

    comma_guess = sniff_delimiter("a,b,c")
    assert comma_guess == ","
|
|
|
|
|
|
def test_parse_csv_sample_header():
    """Check headers, delimiter and the first data row of a two-column sample."""
    sample = "Date;kcal\n2024-01-01;2000\n"

    parsed = parse_csv_sample(sample, delimiter=";", max_data_rows=3)
    headers, rows, delim = parsed

    assert headers == ["Date", "kcal"]
    assert delim == ";"

    # Values are expected to stay raw strings at this stage.
    first_row = rows[0]
    assert first_row["Date"] == "2024-01-01"
    assert first_row["kcal"] == "2000"
|
|
|
|
|
|
def test_column_signature_sorted_unique():
    """Check that the signature of ["B", "a", "a"] is the lowercase, sorted, de-duplicated list."""
    raw_headers = ["B", "a", "a"]
    expected_signature = ["a", "b"]

    assert column_signature(raw_headers) == expected_signature
|
|
|
|
|
|
def test_jaccard():
    """Check that two shared columns out of three distinct ones score 2/3."""
    two_columns = column_signature(["Date", "Calories"])
    three_columns = column_signature(["Date", "Calories", "Fat"])

    score = headers_signature_match_score(two_columns, three_columns)

    assert score == pytest.approx(2 / 3)
|
|
|
|
|
|
def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns present in the CSV -> recall 1.0; Jaccard is lower with many extra columns."""
    csv_headers = ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    csv_signature = column_signature(csv_headers)
    template_signature = column_signature(["d", "e", "f"])

    metrics = headers_signature_rank_metrics(csv_signature, template_signature)

    # Exact-valued metrics, checked in the same order as before.
    exact_expectations = (
        ("confidence", 1.0),
        ("template_recall", 1.0),
        ("columns_matched", 3),
        ("columns_in_template", 3),
    )
    for metric_name, expected in exact_expectations:
        assert metrics[metric_name] == expected

    # 3 shared columns out of 8 distinct ones.
    assert metrics["jaccard"] == pytest.approx(3 / 8)
|
|
|
|
|
|
def test_get_csv_import_limits_default():
    """Check that passing None yields a row limit of 50,000 per file."""
    limits = get_csv_import_limits(None)

    assert limits["max_rows_per_file"] == 50_000
|
|
|
|
|
|
def test_convert_date_and_kcal_factor():
    """Check dd.mm.yyyy date parsing and float conversion with an explicit conversion factor."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy"}
    parsed = convert_value("15.01.2024", "date", date_spec)
    assert (parsed.year, parsed.month, parsed.day) == (2024, 1, 15)

    kcal_spec = {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."}
    converted = convert_value("8000", "kcal", kcal_spec)
    expected = 8000 * 0.239
    assert abs(converted - expected) < 0.01
|
|
|
|
|
|
def test_convert_kcal_via_source_unit_kj():
    """Check that "4184" with source_unit "kj" becomes roughly 1000 for the nutrition kcal field."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}

    kcal = convert_value("4184", "kcal", kj_spec, module="nutrition")

    deviation = abs(kcal - 1000.0)
    assert deviation < 0.05
|
|
|
|
|
|
def test_convert_protein_kg_to_g():
    """Check that "0.1" with source_unit "kg" becomes 100 for the nutrition protein_g field."""
    kg_spec = {"type": "float", "source_unit": "kg", "decimal_separator": "."}

    grams = convert_value("0.1", "protein_g", kg_spec, module="nutrition")

    deviation = abs(grams - 100.0)
    assert deviation < 0.001
|
|
|
|
|
|
def test_build_row_source_unit_without_module_no_factor():
    """Without a module, source_unit has no effect (backward compatibility)."""
    kj_spec = {"type": "float", "source_unit": "kj", "decimal_separator": "."}

    unconverted = convert_value("4184", "kcal", kj_spec, module=None)

    # The input number is expected to pass through unscaled.
    deviation = abs(unconverted - 4184.0)
    assert deviation < 0.01
|
|
|
|
|
|
def test_iter_csv_dict_rows_full_file():
    """Check that iterating a header plus two data lines yields two dicts keyed by header."""
    csv_text = "a;b\n1;2\n3;4\n"

    collected = [*iter_csv_dict_rows(csv_text, ";", has_header=True)]

    expected_rows = [{"a": "1", "b": "2"}, {"a": "3", "b": "4"}]
    assert collected == expected_rows
|
|
|
|
|
|
def test_build_row_after_mapping():
    """Check that a mapped row gets a parsed date and a kcal value scaled by the conversion factor."""
    source_row = {"Datum": "01.01.2024", "kj": "4200"}
    field_map = {"Datum": "date", "kj": "kcal"}
    conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy"},
        "kcal": {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."},
    }

    result = build_row_after_mapping(source_row, field_map, conversions, module="nutrition")

    assert result["date"].month == 1

    kcal = result["kcal"]
    assert kcal is not None
    expected_kcal = 4200 * 0.239
    assert abs(float(kcal) - expected_kcal) < 0.02
|
|
|
|
|
|
def test_build_type_conversions_kj_header_sets_source_unit():
    """Check that mapping a "kJ" header to kcal yields a kcal spec with source_unit "kj"."""
    header_map = {"kJ": "kcal", "Datum": "date"}

    conversions = build_type_conversions_for_mapping("nutrition", header_map, None)

    kcal_spec = conversions["kcal"]
    assert kcal_spec.get("source_unit") == "kj"
|
|
|
|
|
|
def test_build_row_fddb_raw_header_keys_match_normalized_template():
    """FDDB: DictReader yields the German header text while the seed uses the normalized key."""
    # Row keyed by the raw CSV headers.
    raw_row = {
        "Datum Tag Monat Jahr Stunde Minute": "01.01.2024 8:30",
        "kJ": "42000",
        "Fett (g)": "50",
        "KH (g)": "200",
        "Protein (g)": "100",
    }
    # Mapping keyed by the lowercase/underscore form of those headers.
    normalized_map = {
        "datum_tag_monat_jahr_stunde_minute": "date",
        "kj": "kcal",
        "fett_g": "fat_g",
        "kh_g": "carbs_g",
        "protein_g": "protein_g",
    }
    conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"},
        "kcal": {"type": "float", "source_unit": "kj", "decimal_separator": ","},
        # One fresh spec dict per macro field, in the original key order.
        **{
            macro_field: {"type": "float", "decimal_separator": ","}
            for macro_field in ("fat_g", "carbs_g", "protein_g")
        },
    }

    result = build_row_after_mapping(raw_row, normalized_map, conversions, module="nutrition")

    parsed_date = result["date"]
    assert (parsed_date.year, parsed_date.month, parsed_date.day) == (2024, 1, 1)

    kcal = result["kcal"]
    assert kcal is not None
    expected_kcal = 42000 / 4.184
    assert abs(float(kcal) - expected_kcal) < 0.1
|
|
|
|
|
|
def test_convert_date_ddmm_with_seconds():
    """Check that a value with a seconds component still parses under a "dd.mm.yyyy HH:MM" spec."""
    spec = {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"}

    parsed = convert_value("15.01.2024 14:30:00", "date", spec)

    assert (parsed.month, parsed.day) == (1, 15)
|
|
|
|
|
|
def test_float_decimal_separator_auto_eu_us():
    """Check that decimal_separator "auto" handles EU-style and US-style number strings."""
    samples = (
        ("1.234,56", 1234.56),  # dot groups thousands, comma is the decimal mark
        ("1,234.56", 1234.56),  # comma groups thousands, dot is the decimal mark
        ("1234,5", 1234.5),  # lone comma as decimal mark
    )
    for raw_text, expected in samples:
        # A fresh spec dict per call, as in the original one-liners.
        value = convert_value(raw_text, "x", {"type": "float", "decimal_separator": "auto"})
        assert abs(value - expected) < 1e-9
|
|
|
|
|
|
def test_float_flexible_falls_back_to_auto():
    """Check that a dot-decimal value parses under a comma-separator spec when flexible is set."""
    comma_spec = {"type": "float", "decimal_separator": ",", "flexible": True}

    value = convert_value("1234.56", "x", comma_spec)

    assert abs(value - 1234.56) < 1e-9
|
|
|
|
|
|
def test_date_flexible_iso_while_primary_ddmm():
    """Check that ISO and dd.mm.yyyy spellings of one day convert to equal values under a flexible spec."""
    flexible_spec = {
        "type": "date",
        "format": "dd.mm.yyyy",
        "flexible": True,
        "extract": "date_only",
    }

    from_iso = convert_value("2024-03-15", "d", flexible_spec)
    from_primary = convert_value("15.03.2024", "d", flexible_spec)

    assert from_iso == from_primary
|
|
|
|
|
|
def test_date_extra_formats_without_days_in_name():
    """Check that a dd.mm.yyyy value parses when `formats` lists a matching strptime-style pattern."""
    # The primary format is ISO; the value below only fits the extra pattern.
    spec = {
        "type": "date",
        "format": "yyyy-mm-dd",
        "formats": ["%d.%m.%Y"],
        "extract": "date_only",
    }

    parsed = convert_value("08.04.2026", "d", spec)

    assert parsed.day == 8
|
|
|
|
|
|
def test_int_flexible_thousands():
    """Check that "1.234" converts to the integer 1234 under a flexible int spec."""
    flexible_int_spec = {"type": "int", "flexible": True}

    number = convert_value("1.234", "n", flexible_int_spec)

    assert number == 1234
|
|
|
|
|
|
def test_datetime_flexible():
    """Check that a dd.mm.yyyy timestamp parses under an ISO-style datetime spec when flexible is set."""
    iso_spec = {"type": "datetime", "format": "yyyy-mm-dd HH:MM:SS", "flexible": True}

    parsed = convert_value("15.01.2024 14:30:00", "t", iso_spec)

    assert (parsed.month, parsed.day, parsed.hour) == (1, 15, 14)
|