- Added new functions for calculating header signature recall and ranking metrics, improving the analysis of CSV templates. - Updated existing CSV analysis endpoints to utilize the new ranking metrics, enhancing the accuracy of template matching. - Refactored related code to replace Jaccard score calculations with the new metrics, providing a more comprehensive evaluation of CSV structure. - Improved documentation for new functions to clarify their purpose and usage in the context of CSV template analysis.
175 lines
5.6 KiB
Python
175 lines
5.6 KiB
Python
"""Tests für CSV-Parser Foundation (Issue #21)."""
|
|
|
|
import pytest
|
|
|
|
from csv_parser.core import (
|
|
decode_raw_bytes,
|
|
sniff_delimiter,
|
|
parse_csv_sample,
|
|
column_signature,
|
|
headers_signature_match_score,
|
|
headers_signature_rank_metrics,
|
|
get_csv_import_limits,
|
|
iter_csv_dict_rows,
|
|
)
|
|
from csv_parser.type_converter import convert_value, build_row_after_mapping
|
|
|
|
|
|
def test_decode_bom_utf8():
    """Decoded text must not begin with a BOM and must keep the header line."""
    # The literal already starts with U+FEFF, and the "utf-8-sig" codec
    # prepends a BOM of its own when encoding.
    payload = "\ufeffa;b;c\n1;2;3".encode("utf-8-sig")

    decoded = decode_raw_bytes(payload)

    has_leading_bom = decoded.startswith("\ufeff")
    assert not has_leading_bom
    assert "a;b;c" in decoded
|
|
|
|
|
|
def test_sniff_delimiter():
    """A semicolon line sniffs as ';' and a comma line sniffs as ','."""
    semicolon_line = "a;b;c;d"
    comma_line = "a,b,c"

    assert sniff_delimiter(semicolon_line) == ";"
    assert sniff_delimiter(comma_line) == ","
|
|
|
|
|
|
def test_parse_csv_sample_header():
    """Parsing a one-row sample yields headers, rows keyed by header, and the delimiter."""
    sample = "Date;kcal\n2024-01-01;2000\n"

    column_names, data_rows, used_delimiter = parse_csv_sample(
        sample, delimiter=";", max_data_rows=3
    )

    assert column_names == ["Date", "kcal"]
    assert used_delimiter == ";"

    # Cell values are compared as strings; no conversion is expected here.
    first_row = data_rows[0]
    assert first_row["Date"] == "2024-01-01"
    assert first_row["kcal"] == "2000"
|
|
|
|
|
|
def test_column_signature_sorted_unique():
    """Headers ["B", "a", "a"] produce the signature ["a", "b"]."""
    raw_headers = ["B", "a", "a"]
    expected_signature = ["a", "b"]

    assert column_signature(raw_headers) == expected_signature
|
|
|
|
|
|
def test_jaccard():
    """Two shared columns out of three distinct ones give a match score of 2/3."""
    smaller_sig = column_signature(["Date", "Calories"])
    larger_sig = column_signature(["Date", "Calories", "Fat"])

    score = headers_signature_match_score(smaller_sig, larger_sig)

    assert score == pytest.approx(2 / 3)
|
|
|
|
|
|
def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns are present in the CSV, so recall is 1.0; Jaccard is lower with many extra columns."""
    csv_headers = ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    template_headers = ["d", "e", "f"]

    metrics = headers_signature_rank_metrics(
        column_signature(csv_headers),
        column_signature(template_headers),
    )

    # Exact-valued metrics, checked in the same order as before.
    exact_expectations = (
        ("confidence", 1.0),
        ("template_recall", 1.0),
        ("columns_matched", 3),
        ("columns_in_template", 3),
    )
    for key, expected in exact_expectations:
        assert metrics[key] == expected

    # Three shared columns out of eight distinct ones.
    assert metrics["jaccard"] == pytest.approx(3 / 8)
|
|
|
|
|
|
def test_get_csv_import_limits_default():
    """Passing None yields limits whose max_rows_per_file is 50,000."""
    limits = get_csv_import_limits(None)

    assert limits["max_rows_per_file"] == 50_000
|
|
|
|
|
|
def test_convert_date_and_kcal_factor():
    """A dd.mm.yyyy date is parsed, and a float is scaled by its conversion factor."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy"}
    parsed_date = convert_value("15.01.2024", "date", date_spec)
    assert parsed_date.year == 2024
    assert parsed_date.month == 1
    assert parsed_date.day == 15

    kcal_spec = {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."}
    kcal = convert_value("8000", "kcal", kcal_spec)
    expected_kcal = 8000 * 0.239
    assert abs(kcal - expected_kcal) < 0.01
|
|
|
|
|
|
def test_iter_csv_dict_rows_full_file():
    """Iterating a two-row file with a header yields one dict per data row."""
    source = "a;b\n1;2\n3;4\n"
    expected_rows = [
        {"a": "1", "b": "2"},
        {"a": "3", "b": "4"},
    ]

    row_iter = iter_csv_dict_rows(source, ";", has_header=True)

    assert list(row_iter) == expected_rows
|
|
|
|
|
|
def test_build_row_after_mapping():
    """Mapped CSV columns appear in the output under their target field names."""
    source_row = {"Datum": "01.01.2024", "kj": "4200"}
    field_mapping = {"Datum": "date", "kj": "kcal"}
    type_conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy"},
        "kcal": {"type": "float", "conversion_factor": 0.239, "decimal_separator": "."},
    }

    mapped = build_row_after_mapping(source_row, field_mapping, type_conversions)

    assert mapped["date"].month == 1
    # Only presence of a converted value is checked, not its magnitude.
    assert mapped["kcal"] is not None
|
|
|
|
|
|
def test_build_row_fddb_raw_header_keys_match_normalized_template():
    """FDDB: DictReader yields the German header, while the seed uses the normalized key."""
    # Row keys are the raw headers as they appear in the file.
    source_row = {
        "Datum Tag Monat Jahr Stunde Minute": "01.01.2024 8:30",
        "kJ": "42000",
        "Fett (g)": "50",
        "KH (g)": "200",
        "Protein (g)": "100",
    }
    # Mapping keys are the normalized (lower-case, underscore) header forms.
    field_mapping = {
        "datum_tag_monat_jahr_stunde_minute": "date",
        "kj": "kcal",
        "fett_g": "fat_g",
        "kh_g": "carbs_g",
        "protein_g": "protein_g",
    }
    type_conversions = {
        "date": {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"},
        "kcal": {
            "type": "float",
            "conversion_factor": 0.239,
            "decimal_separator": ",",
        },
        "fat_g": {"type": "float", "decimal_separator": ","},
        "carbs_g": {"type": "float", "decimal_separator": ","},
        "protein_g": {"type": "float", "decimal_separator": ","},
    }

    mapped = build_row_after_mapping(source_row, field_mapping, type_conversions)

    converted_date = mapped["date"]
    assert converted_date.year == 2024
    assert converted_date.month == 1
    assert converted_date.day == 1
|
|
|
|
|
|
def test_convert_date_ddmm_with_seconds():
    """A value with seconds still converts when the format names only HH:MM."""
    date_spec = {"type": "date", "format": "dd.mm.yyyy HH:MM", "extract": "date_only"}

    parsed = convert_value("15.01.2024 14:30:00", "date", date_spec)

    assert parsed.month == 1
    assert parsed.day == 15
|
|
|
|
|
|
def test_float_decimal_separator_auto_eu_us():
    """With decimal_separator "auto", both EU- and US-style numbers parse to the same float."""
    cases = (
        ("1.234,56", 1234.56),  # dot thousands, comma decimal
        ("1,234.56", 1234.56),  # comma thousands, dot decimal
        ("1234,5", 1234.5),     # comma decimal, no thousands separator
    )
    for raw_text, expected in cases:
        # A fresh spec dict per call, as in the original one-liners.
        spec = {"type": "float", "decimal_separator": "auto"}
        parsed = convert_value(raw_text, "x", spec)
        assert abs(parsed - expected) < 1e-9
|
|
|
|
|
|
def test_float_flexible_falls_back_to_auto():
    """With flexible=True, a dot-decimal value parses although the spec names ',' as separator."""
    float_spec = {"type": "float", "decimal_separator": ",", "flexible": True}

    parsed = convert_value("1234.56", "x", float_spec)

    assert abs(parsed - 1234.56) < 1e-9
|
|
|
|
|
|
def test_date_flexible_iso_while_primary_ddmm():
    """With flexible=True, an ISO date and a dd.mm.yyyy date convert to equal values."""
    date_spec = {
        "type": "date",
        "format": "dd.mm.yyyy",
        "flexible": True,
        "extract": "date_only",
    }

    # The same spec object is reused for both conversions.
    from_iso = convert_value("2024-03-15", "d", date_spec)
    from_primary = convert_value("15.03.2024", "d", date_spec)

    assert from_iso == from_primary
|
|
|
|
|
|
def test_date_extra_formats_without_days_in_name():
    """A dd.mm.yyyy value parses when "formats" lists "%d.%m.%Y" beside a yyyy-mm-dd primary format."""
    date_spec = {
        "type": "date",
        "format": "yyyy-mm-dd",
        "formats": ["%d.%m.%Y"],
        "extract": "date_only",
    }

    parsed = convert_value("08.04.2026", "d", date_spec)

    assert parsed.day == 8
|
|
|
|
|
|
def test_int_flexible_thousands():
    """With flexible=True, the int text "1.234" converts to 1234."""
    int_spec = {"type": "int", "flexible": True}

    parsed = convert_value("1.234", "n", int_spec)

    assert parsed == 1234
|
|
|
|
|
|
def test_datetime_flexible():
    """With flexible=True, a dd.mm.yyyy HH:MM:SS value parses despite an ISO-style primary format."""
    datetime_spec = {"type": "datetime", "format": "yyyy-mm-dd HH:MM:SS", "flexible": True}

    parsed = convert_value("15.01.2024 14:30:00", "t", datetime_spec)

    assert parsed.month == 1
    assert parsed.day == 15
    assert parsed.hour == 14
|