From 90a27846ca9e4f7a2dd0ac2fb39173c761d2c49f Mon Sep 17 00:00:00 2001 From: Lars Date: Sun, 12 Apr 2026 07:28:24 +0200 Subject: [PATCH] feat: Improve float parsing logic for enhanced accuracy in numeric conversions - Updated the `_parse_float_auto` function in `type_converter.py` to better handle various decimal and thousand separators, particularly for cases with long decimal parts from sources like Apple Health. - Enhanced the logic for splitting and processing numeric strings to ensure correct interpretation of values, including edge cases with multiple separators. - Added handling for cases where numeric strings may contain both commas and periods, improving overall robustness in float parsing. These changes enhance the accuracy of numeric conversions, ensuring more reliable data processing across the application. --- backend/csv_parser/type_converter.py | 51 ++++++++++++++++++++++----- backend/tests/test_csv_parser_core.py | 19 ++++++++++ 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/backend/csv_parser/type_converter.py b/backend/csv_parser/type_converter.py index a29e5c7..8d59e5e 100644 --- a/backend/csv_parser/type_converter.py +++ b/backend/csv_parser/type_converter.py @@ -66,7 +66,13 @@ def _parse_float_auto(s: str) -> float: """ Heuristik ohne festes Locale: Punkt/Komma als Tausender vs. Dezimal, basierend auf der letzten erkannten Trennstelle und Gruppierung. + + Apple Health u. a. liefern berechnete Mittelwerte mit vielen Nachkommastellen + (z. B. «96.874937…») und Energie als «596.668904…» — dabei ist der Punkt + immer Dezimaltrenner. Früher wurden lange Nachkommateile fälschlich so + behandelt, dass der Punkt entfernt wurde (Tausender-Heuristik). """ + raw = s s = _normalize_num_token(s) if not s or s in ("-", "—", "–"): raise ValueError("leer") @@ -90,18 +96,35 @@ def _parse_float_auto(s: str) -> float: s = s.replace(",", "") elif last_comma >= 0: parts = s.split(",") - if len(parts) == 2 and len(parts[1]) <= 2: - s = parts[0].replace(".", "") + "." + parts[1] - elif len(parts) == 2 and len(parts[1]) == 3 and len(parts[0]) <= 3: - s = parts[0] + parts[1] + if len(parts) == 2: + left, right = parts[0], parts[1] + if not right: + raise ValueError("leer") + left_digits = left.replace(".", "") + # Langer Nachkommateil → Dezimalkomma; «1.234,56»-Fälle oben mit Punkt+Komma + if len(right) > 3 or len(right) <= 2: + s = left_digits + "." + right.replace(".", "") + elif len(right) == 3 and len(left_digits) <= 3: + s = left_digits + right + else: + s = left_digits + "." + right.replace(".", "") else: s = s.replace(",", "") elif last_dot >= 0: parts = s.split(".") - if len(parts) == 2 and len(parts[1]) <= 2: - s = parts[0].replace(",", "") + "." + parts[1] - elif len(parts) == 2 and len(parts[1]) == 3 and len(parts[0]) <= 3: - s = parts[0] + parts[1] + if len(parts) == 2: + left, right = parts[0], parts[1] + if not right: + raise ValueError("leer") + left_digits = left.replace(",", "") + # Genau ein Punkt: viele Nachkommastellen → Apple/US-Dezimalpunkt (nicht „.“ streichen) + if len(right) > 3 or len(right) <= 2: + s = left_digits + "." + right + elif len(right) == 3: + if len(left_digits) == 1 and left_digits != "0" and left_digits.isdigit(): + s = left_digits + right + else: + s = left_digits + "." + right elif len(parts) > 2: if len(parts[-1]) <= 2: s = "".join(parts[:-1]) + "." + parts[-1] @@ -345,6 +368,18 @@ def _parse_int(raw: str, spec: Mapping[str, Any]) -> int: raise ValueError("leer") v = int(digits) return -v if neg else v + # Ohne flexible: «108.0» / «96,8» trotzdem als Zahl mit Nachkommastellen + s2 = _normalize_num_token(s) + if "," in s2 or "." in s2: + dec = spec.get("decimal_separator", ".") + try: + if dec in (None, "auto"): + fv = _parse_float_auto(s2) + else: + fv = _parse_float(raw, str(dec)) + return int(round(fv)) + except (ValueError, InvalidOperation): + pass s = re.sub(r"[^\d-]", "", s) if not s: raise ValueError("leer") diff --git a/backend/tests/test_csv_parser_core.py b/backend/tests/test_csv_parser_core.py index e9b962d..2040395 100644 --- a/backend/tests/test_csv_parser_core.py +++ b/backend/tests/test_csv_parser_core.py @@ -81,6 +81,25 @@ def test_convert_kcal_via_source_unit_kj(): assert abs(k - 1000.0) < 0.05 +def test_apple_health_long_decimal_dot_preserved(): + """Apple: Mittel-HF u. Energie mit vielen Nachkommastellen; Punkt ist Dezimaltrenner.""" + hr_spec = {"type": "int", "flexible": True} + r = convert_value("96.8749374730629", "hr_avg", hr_spec, module="activity") + assert r == 97 + + rest_spec = {"type": "float", "decimal_separator": ".", "flexible": True, "source_unit": "kj"} + kcal = convert_value("596.6689047323086", "kcal_resting", rest_spec, module="activity") + assert 140.0 < kcal < 145.0 + + +def test_parse_float_auto_us_thousands_comma(): + """«12,345» ohne Dezimalpunkt weiter als Tausender möglich.""" + v = convert_value("12345", "x", {"type": "float", "decimal_separator": "auto"}) + assert v == 12345.0 + v2 = convert_value("12,345", "x", {"type": "float", "decimal_separator": "auto"}) + assert abs(v2 - 12345.0) < 0.01 + + def test_convert_protein_kg_to_g(): spec = {"type": "float", "source_unit": "kg", "decimal_separator": "."} g = convert_value("0.1", "protein_g", spec, module="nutrition")