diff --git a/backend/csv_parser/core.py b/backend/csv_parser/core.py index ed449b8..eb23a9f 100644 --- a/backend/csv_parser/core.py +++ b/backend/csv_parser/core.py @@ -57,6 +57,18 @@ def _split_first_lines(text: str, max_lines: int = 5) -> List[str]: return lines +def canonical_csv_header_label(name: str | None) -> str: + """ + Einheitlicher Spalten-Key für Analyse (Vorlage/Dialog), Import und Signatur. + BOM und NBSP (häufig in Excel/Apple-Exporten) werden vereinheitlicht, damit + field_mappings exakt zu DictReader-Zeilen passt. + """ + if name is None: + return "" + s = str(name).replace("\ufeff", "").replace("\u00a0", " ").strip() + return s + + def parse_csv_sample( text: str, delimiter: str | None = None, @@ -85,7 +97,7 @@ def parse_csv_sample( return [], [], delim if has_header: - headers = [h.strip() for h in rows_raw[0]] + headers = [canonical_csv_header_label(h) for h in rows_raw[0]] data = rows_raw[1 : 1 + max_data_rows] else: n = len(rows_raw[0]) @@ -103,7 +115,7 @@ def parse_csv_sample( def normalize_header_for_signature(name: str) -> str: - s = name.strip().lower() + s = canonical_csv_header_label(name).lower() s = re.sub(r"\s+", "_", s) s = re.sub(r"[^a-z0-9_äöüß().%-]+", "_", s) return s.strip("_") @@ -111,7 +123,9 @@ def normalize_header_for_signature(name: str) -> str: def column_signature(headers: List[str]) -> List[str]: """Sortierte normalisierte Spaltennamen für Signatur-Vergleich.""" - return sorted({normalize_header_for_signature(h) for h in headers if h is not None and str(h).strip()}) + return sorted( + {normalize_header_for_signature(h) for h in headers if h is not None and canonical_csv_header_label(str(h))} + ) def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -> float: @@ -178,12 +192,6 @@ def get_csv_import_limits(conn_row: dict | None) -> dict[str, int]: return defaults -def _strip_header_key(k: str | None) -> str: - if k is None: - return "" - return str(k).strip().removeprefix("\ufeff") - - def iter_csv_dict_rows( text: str, delimiter: str, @@ -205,4 +213,8 @@ def iter_csv_dict_rows( continue if not any(v and str(v).strip() for v in row.values()): continue - yield {_strip_header_key(k): (v or "").strip() for k, v in row.items() if _strip_header_key(k)} + yield { + canonical_csv_header_label(k): (v or "").strip() + for k, v in row.items() + if canonical_csv_header_label(k) + } diff --git a/backend/csv_parser/type_converter.py b/backend/csv_parser/type_converter.py index 8d59e5e..5a55832 100644 --- a/backend/csv_parser/type_converter.py +++ b/backend/csv_parser/type_converter.py @@ -14,7 +14,7 @@ from typing import Any, Mapping, Sequence from dateutil import parser as dateutil_parser -from csv_parser.core import normalize_header_for_signature +from csv_parser.core import canonical_csv_header_label, normalize_header_for_signature from csv_parser.field_units import factor_source_to_canonical # Alias → strptime (JSON in Kleinbuchstaben) @@ -477,7 +477,12 @@ def _lookup_db_field(csv_col: str, field_mappings: Mapping[str, str]) -> str | N CSV-Spaltennamen können Roh-Header sein; Vorlagen-Schlüssel oft normalisiert (wie column_signature). Exakter Treffer, dann Schlüssel nach Normalisierung, dann Abgleich aller Vorlagen-Keys über deren Normalform. + + Zusätzlich: Präfix-Treffer für lange manuelle Keys (z. B. Apple + „Aufgestiegene Höhe (m)“ → ``aufgestiegene_höhe_(m)`` vs. Mapping + „aufgestiegene Höhe“ → ``aufgestiegene_höhe``) — gewinnt der längste passende Key. """ + csv_col = canonical_csv_header_label(csv_col) v = field_mappings.get(csv_col) if v: return v if v not in ("-", "_skip") else None @@ -488,6 +493,27 @@ def _lookup_db_field(csv_col: str, field_mappings: Mapping[str, str]) -> str | N for k, fv in field_mappings.items(): if normalize_header_for_signature(str(k)) == norm: return fv if fv not in ("-", "_skip") else None + + # Präfix-Match (min. Länge gegen false positives wie „datum“ → „datum_xyz“) + best_fv: str | None = None + best_nk_len = 0 + min_prefix = 10 + for k, fv in field_mappings.items(): + if not fv or fv in ("-", "_skip"): + continue + nk = normalize_header_for_signature(str(k)) + if len(nk) < min_prefix or len(nk) >= len(norm): + continue + if not norm.startswith(nk): + continue + boundary = norm[len(nk) : len(nk) + 1] + if boundary not in ("", "_", "("): + continue + if len(nk) > best_nk_len: + best_nk_len = len(nk) + best_fv = fv + if best_fv: + return best_fv return None diff --git a/backend/tests/test_csv_parser_core.py b/backend/tests/test_csv_parser_core.py index 2040395..3e27673 100644 --- a/backend/tests/test_csv_parser_core.py +++ b/backend/tests/test_csv_parser_core.py @@ -38,6 +38,29 @@ def test_parse_csv_sample_header(): assert rows[0]["kcal"] == "2000" +def test_parse_csv_sample_nbsp_in_header_matches_normal_space_key(): + """Excel/Apple: NBSP (U+00A0) im Spaltennamen — gleicher Key wie normales Leerzeichen.""" + text = "Aufgestiegene\u00a0Höhe (m);Wert\n12;3\n" + headers, rows, delim = parse_csv_sample(text, delimiter=";", max_data_rows=3) + assert headers == ["Aufgestiegene Höhe (m)", "Wert"] + assert rows[0]["Aufgestiegene Höhe (m)"] == "12" + + +def test_iter_csv_dict_rows_nbsp_header_canonical(): + text = "col\u00a0one;b\n1;2\n" + rows = list(iter_csv_dict_rows(text, ";", has_header=True)) + assert rows == [{"col one": "1", "b": "2"}] + + +def test_build_row_field_mapping_space_vs_nbsp_in_csv_header(): + """Vorlage (Dialog) mit normalem Leerzeichen, CSV mit NBSP — Zuordnung muss greifen.""" + csv_row = {"Aufgestiegene\u00a0Höhe (m)": "10"} + fm = {"Aufgestiegene Höhe (m)": "stola"} + tc = {"stola": {"type": "float", "decimal_separator": ".", "flexible": True}} + out = build_row_after_mapping(csv_row, fm, tc, module="activity") + assert out.get("stola") == 10.0 + + def test_column_signature_sorted_unique(): sig = column_signature(["B", "a", "a"]) assert sig == ["a", "b"] @@ -183,6 +206,16 @@ def test_build_row_fddb_raw_header_keys_match_normalized_template(): assert out["kcal"] is not None and abs(float(out["kcal"]) - (42000 / 4.184)) < 0.1 +def test_build_row_apple_workout_elevation_header_prefix_matches_shorter_mapping_key(): + """Apple Workouts: „Aufgestiegene Höhe (m)“ normalisiert anders als manuell „aufgestiegene Höhe“.""" + csv_row = {"Aufgestiegene Höhe (m)": "47.13", "Workout Type": "Outdoor Spaziergang"} + fm = {"aufgestiegene Höhe": "stola", "Workout Type": "activity_type"} + tc = {"stola": {"type": "string"}} + out = build_row_after_mapping(csv_row, fm, tc, module="activity") + assert out.get("stola") == "47.13" + assert out.get("activity_type") == "Outdoor Spaziergang" + + def test_convert_date_ddmm_with_seconds(): d = convert_value( "15.01.2024 14:30:00",