feat: Enhance CSV header normalization and mapping for activity data
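- Introduced a new utility function `canonical_csv_header_label` that standardizes CSV header labels (removes BOM characters, replaces non-breaking spaces with regular spaces, strips surrounding whitespace), so the same key is used for analysis, import and signature building.
- Updated `_lookup_db_field` to canonicalize the CSV column name first and to fall back to prefix matching: a mapping key whose normalized form is at least 10 characters long and is a prefix of the normalized CSV header (ending at a `_` or `(` boundary) now matches, and the longest such key wins.
- Added tests that validate handling of non-breaking space characters in CSV headers and the prefix fallback, ensuring headers map to the expected normalized keys and improving the robustness of CSV parsing.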
This commit is contained in:
parent
c570e67a09
commit
e4e8c70cd2
|
|
@@ -57,6 +57,18 @@ def _split_first_lines(text: str, max_lines: int = 5) -> List[str]:
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def canonical_csv_header_label(name: str | None) -> str:
|
||||||
|
"""
|
||||||
|
Einheitlicher Spalten-Key für Analyse (Vorlage/Dialog), Import und Signatur.
|
||||||
|
BOM und NBSP (häufig in Excel/Apple-Exporten) werden vereinheitlicht, damit
|
||||||
|
field_mappings exakt zu DictReader-Zeilen passt.
|
||||||
|
"""
|
||||||
|
if name is None:
|
||||||
|
return ""
|
||||||
|
s = str(name).replace("\ufeff", "").replace("\u00a0", " ").strip()
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
def parse_csv_sample(
|
def parse_csv_sample(
|
||||||
text: str,
|
text: str,
|
||||||
delimiter: str | None = None,
|
delimiter: str | None = None,
|
||||||
|
|
@@ -85,7 +97,7 @@ def parse_csv_sample(
|
||||||
return [], [], delim
|
return [], [], delim
|
||||||
|
|
||||||
if has_header:
|
if has_header:
|
||||||
headers = [h.strip() for h in rows_raw[0]]
|
headers = [canonical_csv_header_label(h) for h in rows_raw[0]]
|
||||||
data = rows_raw[1 : 1 + max_data_rows]
|
data = rows_raw[1 : 1 + max_data_rows]
|
||||||
else:
|
else:
|
||||||
n = len(rows_raw[0])
|
n = len(rows_raw[0])
|
||||||
|
|
@@ -103,7 +115,7 @@ def parse_csv_sample(
|
||||||
|
|
||||||
|
|
||||||
def normalize_header_for_signature(name: str) -> str:
|
def normalize_header_for_signature(name: str) -> str:
|
||||||
s = name.strip().lower()
|
s = canonical_csv_header_label(name).lower()
|
||||||
s = re.sub(r"\s+", "_", s)
|
s = re.sub(r"\s+", "_", s)
|
||||||
s = re.sub(r"[^a-z0-9_äöüß().%-]+", "_", s)
|
s = re.sub(r"[^a-z0-9_äöüß().%-]+", "_", s)
|
||||||
return s.strip("_")
|
return s.strip("_")
|
||||||
|
|
@@ -111,7 +123,9 @@ def normalize_header_for_signature(name: str) -> str:
|
||||||
|
|
||||||
def column_signature(headers: List[str]) -> List[str]:
|
def column_signature(headers: List[str]) -> List[str]:
|
||||||
"""Sortierte normalisierte Spaltennamen für Signatur-Vergleich."""
|
"""Sortierte normalisierte Spaltennamen für Signatur-Vergleich."""
|
||||||
return sorted({normalize_header_for_signature(h) for h in headers if h is not None and str(h).strip()})
|
return sorted(
|
||||||
|
{normalize_header_for_signature(h) for h in headers if h is not None and canonical_csv_header_label(str(h))}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -> float:
|
def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -> float:
|
||||||
|
|
@@ -178,12 +192,6 @@ def get_csv_import_limits(conn_row: dict | None) -> dict[str, int]:
|
||||||
return defaults
|
return defaults
|
||||||
|
|
||||||
|
|
||||||
def _strip_header_key(k: str | None) -> str:
|
|
||||||
if k is None:
|
|
||||||
return ""
|
|
||||||
return str(k).strip().removeprefix("\ufeff")
|
|
||||||
|
|
||||||
|
|
||||||
def iter_csv_dict_rows(
|
def iter_csv_dict_rows(
|
||||||
text: str,
|
text: str,
|
||||||
delimiter: str,
|
delimiter: str,
|
||||||
|
|
@@ -205,4 +213,8 @@ def iter_csv_dict_rows(
|
||||||
continue
|
continue
|
||||||
if not any(v and str(v).strip() for v in row.values()):
|
if not any(v and str(v).strip() for v in row.values()):
|
||||||
continue
|
continue
|
||||||
yield {_strip_header_key(k): (v or "").strip() for k, v in row.items() if _strip_header_key(k)}
|
yield {
|
||||||
|
canonical_csv_header_label(k): (v or "").strip()
|
||||||
|
for k, v in row.items()
|
||||||
|
if canonical_csv_header_label(k)
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@@ -14,7 +14,7 @@ from typing import Any, Mapping, Sequence
|
||||||
|
|
||||||
from dateutil import parser as dateutil_parser
|
from dateutil import parser as dateutil_parser
|
||||||
|
|
||||||
from csv_parser.core import normalize_header_for_signature
|
from csv_parser.core import canonical_csv_header_label, normalize_header_for_signature
|
||||||
from csv_parser.field_units import factor_source_to_canonical
|
from csv_parser.field_units import factor_source_to_canonical
|
||||||
|
|
||||||
# Alias → strptime (JSON in Kleinbuchstaben)
|
# Alias → strptime (JSON in Kleinbuchstaben)
|
||||||
|
|
@@ -477,7 +477,12 @@ def _lookup_db_field(csv_col: str, field_mappings: Mapping[str, str]) -> str | N
|
||||||
CSV-Spaltennamen können Roh-Header sein; Vorlagen-Schlüssel oft normalisiert
|
CSV-Spaltennamen können Roh-Header sein; Vorlagen-Schlüssel oft normalisiert
|
||||||
(wie column_signature). Exakter Treffer, dann Schlüssel nach Normalisierung,
|
(wie column_signature). Exakter Treffer, dann Schlüssel nach Normalisierung,
|
||||||
dann Abgleich aller Vorlagen-Keys über deren Normalform.
|
dann Abgleich aller Vorlagen-Keys über deren Normalform.
|
||||||
|
|
||||||
|
Zusätzlich: Präfix-Treffer für lange manuelle Keys (z. B. Apple
|
||||||
|
„Aufgestiegene Höhe (m)“ → ``aufgestiegene_höhe_(m)`` vs. Mapping
|
||||||
|
„aufgestiegene Höhe“ → ``aufgestiegene_höhe``) — gewinnt der längste passende Key.
|
||||||
"""
|
"""
|
||||||
|
csv_col = canonical_csv_header_label(csv_col)
|
||||||
v = field_mappings.get(csv_col)
|
v = field_mappings.get(csv_col)
|
||||||
if v:
|
if v:
|
||||||
return v if v not in ("-", "_skip") else None
|
return v if v not in ("-", "_skip") else None
|
||||||
|
|
@@ -488,6 +493,27 @@ def _lookup_db_field(csv_col: str, field_mappings: Mapping[str, str]) -> str | N
|
||||||
for k, fv in field_mappings.items():
|
for k, fv in field_mappings.items():
|
||||||
if normalize_header_for_signature(str(k)) == norm:
|
if normalize_header_for_signature(str(k)) == norm:
|
||||||
return fv if fv not in ("-", "_skip") else None
|
return fv if fv not in ("-", "_skip") else None
|
||||||
|
|
||||||
|
# Präfix-Match (min. Länge gegen false positives wie „datum“ → „datum_xyz“)
|
||||||
|
best_fv: str | None = None
|
||||||
|
best_nk_len = 0
|
||||||
|
min_prefix = 10
|
||||||
|
for k, fv in field_mappings.items():
|
||||||
|
if not fv or fv in ("-", "_skip"):
|
||||||
|
continue
|
||||||
|
nk = normalize_header_for_signature(str(k))
|
||||||
|
if len(nk) < min_prefix or len(nk) >= len(norm):
|
||||||
|
continue
|
||||||
|
if not norm.startswith(nk):
|
||||||
|
continue
|
||||||
|
boundary = norm[len(nk) : len(nk) + 1]
|
||||||
|
if boundary not in ("", "_", "("):
|
||||||
|
continue
|
||||||
|
if len(nk) > best_nk_len:
|
||||||
|
best_nk_len = len(nk)
|
||||||
|
best_fv = fv
|
||||||
|
if best_fv:
|
||||||
|
return best_fv
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -38,6 +38,29 @@ def test_parse_csv_sample_header():
|
||||||
assert rows[0]["kcal"] == "2000"
|
assert rows[0]["kcal"] == "2000"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_csv_sample_nbsp_in_header_matches_normal_space_key():
    """Excel/Apple exports: a NBSP (U+00A0) in a column name yields the same key as a regular space."""
    raw_csv = "Aufgestiegene\u00a0Höhe (m);Wert\n12;3\n"
    header_names, sample_rows, _delim = parse_csv_sample(
        raw_csv,
        delimiter=";",
        max_data_rows=3,
    )
    # The header list carries the canonical label (regular space) ...
    assert header_names == ["Aufgestiegene Höhe (m)", "Wert"]
    # ... and the sample rows are keyed by that same canonical label.
    first_row = sample_rows[0]
    assert first_row["Aufgestiegene Höhe (m)"] == "12"
|
||||||
|
|
||||||
|
|
||||||
|
def test_iter_csv_dict_rows_nbsp_header_canonical():
    """A NBSP inside a header must come out as a regular space in the yielded row keys."""
    expected_rows = [{"col one": "1", "b": "2"}]
    produced_rows = list(
        iter_csv_dict_rows("col\u00a0one;b\n1;2\n", ";", has_header=True)
    )
    assert produced_rows == expected_rows
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_row_field_mapping_space_vs_nbsp_in_csv_header():
    """Template (dialog) key uses a regular space, the CSV header a NBSP; the mapping must still apply."""
    type_config = {
        "stola": {"type": "float", "decimal_separator": ".", "flexible": True},
    }
    mappings = {"Aufgestiegene Höhe (m)": "stola"}
    source_row = {"Aufgestiegene\u00a0Höhe (m)": "10"}

    result = build_row_after_mapping(source_row, mappings, type_config, module="activity")

    assert result.get("stola") == 10.0
|
||||||
|
|
||||||
|
|
||||||
def test_column_signature_sorted_unique():
|
def test_column_signature_sorted_unique():
|
||||||
sig = column_signature(["B", "a", "a"])
|
sig = column_signature(["B", "a", "a"])
|
||||||
assert sig == ["a", "b"]
|
assert sig == ["a", "b"]
|
||||||
|
|
@@ -183,6 +206,16 @@ def test_build_row_fddb_raw_header_keys_match_normalized_template():
|
||||||
assert out["kcal"] is not None and abs(float(out["kcal"]) - (42000 / 4.184)) < 0.1
|
assert out["kcal"] is not None and abs(float(out["kcal"]) - (42000 / 4.184)) < 0.1
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_row_apple_workout_elevation_header_prefix_matches_shorter_mapping_key():
    """Apple Workouts: "Aufgestiegene Höhe (m)" normalizes differently from the manual key "aufgestiegene Höhe"."""
    type_config = {"stola": {"type": "string"}}
    mappings = {
        "aufgestiegene Höhe": "stola",
        "Workout Type": "activity_type",
    }
    source_row = {
        "Aufgestiegene Höhe (m)": "47.13",
        "Workout Type": "Outdoor Spaziergang",
    }

    result = build_row_after_mapping(source_row, mappings, type_config, module="activity")

    # The shorter manual key must resolve for the longer CSV header.
    assert result.get("stola") == "47.13"
    assert result.get("activity_type") == "Outdoor Spaziergang"
|
||||||
|
|
||||||
|
|
||||||
def test_convert_date_ddmm_with_seconds():
|
def test_convert_date_ddmm_with_seconds():
|
||||||
d = convert_value(
|
d = convert_value(
|
||||||
"15.01.2024 14:30:00",
|
"15.01.2024 14:30:00",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user