From 7226e04e9c0146f6d6f25c185b9c6f336bd1ff1c Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 18 Apr 2026 10:12:33 +0200 Subject: [PATCH] feat: implement effective CSV delimiter resolution for imports - Added `resolve_effective_csv_delimiter` function to determine the correct delimiter based on the uploaded file and template. - Updated CSV import logic to utilize the new delimiter resolution method, ensuring accurate parsing of CSV files with varying delimiters. - Enhanced documentation to reflect changes in delimiter handling. - Added unit tests for the new delimiter resolution functionality. --- .../UNIVERSAL_CSV_IMPORT_AGENT_GUIDE.md | 1 + backend/csv_parser/core.py | 40 +++++++++++++++++++ backend/csv_parser/executor.py | 5 ++- backend/routers/csv_import.py | 5 ++- backend/tests/test_csv_parser_core.py | 15 +++++++ 5 files changed, 63 insertions(+), 3 deletions(-) diff --git a/.claude/docs/technical/UNIVERSAL_CSV_IMPORT_AGENT_GUIDE.md b/.claude/docs/technical/UNIVERSAL_CSV_IMPORT_AGENT_GUIDE.md index 5bdcea0..e47174c 100644 --- a/.claude/docs/technical/UNIVERSAL_CSV_IMPORT_AGENT_GUIDE.md +++ b/.claude/docs/technical/UNIVERSAL_CSV_IMPORT_AGENT_GUIDE.md @@ -18,6 +18,7 @@ Dieses Dokument ist **normativ für Agenten**, die ein neues Import-Zielmodul an | Admin-Systemvorlagen | `backend/routers/admin_csv_templates.py` | | Nutzer-Import (Profil-Mappings) | `backend/routers/csv_import.py` | | Vorlagen-Validierung (strukturell + Sample) | `backend/csv_parser/template_validator.py` (`validate_csv_template`) | +| Effektives Listentrennzeichen | `backend/csv_parser/core.py` (`resolve_effective_csv_delimiter`) — Datei kann `;` (z. B. Apple DE) haben, Vorlage `,` (EN); Import/Diagnose **nicht** nur das gespeicherte Trennzeichen blind nutzen. | **Single Source of Truth** für erlaubte Zielfelder, Typen und Duplikat-Keys ist **`module_registry.py`**. Keine parallele Feldliste in Routern duplizieren. diff --git a/backend/csv_parser/core.py b/backend/csv_parser/core.py index eb23a9f..444c9a1 100644 --- a/backend/csv_parser/core.py +++ b/backend/csv_parser/core.py @@ -47,6 +47,46 @@ def sniff_delimiter(sample_line: str) -> str: return best +def _csv_field_count(line: str, delimiter: str) -> int: + """Anzahl Felder in einer Zeile (csv.reader, berücksichtigt Anführungszeichen).""" + if not line or not line.strip(): + return 0 + try: + row = next(csv.reader(io.StringIO(line), delimiter=delimiter)) + except StopIteration: + return 0 + return len(row) + + +def resolve_effective_csv_delimiter(text: str, template_delimiter: str | None = None) -> str: + """ + Trennzeichen für die hochgeladene Datei wählen. Gespeicherte Vorlagen haben oft «,» + (Apple EN), tatsächliche Exporte je nach Region «;» (Apple DE / Excel) — mit falschem + Zeichen wird die Kopfzeile zu **einer** Spalte und das Mapping bricht vollständig. + """ + tpl = (template_delimiter or "").strip() + if tpl not in _DEFAULT_DELIMS: + tpl = None + + lines = _split_first_lines(text, max_lines=5) + if not lines: + return tpl or "," + + header = lines[0] + scores: list[tuple[int, str]] = [] + for d in _DEFAULT_DELIMS: + scores.append((_csv_field_count(header, d), d)) + + max_n = max(n for n, _ in scores) + if max_n <= 1: + return tpl or sniff_delimiter(header) + + at_max = [d for n, d in scores if n == max_n] + if tpl and tpl in at_max: + return tpl + return at_max[0] + + def _split_first_lines(text: str, max_lines: int = 5) -> List[str]: lines: List[str] = [] for line in text.splitlines(): diff --git a/backend/csv_parser/executor.py b/backend/csv_parser/executor.py index 67c78c7..6a63874 100644 --- a/backend/csv_parser/executor.py +++ b/backend/csv_parser/executor.py @@ -11,7 +11,7 @@ from typing import Any import logging -from csv_parser.core import iter_csv_dict_rows +from csv_parser.core import iter_csv_dict_rows, resolve_effective_csv_delimiter from csv_parser.import_row_processing import ( aggregate_mapped_rows, resolve_import_row_processing, @@ -97,7 +97,8 @@ def run_universal_csv_import( if tc is not None and not isinstance(tc, dict): tc = None - delim = mapping.get("delimiter") or "," + tpl_delim = str(mapping.get("delimiter") or ",").strip() or "," + delim = resolve_effective_csv_delimiter(text, tpl_delim) has_header = mapping.get("has_header", True) if module == "nutrition": diff --git a/backend/routers/csv_import.py b/backend/routers/csv_import.py index b3ab67a..c6dc535 100644 --- a/backend/routers/csv_import.py +++ b/backend/routers/csv_import.py @@ -29,6 +29,7 @@ from csv_parser.core import ( iter_csv_dict_rows, normalize_header_for_signature, parse_csv_sample, + resolve_effective_csv_delimiter, ) from csv_parser.type_converter import build_row_after_mapping, diagnose_row_mapping from csv_parser.field_units import source_unit_choices_for_field @@ -393,7 +394,8 @@ async def csv_import_diagnose( tc = m.get("type_conversions") if not isinstance(tc, dict): tc = {} - delim = str(m.get("delimiter") or ",") + tpl_delim = str(m.get("delimiter") or ",").strip() or "," + delim = resolve_effective_csv_delimiter(text, tpl_delim) exec_module = str(m["module"]) rows_out: list[dict[str, Any]] = [] @@ -418,6 +420,7 @@ async def csv_import_diagnose( "mapping_id": mapping_id, "mapping_name": m.get("mapping_name"), "module": exec_module, + "delimiter_template": tpl_delim, "delimiter_used": delim, "has_header": bool(m.get("has_header", True)), "rows_diagnosed": len(rows_out), diff --git a/backend/tests/test_csv_parser_core.py b/backend/tests/test_csv_parser_core.py index 3e27673..917510f 100644 --- a/backend/tests/test_csv_parser_core.py +++ b/backend/tests/test_csv_parser_core.py @@ -11,6 +11,7 @@ from csv_parser.core import ( headers_signature_rank_metrics, get_csv_import_limits, iter_csv_dict_rows, + resolve_effective_csv_delimiter, ) from csv_parser.field_units import source_unit_choices_for_field from csv_parser.mapping_suggest import build_type_conversions_for_mapping @@ -29,6 +30,20 @@ def test_sniff_delimiter(): assert sniff_delimiter("a,b,c") == "," +def test_resolve_effective_csv_delimiter_semicolon_file_comma_template(): + """DE-Apple: «;» in der Datei, englische Vorlage speichert «,».""" + header = "Workout Type;Start;End;Duration;Aktive Energie (kJ)" + row = "Laufen;2026-04-17 16:25;2026-04-17 17:00;00:30:00;500" + text = header + "\n" + row + "\n" + assert resolve_effective_csv_delimiter(text, ",") == ";" + assert resolve_effective_csv_delimiter(text, None) == ";" + + +def test_resolve_effective_csv_delimiter_comma_file_keeps_template(): + text = "Workout Type,Start,End\nWalk,2026-04-17 16:25,2026-04-17 17:00\n" + assert resolve_effective_csv_delimiter(text, ",") == "," + + def test_parse_csv_sample_header(): text = "Date;kcal\n2024-01-01;2000\n" headers, rows, delim = parse_csv_sample(text, delimiter=";", max_data_rows=3)