feat: implement effective CSV delimiter resolution for imports
- Added `resolve_effective_csv_delimiter` function to determine the correct delimiter based on the uploaded file and template.
- Updated CSV import logic to use the new delimiter resolution method, ensuring accurate parsing of CSV files with varying delimiters.
- Updated documentation to reflect changes in delimiter handling.
- Added unit tests for the new delimiter resolution functionality.
This commit is contained in:
parent
0ad3ddd627
commit
7226e04e9c
|
|
@ -18,6 +18,7 @@ Dieses Dokument ist **normativ für Agenten**, die ein neues Import-Zielmodul an
|
||||||
| Admin-Systemvorlagen | `backend/routers/admin_csv_templates.py` |
|
| Admin-Systemvorlagen | `backend/routers/admin_csv_templates.py` |
|
||||||
| Nutzer-Import (Profil-Mappings) | `backend/routers/csv_import.py` |
|
| Nutzer-Import (Profil-Mappings) | `backend/routers/csv_import.py` |
|
||||||
| Vorlagen-Validierung (strukturell + Sample) | `backend/csv_parser/template_validator.py` (`validate_csv_template`) |
|
| Vorlagen-Validierung (strukturell + Sample) | `backend/csv_parser/template_validator.py` (`validate_csv_template`) |
|
||||||
|
| Effektives Listentrennzeichen | `backend/csv_parser/core.py` (`resolve_effective_csv_delimiter`) — Datei kann `;` (z. B. Apple DE) haben, Vorlage `,` (EN); Import/Diagnose **nicht** nur das gespeicherte Trennzeichen blind nutzen. |
|
||||||
|
|
||||||
**Single Source of Truth** für erlaubte Zielfelder, Typen und Duplikat-Keys ist **`module_registry.py`**. Keine parallele Feldliste in Routern duplizieren.
|
**Single Source of Truth** für erlaubte Zielfelder, Typen und Duplikat-Keys ist **`module_registry.py`**. Keine parallele Feldliste in Routern duplizieren.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,46 @@ def sniff_delimiter(sample_line: str) -> str:
|
||||||
return best
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
def _csv_field_count(line: str, delimiter: str) -> int:
|
||||||
|
"""Anzahl Felder in einer Zeile (csv.reader, berücksichtigt Anführungszeichen)."""
|
||||||
|
if not line or not line.strip():
|
||||||
|
return 0
|
||||||
|
try:
|
||||||
|
row = next(csv.reader(io.StringIO(line), delimiter=delimiter))
|
||||||
|
except StopIteration:
|
||||||
|
return 0
|
||||||
|
return len(row)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_effective_csv_delimiter(text: str, template_delimiter: str | None = None) -> str:
    """Choose the delimiter to use for an uploaded CSV file.

    Stored templates often carry "," (Apple EN) while actual exports use ";"
    depending on the region (Apple DE / Excel). Parsing with the wrong
    character collapses the header into a single column and breaks the
    mapping entirely, so the stored delimiter is only treated as a preference.

    Args:
        text: The CSV content; only its first line (as returned by
            ``_split_first_lines``) is scored.
        template_delimiter: Delimiter stored with the template, if any. It is
            ignored unless it is a single character from ``_DEFAULT_DELIMS``.

    Returns:
        The candidate from ``_DEFAULT_DELIMS`` that splits the header into the
        most fields. On a tie the template delimiter wins if it is among the
        best candidates, otherwise the first best candidate in
        ``_DEFAULT_DELIMS`` order. If no candidate yields more than one field,
        the template delimiter or the result of ``sniff_delimiter`` is
        returned. If ``text`` has no lines, the template delimiter or ","
        is returned.
    """
    raw = template_delimiter or ""
    # Check the raw value before the stripped one: str.strip() would erase a
    # whitespace delimiter such as "\t". The length check keeps a
    # multi-character value from matching when _DEFAULT_DELIMS is a string.
    tpl = next(
        (cand for cand in (raw, raw.strip()) if len(cand) == 1 and cand in _DEFAULT_DELIMS),
        None,
    )

    lines = _split_first_lines(text, max_lines=5)
    if not lines:
        return tpl or ","

    header = lines[0]
    scores: list[tuple[int, str]] = [
        (_csv_field_count(header, d), d) for d in _DEFAULT_DELIMS
    ]

    # default=0 keeps an empty candidate list from raising ValueError.
    max_n = max((n for n, _ in scores), default=0)
    if max_n <= 1:
        # No candidate splits the header; fall back to template or sniffing.
        return tpl or sniff_delimiter(header)

    at_max = [d for n, d in scores if n == max_n]
    if tpl and tpl in at_max:
        return tpl
    return at_max[0]
|
||||||
|
|
||||||
|
|
||||||
def _split_first_lines(text: str, max_lines: int = 5) -> List[str]:
|
def _split_first_lines(text: str, max_lines: int = 5) -> List[str]:
|
||||||
lines: List[str] = []
|
lines: List[str] = []
|
||||||
for line in text.splitlines():
|
for line in text.splitlines():
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ from typing import Any
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from csv_parser.core import iter_csv_dict_rows
|
from csv_parser.core import iter_csv_dict_rows, resolve_effective_csv_delimiter
|
||||||
from csv_parser.import_row_processing import (
|
from csv_parser.import_row_processing import (
|
||||||
aggregate_mapped_rows,
|
aggregate_mapped_rows,
|
||||||
resolve_import_row_processing,
|
resolve_import_row_processing,
|
||||||
|
|
@ -97,7 +97,8 @@ def run_universal_csv_import(
|
||||||
if tc is not None and not isinstance(tc, dict):
|
if tc is not None and not isinstance(tc, dict):
|
||||||
tc = None
|
tc = None
|
||||||
|
|
||||||
delim = mapping.get("delimiter") or ","
|
tpl_delim = str(mapping.get("delimiter") or ",").strip() or ","
|
||||||
|
delim = resolve_effective_csv_delimiter(text, tpl_delim)
|
||||||
has_header = mapping.get("has_header", True)
|
has_header = mapping.get("has_header", True)
|
||||||
|
|
||||||
if module == "nutrition":
|
if module == "nutrition":
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,7 @@ from csv_parser.core import (
|
||||||
iter_csv_dict_rows,
|
iter_csv_dict_rows,
|
||||||
normalize_header_for_signature,
|
normalize_header_for_signature,
|
||||||
parse_csv_sample,
|
parse_csv_sample,
|
||||||
|
resolve_effective_csv_delimiter,
|
||||||
)
|
)
|
||||||
from csv_parser.type_converter import build_row_after_mapping, diagnose_row_mapping
|
from csv_parser.type_converter import build_row_after_mapping, diagnose_row_mapping
|
||||||
from csv_parser.field_units import source_unit_choices_for_field
|
from csv_parser.field_units import source_unit_choices_for_field
|
||||||
|
|
@ -393,7 +394,8 @@ async def csv_import_diagnose(
|
||||||
tc = m.get("type_conversions")
|
tc = m.get("type_conversions")
|
||||||
if not isinstance(tc, dict):
|
if not isinstance(tc, dict):
|
||||||
tc = {}
|
tc = {}
|
||||||
delim = str(m.get("delimiter") or ",")
|
tpl_delim = str(m.get("delimiter") or ",").strip() or ","
|
||||||
|
delim = resolve_effective_csv_delimiter(text, tpl_delim)
|
||||||
exec_module = str(m["module"])
|
exec_module = str(m["module"])
|
||||||
|
|
||||||
rows_out: list[dict[str, Any]] = []
|
rows_out: list[dict[str, Any]] = []
|
||||||
|
|
@ -418,6 +420,7 @@ async def csv_import_diagnose(
|
||||||
"mapping_id": mapping_id,
|
"mapping_id": mapping_id,
|
||||||
"mapping_name": m.get("mapping_name"),
|
"mapping_name": m.get("mapping_name"),
|
||||||
"module": exec_module,
|
"module": exec_module,
|
||||||
|
"delimiter_template": tpl_delim,
|
||||||
"delimiter_used": delim,
|
"delimiter_used": delim,
|
||||||
"has_header": bool(m.get("has_header", True)),
|
"has_header": bool(m.get("has_header", True)),
|
||||||
"rows_diagnosed": len(rows_out),
|
"rows_diagnosed": len(rows_out),
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from csv_parser.core import (
|
||||||
headers_signature_rank_metrics,
|
headers_signature_rank_metrics,
|
||||||
get_csv_import_limits,
|
get_csv_import_limits,
|
||||||
iter_csv_dict_rows,
|
iter_csv_dict_rows,
|
||||||
|
resolve_effective_csv_delimiter,
|
||||||
)
|
)
|
||||||
from csv_parser.field_units import source_unit_choices_for_field
|
from csv_parser.field_units import source_unit_choices_for_field
|
||||||
from csv_parser.mapping_suggest import build_type_conversions_for_mapping
|
from csv_parser.mapping_suggest import build_type_conversions_for_mapping
|
||||||
|
|
@ -29,6 +30,20 @@ def test_sniff_delimiter():
|
||||||
assert sniff_delimiter("a,b,c") == ","
|
assert sniff_delimiter("a,b,c") == ","
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_effective_csv_delimiter_semicolon_file_comma_template():
    """German Apple export: ";" in the file while the English template stores ","."""
    header = "Workout Type;Start;End;Duration;Aktive Energie (kJ)"
    row = "Laufen;2026-04-17 16:25;2026-04-17 17:00;00:30:00;500"
    csv_text = f"{header}\n{row}\n"
    # The file content must win, with a mismatching template and with none.
    for stored_delimiter in (",", None):
        assert resolve_effective_csv_delimiter(csv_text, stored_delimiter) == ";"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_effective_csv_delimiter_comma_file_keeps_template():
    """A comma-separated file combined with a comma template resolves to ","."""
    csv_text = "Workout Type,Start,End\nWalk,2026-04-17 16:25,2026-04-17 17:00\n"
    resolved = resolve_effective_csv_delimiter(csv_text, ",")
    assert resolved == ","
|
||||||
|
|
||||||
|
|
||||||
def test_parse_csv_sample_header():
|
def test_parse_csv_sample_header():
|
||||||
text = "Date;kcal\n2024-01-01;2000\n"
|
text = "Date;kcal\n2024-01-01;2000\n"
|
||||||
headers, rows, delim = parse_csv_sample(text, delimiter=";", max_data_rows=3)
|
headers, rows, delim = parse_csv_sample(text, delimiter=";", max_data_rows=3)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user