mitai-jinkendo/backend/csv_parser/core.py
Lars 851018b3b9
All checks were successful
Deploy Development / deploy (push) Successful in 52s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 15s
feat(csv_import): Enhance CSV import functionality with new endpoint and parsing improvements
- Updated version for csv_import to 0.2.0, reflecting new features.
- Implemented a new POST endpoint for universal CSV import, supporting nutrition, weight, and blood pressure modules.
- Added CSV parsing function to yield rows as dictionaries for easier data handling.
- Enhanced error handling and logging for import operations.
- Introduced tests for the new CSV parsing functionality to ensure reliability.
2026-04-10 06:03:21 +02:00

158 lines
4.6 KiB
Python

"""
CSV bytes → text, delimiter sniffing, strukturierte Erstzeilen für Analyse (Issue #21).
"""
from __future__ import annotations
import csv
import io
import re
from typing import Any, Dict, Iterator, List, Tuple
# Delimiter candidates counted by sniff_delimiter. Order matters: that function
# only replaces its current best on a strictly higher count, so on a tie the
# earlier entry wins.
_DEFAULT_DELIMS = [",", ";", "\t"]
def decode_raw_bytes(raw: bytes) -> str:
    """Decode raw CSV bytes to text.

    UTF-8 is preferred; a leading byte-order mark is consumed by the
    ``utf-8-sig`` codec. Input that is not valid UTF-8 is decoded as Latin-1,
    which maps every byte value and therefore cannot fail.

    Args:
        raw: File content as bytes; may be empty.

    Returns:
        The decoded text without a leading BOM, or ``""`` for empty input.
    """
    if not raw:
        return ""
    try:
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Plain "utf-8" would fail on exactly the same input, so go straight
        # to the codec that accepts any byte sequence.
        text = raw.decode("latin-1")
    # utf-8-sig removes only one BOM; drop a second one if it is present.
    if text.startswith("\ufeff"):
        text = text[1:]
    return text
def sniff_delimiter(sample_line: str, candidates: List[str] | None = None) -> str:
    """Guess the column delimiter from a single sample line.

    Heuristic: count how often each candidate occurs and return the most
    frequent one. ``csv.Sniffer`` is deliberately not used because this
    approach is more robust on short lines.

    Characters inside double-quoted fields are ignored while counting, so a
    quoted cell such as ``"Doe, John"`` does not vote for ``,``. If the quotes
    on the line are unbalanced, the whole line is counted instead.

    Args:
        sample_line: One line of the file, typically the first non-blank one.
        candidates: Delimiters to consider, in tie-break priority order.
            Defaults to the module-level ``_DEFAULT_DELIMS``.

    Returns:
        The candidate with the highest count; on a tie the earlier candidate
        wins. ``","`` when the line is blank or no candidate is given.
    """
    if not sample_line or not sample_line.strip():
        return ","
    if candidates is None:
        candidates = _DEFAULT_DELIMS

    # Splitting on '"' alternates outside/inside segments; an odd number of
    # segments means the quotes are balanced and the even-indexed segments are
    # exactly the text outside quoted fields (doubled quotes toggle twice).
    segments = sample_line.split('"')
    if len(segments) % 2 == 1:
        countable = "".join(segments[::2])
    else:
        countable = sample_line

    best = ","
    best_count = -1
    for candidate in candidates:
        occurrences = countable.count(candidate)
        # Strict comparison keeps the earlier candidate on equal counts.
        if occurrences > best_count:
            best_count = occurrences
            best = candidate
    return best
def _split_first_lines(text: str, max_lines: int = 5) -> List[str]:
    """Return the first non-blank lines of ``text``.

    Args:
        text: Text to scan; split with ``str.splitlines``.
        max_lines: Maximum number of lines to return. Values of zero or less
            yield an empty list.

    Returns:
        Up to ``max_lines`` lines in file order, unmodified (not stripped);
        whitespace-only lines are skipped.
    """
    collected: List[str] = []
    # Checked up front: testing the limit only after appending would still
    # hand back one line for max_lines <= 0.
    if max_lines <= 0:
        return collected
    for line in text.splitlines():
        if not line.strip():
            continue
        collected.append(line)
        if len(collected) >= max_lines:
            break
    return collected
def parse_csv_sample(
    text: str,
    delimiter: str | None = None,
    has_header: bool = True,
    max_data_rows: int = 5,
) -> Tuple[List[str], List[dict[str, str]], str]:
    """Parse the leading rows of a CSV text for preview/analysis.

    Blank rows (no cell with non-whitespace content) are skipped and do not
    count towards ``max_data_rows``, so leading or interspersed empty lines no
    longer shrink or empty the sample.

    Args:
        text: Decoded CSV content.
        delimiter: Column delimiter; when ``None`` it is sniffed from the
            first non-blank line via ``sniff_delimiter``.
        has_header: Whether the first non-blank row holds the column names.
            Without a header, columns are named ``col_0``, ``col_1``, ...
        max_data_rows: Maximum number of data rows to return; negative values
            are treated as 0.

    Returns:
        ``(headers, rows, delimiter_used)``. Each row is a dict mapping header
        to the stripped raw cell string; cells missing in a short row become
        ``""`` and cells beyond the header count are dropped. Text without any
        non-blank line yields ``([], [], ",")``.
    """
    # Only the first non-blank line is needed, and only for sniffing.
    first_line = next((ln for ln in text.splitlines() if ln.strip()), None)
    if first_line is None:
        return [], [], ","
    delim = delimiter if delimiter is not None else sniff_delimiter(first_line)

    # csv.reader rejects a bare "\r" inside a line it is handed, so unify all
    # line endings before parsing.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    reader = csv.reader(io.StringIO(normalized), delimiter=delim)

    data_limit = max(max_data_rows, 0)
    wanted = data_limit + (1 if has_header else 0)
    rows_raw: List[List[str]] = []
    for row in reader:
        if not any(cell.strip() for cell in row):
            continue
        rows_raw.append(row)
        # Count kept rows, not physical lines, so blank lines cost nothing.
        if len(rows_raw) >= wanted:
            break
    if not rows_raw:
        return [], [], delim

    if has_header:
        headers = [h.strip() for h in rows_raw[0]]
        data = rows_raw[1 : 1 + data_limit]
    else:
        # The width of the first row defines the synthetic column names.
        headers = [f"col_{i}" for i in range(len(rows_raw[0]))]
        data = rows_raw[:data_limit]

    dict_rows = [
        {h: (cells[j].strip() if j < len(cells) else "") for j, h in enumerate(headers)}
        for cells in data
    ]
    return headers, dict_rows, delim
def normalize_header_for_signature(name: str) -> str:
    """Normalize a column header for signature comparison.

    The name is stripped and lower-cased; each whitespace run becomes one
    ``_``; each run of characters outside ``a-z 0-9 _ ä ö ü ß ( ) . % -``
    becomes one ``_``; finally leading and trailing underscores are removed.
    """
    result = name.strip().lower()
    # Order matters: whitespace is collapsed before other characters are replaced.
    for pattern in (r"\s+", r"[^a-z0-9_äöüß().%-]+"):
        result = re.sub(pattern, "_", result)
    return result.strip("_")
def column_signature(headers: List[str]) -> List[str]:
    """Return the sorted, de-duplicated normalized column names.

    ``None`` entries and entries that are blank after stripping are ignored.
    Every remaining entry is converted with ``str()`` before normalization, so
    a non-string header that passes the blank check is normalized instead of
    raising.

    Args:
        headers: Column names as read from a CSV or a template.

    Returns:
        Sorted list of unique names produced by
        ``normalize_header_for_signature``.
    """
    names = (str(h) for h in headers if h is not None)
    return sorted({normalize_header_for_signature(n) for n in names if n.strip()})
def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -> float:
    """Return the Jaccard overlap (0..1) of two column signatures.

    Two empty signatures count as a perfect match (1.0); exactly one empty
    signature scores 0.0.
    """
    csv_names = set(sig_csv)
    template_names = set(sig_template)
    if not (csv_names or template_names):
        return 1.0
    if not (csv_names and template_names):
        return 0.0
    shared = csv_names.intersection(template_names)
    combined = csv_names.union(template_names)
    if not combined:
        return 0.0
    return len(shared) / len(combined)
def get_csv_import_limits(conn_row: dict | None) -> dict[str, int]:
    """Read the CSV import limits from a config row, with defaults as fallback.

    Args:
        conn_row: Mapping with a ``"value"`` entry that should be a dict of
            limits, or ``None``.

    Returns:
        A dict with ``max_rows_per_file`` and ``max_file_bytes``. A key keeps
        its default when the row or ``"value"`` is missing, ``"value"`` is not
        a dict, the key is absent, or its value cannot be converted to ``int``.
        Unknown keys in ``"value"`` are ignored.
    """
    limits = {"max_rows_per_file": 50_000, "max_file_bytes": 52_428_800}
    if not conn_row or "value" not in conn_row:
        return limits
    configured = conn_row["value"]
    if not isinstance(configured, dict):
        return limits
    for key, raw_value in configured.items():
        if key not in limits:
            continue
        try:
            limits[key] = int(raw_value)
        except (TypeError, ValueError, OverflowError):
            # A malformed entry must not break the import; keep the default.
            continue
    return limits
def iter_csv_dict_rows(
    text: str,
    delimiter: str,
    *,
    has_header: bool = True,
) -> Iterator[Dict[str, str]]:
    """Yield every data row of the CSV text as a dict keyed by header.

    The first non-blank row is the header; header names are stripped so the
    keys match those returned by ``parse_csv_sample``. Blank rows are skipped
    both before and after the header. Cells are stripped; cells missing in a
    short row become ``""`` and cells beyond the header count are dropped.
    Rows whose mapped cells are all empty are not yielded.

    Args:
        text: Decoded CSV content.
        delimiter: Column delimiter passed to ``csv.reader``.
        has_header: Must be ``True``; header-less import is not supported.

    Yields:
        One ``{header: cell}`` dict per non-empty data row.

    Raises:
        ValueError: If ``has_header`` is false. Because this is a generator,
            the error surfaces on the first iteration, not at call time.
    """
    if not has_header:
        raise ValueError("CSV ohne Kopfzeile wird für Import noch nicht unterstützt")
    # csv.reader rejects a bare "\r" inside a line it is handed, so unify all
    # line endings before parsing.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    reader = csv.reader(io.StringIO(normalized), delimiter=delimiter)
    headers = None
    for cells in reader:
        if not any(cell.strip() for cell in cells):
            continue
        if headers is None:
            headers = [h.strip() for h in cells]
            continue
        # Index by header position: surplus cells are ignored instead of
        # ending up as a list under a None key.
        record = {
            h: (cells[i].strip() if i < len(cells) else "")
            for i, h in enumerate(headers)
        }
        if any(record.values()):
            yield record