def _signature_sets(
    sig_csv: Sequence[str], sig_template: Sequence[str]
) -> tuple[set[str], set[str]]:
    """Build the two column sets that every signature metric compares.

    The CSV signature is used as-is (it is not re-normalized here; callers
    presumably pass the output of ``column_signature`` -- confirm). Template
    entries are normalized with ``normalize_header_for_signature`` and empty
    results are dropped.
    """
    csv_cols = set(sig_csv)
    template_cols = {normalize_header_for_signature(str(x)) for x in sig_template}
    template_cols.discard("")
    return csv_cols, template_cols


def _template_recall(csv_cols: set[str], template_cols: set[str]) -> float:
    """Return |csv ∩ template| / |template| with the empty-template convention."""
    if not template_cols:
        # Nothing is required by the template: a perfect match only if the
        # CSV is empty as well, otherwise no match at all.
        return 0.0 if csv_cols else 1.0
    return len(csv_cols & template_cols) / len(template_cols)


def headers_signature_template_recall(sig_csv: Sequence[str], sig_template: Sequence[str]) -> float:
    """Return the share of template signature columns that occur in the CSV.

    Computed as |A∩B| / |B| with A = CSV columns and B = template columns.
    The result is 1.0 as soon as every template column is present in the
    file, regardless of additional CSV columns (a file that combines, e.g.,
    weight and nutrition columns is not penalized for either template).

    Args:
        sig_csv: Column signature of the uploaded CSV.
        sig_template: Column signature stored for the template; entries are
            normalized and empty entries are ignored.

    Returns:
        A float in 0..1. For a template without any usable column the result
        is 1.0 if the CSV signature is empty too, else 0.0.
    """
    csv_cols, template_cols = _signature_sets(sig_csv, sig_template)
    return _template_recall(csv_cols, template_cols)


def headers_signature_rank_metrics(sig_csv: Sequence[str], sig_template: Sequence[str]) -> dict[str, Any]:
    """Return uniform metrics for template ranking and for display in the UI.

    Args:
        sig_csv: Column signature of the uploaded CSV.
        sig_template: Column signature stored for the template; entries are
            normalized and empty entries are ignored.

    Returns:
        A dict with the keys:
            confidence: same value as ``template_recall`` (recommended as the
                primary value for display and sorting).
            template_recall: |A∩B| / |B|, rounded to 4 decimals.
            jaccard: |A∩B| / |A∪B|, rounded to 4 decimals.
            columns_matched: number of columns present in both signatures.
            columns_in_template: number of distinct template columns.
            columns_in_csv: number of distinct CSV columns.
    """
    csv_cols, template_cols = _signature_sets(sig_csv, sig_template)
    n_matched = len(csv_cols & template_cols)
    n_union = len(csv_cols | template_cols)

    recall = _template_recall(csv_cols, template_cols)
    # An empty union means both signatures are empty. Report 1.0 in that
    # case so this value agrees with headers_signature_match_score, which
    # returns 1.0 for two empty signatures.
    jaccard = n_matched / n_union if n_union else 1.0

    rounded_recall = round(recall, 4)
    return {
        "confidence": rounded_recall,
        "template_recall": rounded_recall,
        "jaccard": round(jaccard, 4),
        "columns_matched": n_matched,
        "columns_in_template": len(template_cols),
        "columns_in_csv": len(csv_cols),
    }
""" _ = session if not get_module_definition(module): @@ -191,15 +191,15 @@ async def admin_analyze_csv_for_template( ) rows = [r2d(r) for r in cur.fetchall()] best: dict | None = None - best_score = -1.0 + best_key: tuple[float, int, float] = (-1.0, -1, -1.0) for t in rows: t_sig = list(t.get("column_signature") or []) - t_norm = sorted({normalize_header_for_signature(str(s)) for s in t_sig}) - score = headers_signature_match_score(sig, t_norm) - if score > best_score: - best_score = score + m = headers_signature_rank_metrics(sig, t_sig) + key = (m["confidence"], m["columns_matched"], m["jaccard"]) + if key > best_key: + best_key = key best = t - if best and best_score > 0: + if best and best_key[0] > 0: seed_row = best seed_fm = (seed_row or {}).get("field_mappings") or {} @@ -214,13 +214,17 @@ async def admin_analyze_csv_for_template( seed_meta = None if seed_row: - t_sig = [normalize_header_for_signature(str(s)) for s in (seed_row.get("column_signature") or [])] + t_sig = list(seed_row.get("column_signature") or []) + sm = headers_signature_rank_metrics(sig, t_sig) seed_meta = { "id": seed_row["id"], "mapping_name": seed_row.get("mapping_name"), - "confidence": round(headers_signature_match_score(sig, sorted(set(t_sig))), 4) - if t_sig - else 0.0, + "confidence": sm["confidence"], + "template_recall": sm["template_recall"], + "jaccard": sm["jaccard"], + "columns_matched": sm["columns_matched"], + "columns_in_template": sm["columns_in_template"], + "columns_in_csv": sm["columns_in_csv"], } return { diff --git a/backend/routers/csv_import.py b/backend/routers/csv_import.py index 3777884..20c23fc 100644 --- a/backend/routers/csv_import.py +++ b/backend/routers/csv_import.py @@ -20,7 +20,7 @@ from csv_parser.core import ( decode_raw_bytes, column_signature, get_csv_import_limits, - headers_signature_match_score, + headers_signature_rank_metrics, normalize_header_for_signature, parse_csv_sample, ) @@ -247,19 +247,29 @@ async def analyze_csv( ranked = [] for t in 
def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns present in the CSV give recall 1.0; Jaccard is lower with many extra columns."""
    signature_of_csv = column_signature(
        ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    )
    signature_of_template = column_signature(["d", "e", "f"])

    metrics = headers_signature_rank_metrics(signature_of_csv, signature_of_template)

    # Exact-valued metrics: full recall, three of three template columns found.
    expected_exact = {
        "confidence": 1.0,
        "template_recall": 1.0,
        "columns_matched": 3,
        "columns_in_template": 3,
    }
    for metric_name, expected_value in expected_exact.items():
        assert metrics[metric_name] == expected_value

    # Three shared columns out of eight distinct columns overall.
    assert metrics["jaccard"] == pytest.approx(3 / 8)