feat(csv-parser): Introduce header signature ranking metrics for enhanced CSV analysis
All checks were successful
Deploy Development / deploy (push) Successful in 52s
Build Test / lint-backend (push) Successful in 0s
Build Test / build-frontend (push) Successful in 16s

- Added new functions for calculating header signature recall and ranking metrics, improving the analysis of CSV templates.
- Updated existing CSV analysis endpoints to utilize the new ranking metrics, enhancing the accuracy of template matching.
- Refactored related code to replace Jaccard score calculations with the new metrics, providing a more comprehensive evaluation of CSV structure.
- Improved documentation for new functions to clarify their purpose and usage in the context of CSV template analysis.
This commit is contained in:
Lars 2026-04-10 07:08:21 +02:00
parent c10da55ec6
commit b4cc3cb934
7 changed files with 223 additions and 60 deletions

View File

@ -7,7 +7,7 @@ from __future__ import annotations
import csv
import io
import re
from typing import Any, Dict, Iterator, List, Tuple
from typing import Any, Dict, Iterator, List, Sequence, Tuple
_DEFAULT_DELIMS = [",", ";", "\t"]
@ -115,7 +115,7 @@ def column_signature(headers: List[str]) -> List[str]:
def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -> float:
"""Jaccard-Überlappung 0..1."""
"""Jaccard-Überlappung 0..1 (|A∩B|/|AB|). Fällt stark, wenn die CSV viele Zusatzspalten hat."""
a, b = set(sig_csv), set(sig_template)
if not a and not b:
return 1.0
@ -126,6 +126,46 @@ def headers_signature_match_score(sig_csv: List[str], sig_template: List[str]) -
return inter / union if union else 0.0
def headers_signature_template_recall(sig_csv: Sequence[str], sig_template: Sequence[str]) -> float:
    """
    Return the share of template signature columns found in the CSV: |A∩B|/|B|.

    A is the set of ``sig_csv`` entries (used as given); B is the set of
    ``sig_template`` entries after passing each through
    ``normalize_header_for_signature`` and dropping empty results.

    The value reaches 1.0 as soon as every template column is present in the
    file, no matter how many additional columns the CSV carries (e.g. a file
    combining weight and nutrition columns is not penalised for either
    template).

    If the template contributes no columns, the result is 1.0 for an empty
    CSV signature and 0.0 otherwise.
    """
    csv_cols = set(sig_csv)
    template_cols = {
        name
        for name in (normalize_header_for_signature(str(raw)) for raw in sig_template)
        if name != ""
    }
    if template_cols:
        return len(template_cols.intersection(csv_cols)) / len(template_cols)
    # Nothing to cover: only an equally empty CSV counts as a full match.
    return 0.0 if csv_cols else 1.0
def headers_signature_rank_metrics(sig_csv: Sequence[str], sig_template: Sequence[str]) -> dict[str, Any]:
    """
    Return the uniform set of metrics used for template ranking and the UI.

    ``sig_csv`` entries are used as given; ``sig_template`` entries are passed
    through ``normalize_header_for_signature`` and empty results are dropped
    before comparison.

    Returned keys:
        confidence: same value as ``template_recall`` (recommended as the
            primary value for display and sorting).
        template_recall: |A∩B|/|B|, rounded to 4 decimals. When the template
            contributes no columns it is 1.0 for an empty CSV signature and
            0.0 otherwise.
        jaccard: |A∩B|/|A∪B|, rounded to 4 decimals. Two empty signatures
            yield 1.0, in line with ``headers_signature_match_score``.
        columns_matched: number of columns present in both signatures.
        columns_in_template: number of distinct template columns.
        columns_in_csv: number of distinct CSV columns.
    """
    csv_cols = set(sig_csv)
    template_cols = {normalize_header_for_signature(str(x)) for x in sig_template}
    template_cols.discard("")

    n_matched = len(csv_cols & template_cols)
    n_template = len(template_cols)
    n_csv = len(csv_cols)
    n_union = len(csv_cols | template_cols)

    if n_template:
        template_recall = n_matched / n_template
    else:
        # No template columns to cover: only an equally empty CSV is a match.
        template_recall = 0.0 if n_csv else 1.0

    # An empty union means both signatures are empty, i.e. identical; report
    # 1.0 so this value agrees with headers_signature_match_score.
    jaccard = n_matched / n_union if n_union else 1.0

    recall_rounded = round(template_recall, 4)
    return {
        "confidence": recall_rounded,
        "template_recall": recall_rounded,
        "jaccard": round(jaccard, 4),
        "columns_matched": n_matched,
        "columns_in_template": n_template,
        "columns_in_csv": n_csv,
    }
def get_csv_import_limits(conn_row: dict | None) -> dict[str, int]:
"""Liest Limits aus system_config.csv_import; Fallback bei fehlendem Key."""
defaults = {"max_rows_per_file": 50_000, "max_file_bytes": 52_428_800}

View File

@ -15,7 +15,7 @@ from csv_parser.core import (
column_signature,
decode_raw_bytes,
get_csv_import_limits,
headers_signature_match_score,
headers_signature_rank_metrics,
normalize_header_for_signature,
parse_csv_sample,
)
@ -134,7 +134,7 @@ async def admin_analyze_csv_for_template(
):
"""
CSV hochladen wie im Nutzer-Import: Spalten + Vorschau + Vorschläge für field_mappings
und type_conversions. Optional Seed-Vorlage (ID) oder beste Jaccard-Systemvorlage für das Modul.
und type_conversions. Optional Seed-Vorlage (ID) oder beste Systemvorlage (Abdeckung, dann Jaccard).
"""
_ = session
if not get_module_definition(module):
@ -191,15 +191,15 @@ async def admin_analyze_csv_for_template(
)
rows = [r2d(r) for r in cur.fetchall()]
best: dict | None = None
best_score = -1.0
best_key: tuple[float, int, float] = (-1.0, -1, -1.0)
for t in rows:
t_sig = list(t.get("column_signature") or [])
t_norm = sorted({normalize_header_for_signature(str(s)) for s in t_sig})
score = headers_signature_match_score(sig, t_norm)
if score > best_score:
best_score = score
m = headers_signature_rank_metrics(sig, t_sig)
key = (m["confidence"], m["columns_matched"], m["jaccard"])
if key > best_key:
best_key = key
best = t
if best and best_score > 0:
if best and best_key[0] > 0:
seed_row = best
seed_fm = (seed_row or {}).get("field_mappings") or {}
@ -214,13 +214,17 @@ async def admin_analyze_csv_for_template(
seed_meta = None
if seed_row:
t_sig = [normalize_header_for_signature(str(s)) for s in (seed_row.get("column_signature") or [])]
t_sig = list(seed_row.get("column_signature") or [])
sm = headers_signature_rank_metrics(sig, t_sig)
seed_meta = {
"id": seed_row["id"],
"mapping_name": seed_row.get("mapping_name"),
"confidence": round(headers_signature_match_score(sig, sorted(set(t_sig))), 4)
if t_sig
else 0.0,
"confidence": sm["confidence"],
"template_recall": sm["template_recall"],
"jaccard": sm["jaccard"],
"columns_matched": sm["columns_matched"],
"columns_in_template": sm["columns_in_template"],
"columns_in_csv": sm["columns_in_csv"],
}
return {

View File

@ -20,7 +20,7 @@ from csv_parser.core import (
decode_raw_bytes,
column_signature,
get_csv_import_limits,
headers_signature_match_score,
headers_signature_rank_metrics,
normalize_header_for_signature,
parse_csv_sample,
)
@ -247,19 +247,29 @@ async def analyze_csv(
ranked = []
for t in templates:
t_sig = list(t["column_signature"]) if t["column_signature"] else []
t_norm = sorted({normalize_header_for_signature(str(s)) for s in t_sig})
score = headers_signature_match_score(sig, t_norm)
metrics = headers_signature_rank_metrics(sig, t_sig)
ranked.append(
{
"mapping_id": t["id"],
"module": t["module"],
"mapping_name": t["mapping_name"],
"is_system": bool(t.get("is_system")),
"confidence": round(score, 4),
"match_type": "signature_jaccard",
"confidence": metrics["confidence"],
"template_recall": metrics["template_recall"],
"jaccard": metrics["jaccard"],
"columns_matched": metrics["columns_matched"],
"columns_in_template": metrics["columns_in_template"],
"columns_in_csv": metrics["columns_in_csv"],
"match_type": "template_recall",
}
)
ranked.sort(key=lambda x: -x["confidence"])
ranked.sort(
key=lambda x: (
-(x.get("confidence") or 0),
-(x.get("columns_matched") or 0),
-(x.get("jaccard") or 0),
),
)
top = ranked[:25]
recommended = top[0] if top and (top[0]["confidence"] or 0) > 0 else None

View File

@ -8,6 +8,7 @@ from csv_parser.core import (
parse_csv_sample,
column_signature,
headers_signature_match_score,
headers_signature_rank_metrics,
get_csv_import_limits,
iter_csv_dict_rows,
)
@ -46,6 +47,20 @@ def test_jaccard():
assert headers_signature_match_score(s1, s2) == pytest.approx(2 / 3)
def test_template_recall_full_when_csv_has_extra_columns():
    """All template columns are in the CSV -> recall 1.0; Jaccard is lower with many extra columns."""
    csv_headers = ["D", "E", "F", "Extra1", "Extra2", "Extra3", "Extra4", "Extra5"]
    template_headers = ["d", "e", "f"]

    metrics = headers_signature_rank_metrics(
        column_signature(csv_headers),
        column_signature(template_headers),
    )

    # Recall-based values ignore the five extra CSV columns entirely.
    assert metrics["confidence"] == 1.0
    assert metrics["template_recall"] == 1.0
    assert metrics["columns_matched"] == 3
    assert metrics["columns_in_template"] == 3
    # Jaccard still reflects them: 3 shared columns out of 8 distinct ones.
    assert metrics["jaccard"] == pytest.approx(3 / 8)
def test_get_csv_import_limits_default():
assert get_csv_import_limits(None)["max_rows_per_file"] == 50_000

View File

@ -2,6 +2,7 @@ import { useEffect, useMemo, useState } from 'react'
import { Link, useNavigate, useParams } from 'react-router-dom'
import { ArrowLeft, FileSpreadsheet, Loader2, Save, Trash2 } from 'lucide-react'
import { api } from '../utils/api'
import { csvPreviewTdStyle } from '../utils/csvPreviewCells'
const MODULE_LABEL = {
nutrition: 'Ernährung',
@ -38,16 +39,7 @@ function SampleTable({ sampleRows, columns }) {
{sampleRows.slice(0, 5).map((row, i) => (
<tr key={i}>
{showCols.map((c) => (
<td
key={c}
style={{
padding: '6px',
borderBottom: '1px solid var(--border)',
maxWidth: 140,
overflow: 'hidden',
textOverflow: 'ellipsis',
}}
>
<td key={c} style={csvPreviewTdStyle(row[c] ?? '—')}>
{row[c] ?? '—'}
</td>
))}
@ -298,7 +290,7 @@ export default function AdminCsvTemplateEditorPage() {
value={module}
disabled={!isNew}
onChange={(e) => setModule(e.target.value)}
style={{ width: '100%', marginTop: 8 }}
style={{ width: '100%', marginTop: 8, textAlign: 'left', minHeight: 46, padding: '11px 14px' }}
>
{modules.map((m) => (
<option key={m.id} value={m.id}>
@ -327,7 +319,7 @@ export default function AdminCsvTemplateEditorPage() {
Trennzeichen (optional, sonst automatisch):
<select
className="form-input"
style={{ width: '100%', marginTop: 6 }}
style={{ width: '100%', marginTop: 6, textAlign: 'left', minHeight: 44, padding: '10px 12px' }}
value={delimiterOverride}
onChange={(e) => setDelimiterOverride(e.target.value)}
>
@ -341,11 +333,11 @@ export default function AdminCsvTemplateEditorPage() {
Optional: feste Seed-Vorlage für Vorschläge:
<select
className="form-input"
style={{ width: '100%', marginTop: 6 }}
style={{ width: '100%', marginTop: 6, textAlign: 'left', minHeight: 44, padding: '10px 12px' }}
value={seedTemplateId}
onChange={(e) => setSeedTemplateId(e.target.value)}
>
<option value="">Beste passende System-Vorlage (Jaccard)</option>
<option value="">Beste passende System-Vorlage (Abdeckung der Vorlagen-Spalten)</option>
{seedOptions.map((s) => (
<option key={s.id} value={String(s.id)}>
{s.mapping_name}
@ -371,9 +363,18 @@ export default function AdminCsvTemplateEditorPage() {
)}
</button>
{seedHint && (
<p style={{ fontSize: 13, color: 'var(--text2)', marginTop: 12 }}>
Seed: <strong>{seedHint.mapping_name}</strong> · Übereinstimmung ca.{' '}
{Math.round((seedHint.confidence || 0) * 100)} %
<p style={{ fontSize: 13, color: 'var(--text2)', marginTop: 12, lineHeight: 1.5 }}>
Seed: <strong>{seedHint.mapping_name}</strong> · Vorlage abgedeckt{' '}
<strong>{Math.round((seedHint.confidence || 0) * 100)} %</strong>
{seedHint.columns_matched != null && seedHint.columns_in_template != null
? ` (${seedHint.columns_matched}/${seedHint.columns_in_template} Spalten)`
: ''}
{seedHint.jaccard != null && (
<>
{' '}
· Jaccard <strong>{Math.round(seedHint.jaccard * 100)} %</strong>
</>
)}
</p>
)}
{sampleRows.length > 0 && <SampleTable sampleRows={sampleRows} columns={columns} />}
@ -386,7 +387,7 @@ export default function AdminCsvTemplateEditorPage() {
</label>
<input
className="form-input"
style={{ width: '100%' }}
style={{ width: '100%', textAlign: 'left' }}
value={mappingName}
onChange={(e) => setMappingName(e.target.value)}
placeholder="z. B. FDDB Export 2026"
@ -396,14 +397,19 @@ export default function AdminCsvTemplateEditorPage() {
</label>
<textarea
className="form-input"
style={{ width: '100%', minHeight: 64 }}
style={{ width: '100%', minHeight: 64, textAlign: 'left' }}
value={description}
onChange={(e) => setDescription(e.target.value)}
/>
<div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 12, marginTop: 12 }}>
<label>
<span className="form-label">Trennzeichen (gespeichert)</span>
<select className="form-input" style={{ width: '100%', marginTop: 6 }} value={delimiter} onChange={(e) => setDelimiter(e.target.value)}>
<select
className="form-input"
style={{ width: '100%', marginTop: 6, textAlign: 'left', minHeight: 44, padding: '10px 12px' }}
value={delimiter}
onChange={(e) => setDelimiter(e.target.value)}
>
<option value=";">Semikolon</option>
<option value=",">Komma</option>
<option value="\t">Tab</option>
@ -413,7 +419,7 @@ export default function AdminCsvTemplateEditorPage() {
<span className="form-label">Kopfzeile</span>
<select
className="form-input"
style={{ width: '100%', marginTop: 6 }}
style={{ width: '100%', marginTop: 6, textAlign: 'left', minHeight: 44, padding: '10px 12px' }}
value={hasHeader ? 'yes' : 'no'}
onChange={(e) => setHasHeader(e.target.value === 'yes')}
>
@ -435,16 +441,25 @@ export default function AdminCsvTemplateEditorPage() {
key={col}
style={{
display: 'grid',
gridTemplateColumns: 'minmax(0, 1fr) minmax(140px, 200px)',
gap: 10,
gridTemplateColumns: 'minmax(0, 1fr) minmax(280px, min(52vw, 440px))',
gap: '10px 16px',
alignItems: 'center',
}}
>
<code style={{ fontSize: 12, wordBreak: 'break-word', color: 'var(--text2)' }}>{col}</code>
<code style={{ fontSize: 12, wordBreak: 'break-word', color: 'var(--text2)', textAlign: 'left' }}>
{col}
</code>
<select
className="form-input"
value={fieldMappings[col] || '-'}
onChange={(e) => updateMapping(col, e.target.value)}
style={{
width: '100%',
minHeight: 46,
textAlign: 'left',
padding: '11px 14px',
fontSize: 15,
}}
>
<option value="-"> ignorieren</option>
{targetOptions.map((o) => (
@ -471,7 +486,14 @@ export default function AdminCsvTemplateEditorPage() {
</p>
<textarea
className="form-input"
style={{ width: '100%', minHeight: 200, marginTop: 8, fontFamily: 'monospace', fontSize: 12 }}
style={{
width: '100%',
minHeight: 200,
marginTop: 8,
fontFamily: 'monospace',
fontSize: 12,
textAlign: 'left',
}}
value={typeConversionsText}
onChange={(e) => setTypeConversionsText(e.target.value)}
/>

View File

@ -2,6 +2,7 @@ import { useState, useMemo } from 'react'
import { useNavigate } from 'react-router-dom'
import { ArrowLeft, FileSpreadsheet, Loader2 } from 'lucide-react'
import { api } from '../utils/api'
import { csvPreviewTdStyle } from '../utils/csvPreviewCells'
/** Ziele, die der Universal-Executor bereits schreiben kann (ohne manuelle Modul-Wahl). */
const EXECUTOR_READY = new Set(['nutrition', 'weight', 'blood_pressure'])
@ -32,6 +33,11 @@ function mergeMappingChoices(detected, mapData) {
name: row.name,
is_system: row.is_system,
confidence: d.confidence ?? 0,
jaccard: d.jaccard,
template_recall: d.template_recall,
columns_matched: d.columns_matched,
columns_in_template: d.columns_in_template,
columns_in_csv: d.columns_in_csv,
})
seen.add(row.id)
}
@ -78,16 +84,7 @@ function SampleTable({ sampleRows, columns }) {
{sampleRows.slice(0, 5).map((row, i) => (
<tr key={i}>
{showCols.map((c) => (
<td
key={c}
style={{
padding: '6px',
borderBottom: '1px solid var(--border)',
maxWidth: 140,
overflow: 'hidden',
textOverflow: 'ellipsis',
}}
>
<td key={c} style={csvPreviewTdStyle(row[c] ?? '—')}>
{row[c] ?? '—'}
</td>
))}
@ -296,8 +293,24 @@ export default function UniversalCsvImportPage() {
>
<strong>Vorschlag:</strong>{' '}
{MODULE_LABEL[analyzeResult.recommended.module] || analyzeResult.recommended.module} {' '}
{analyzeResult.recommended.mapping_name} (
{Math.round((analyzeResult.recommended.confidence || 0) * 100)} % Übereinstimmung der Spalten)
{analyzeResult.recommended.mapping_name}.
<br />
<span style={{ fontSize: 13, color: 'var(--text2)', fontWeight: 500 }}>
Vorlage abgedeckt:{' '}
<strong>{Math.round((analyzeResult.recommended.confidence || 0) * 100)} %</strong>
{analyzeResult.recommended.columns_matched != null &&
analyzeResult.recommended.columns_in_template != null
? ` (${analyzeResult.recommended.columns_matched}/${analyzeResult.recommended.columns_in_template} erwartete Spalten in der Datei)`
: ''}
.{' '}
{analyzeResult.recommended.jaccard != null && (
<>
Jaccard{' '}
<strong>{Math.round(analyzeResult.recommended.jaccard * 100)} %</strong> (gesamte
Spalten-Überlappung niedriger, wenn die CSV viele Zusatzspalten hat).
</>
)}
</span>
</div>
)}
@ -307,8 +320,9 @@ export default function UniversalCsvImportPage() {
<ul style={{ margin: '8px 0 0 18px', padding: 0 }}>
{analyzeResult.detected_mappings.slice(1, 8).map((d) => (
<li key={d.mapping_id}>
{MODULE_LABEL[d.module] || d.module}: {d.mapping_name} ·{' '}
{MODULE_LABEL[d.module] || d.module}: {d.mapping_name} · Vorlage{' '}
{Math.round((d.confidence || 0) * 100)} %
{d.jaccard != null ? ` · Jaccard ${Math.round(d.jaccard * 100)} %` : ''}
</li>
))}
</ul>
@ -324,7 +338,14 @@ export default function UniversalCsvImportPage() {
className="form-input"
value={mappingId}
onChange={(e) => setMappingId(e.target.value)}
style={{ width: '100%', marginTop: 8 }}
style={{
width: '100%',
marginTop: 8,
minHeight: 48,
textAlign: 'left',
padding: '12px 14px',
fontSize: 15,
}}
>
{mappingChoices.length === 0 ? (
<option value="">Keine Vorlage geladen</option>
@ -333,7 +354,11 @@ export default function UniversalCsvImportPage() {
<option key={o.id} value={o.id}>
{MODULE_LABEL[o.module] || o.module} {o.name}
{o.is_system ? ' (System)' : ''}
{o.confidence > 0 ? ` · ${Math.round(o.confidence * 100)} %` : ''}
{o.confidence > 0
? ` · Vorlage ${Math.round(o.confidence * 100)} %${
o.jaccard != null ? ` · Jaccard ${Math.round(o.jaccard * 100)} %` : ''
}`
: ''}
{!EXECUTOR_READY.has(o.module) ? ' · Import: noch nicht hier' : ''}
</option>
))

View File

@ -0,0 +1,47 @@
/**
 * Vorschau-Zellen: Zahlen rechts (tabular-nums), Text links – typisch für CSV-Tabellen.
*/
/**
 * Convert the value to a string and remove every whitespace character from it.
 */
function stripForNumericTest(s) {
  // `\s` matches the same characters trim() strips and also U+00A0,
  // so a single global pass removes everything in one go.
  return String(s).replace(/\s+/g, '')
}
/**
 * Simple heuristic: true for a plain integer or decimal with an optional
 * leading sign and at most one separator (either one dot or one comma).
 * Values that start like a date (d.m.y, d/m/y, yyyy-mm-dd) or that contain
 * a colon together with a digit (time-like) are not treated as numbers.
 * `null`, `undefined` and the placeholder '—' are never numeric.
 */
export function isCsvPreviewNumericCell(val) {
  if (val === null || val === undefined || val === '—') return false

  const compact = stripForNumericTest(val)
  if (compact === '') return false

  const startsLikeDate =
    /^\d{1,2}[./]\d{1,2}[./]\d{2,4}/.test(compact) || /^\d{4}-\d{2}-\d{2}/.test(compact)
  const looksLikeTime = /:/.test(compact) && /\d/.test(compact)
  if (startsLikeDate || looksLikeTime) return false

  const unsigned = compact.replace(/^[-+]/, '')

  // More than one separator in total (two commas, two dots, or one of each)
  // is ambiguous, so it is not counted as a number.
  const commaCount = unsigned.split(',').length - 1
  const dotCount = unsigned.split('.').length - 1
  if (commaCount + dotCount > 1) return false

  const normalized = unsigned.replace(',', '.')
  if (!/^\d*\.?\d+$/.test(normalized)) return false

  return Number.isFinite(Number(normalized))
}
/**
 * Build the inline style object for a CSV preview <td>.
 * Numeric-looking values (per isCsvPreviewNumericCell) are right-aligned with
 * tabular figures; everything else is left-aligned. Entries in `extra` are
 * applied last and therefore override the defaults.
 */
export function csvPreviewTdStyle(val, extra = {}) {
  const alignRight = isCsvPreviewNumericCell(val)
  const baseStyle = {
    padding: '6px 8px',
    borderBottom: '1px solid var(--border)',
    maxWidth: 220,
    overflow: 'hidden',
    textOverflow: 'ellipsis',
    textAlign: alignRight ? 'right' : 'left',
    fontVariantNumeric: alignRight ? 'tabular-nums' : undefined,
  }
  return { ...baseStyle, ...extra }
}