import type { EdgeVocabulary } from "./types";

const BACKTICK_RE = /`([^`]+)`/g;

/** Matches markdown table alignment rows such as "| :--- | :--- |". */
const SEPARATOR_ROW_RE = /^\s*\|[\s:|-]+\|\s*$/;

/** Keywords that identify a table header row (German and English column titles). */
const HEADER_KEYWORD_RE =
  /canonical|system-typ|beschreibung|kategorie|category|description|inverser|aliasse/i;

/** Marker used in the alias column when a type intentionally has no aliases. */
const NO_ALIAS_RE = /\(Kein Alias\)/i;

/** Matches Title Case text such as "Kausale Beziehung" (including German umlauts). */
const TITLE_CASE_RE = /^[A-ZÄÖÜ][a-zäöüß]+(\s+[A-ZÄÖÜ][a-zäöüß]+)*$/;

/** Returns every non-empty, trimmed backticked token found in `text`, in order. */
function extractBacktickTokens(text: string): string[] {
  const tokens: string[] = [];
  // BACKTICK_RE is global and shared, so reset its cursor before each scan.
  BACKTICK_RE.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = BACKTICK_RE.exec(text)) !== null) {
    const token = match[1] ? match[1].trim() : "";
    if (token) {
      tokens.push(token);
    }
  }
  return tokens;
}

/**
 * Splits a table row into trimmed cells.
 *
 * Only the empty fragments produced by the leading and trailing pipe are
 * dropped; empty interior cells are kept so column positions stay stable.
 */
function splitTableCells(row: string): string[] {
  const cells = row.split("|").map((cell) => cell.trim());
  if (cells.length > 0 && cells[0] === "") {
    cells.shift();
  }
  if (cells.length > 0 && cells[cells.length - 1] === "") {
    cells.pop();
  }
  return cells;
}

/**
 * Scans the cells after the alias column (index 3+) for a description and an
 * optional category.
 *
 * A cell is treated as a category when it is short (< 40 chars) and is either
 * bracketed ("[Category]"), identical to its upper-cased form, or Title Case.
 * The first other non-empty cell becomes the description, with bold, italic
 * and code markers removed.
 */
function extractDescriptionAndCategory(cells: readonly string[]): {
  description: string | undefined;
  category: string | undefined;
} {
  let description: string | undefined = undefined;
  let category: string | undefined = undefined;

  for (let i = 3; i < cells.length; i++) {
    const cell = cells[i];
    if (!cell) continue;

    const looksLikeCategory =
      cell.length < 40 &&
      (/^\[.+\]$/.test(cell) || cell === cell.toUpperCase() || TITLE_CASE_RE.test(cell));

    if (looksLikeCategory && !category) {
      // Strip the surrounding brackets if present.
      category = cell.replace(/^\[|\]$/g, "");
    } else if (!description) {
      description = cell
        .replace(/\*\*/g, "") // bold
        .replace(/\*/g, "") // italic
        .replace(/`/g, "") // inline code
        .trim();
    }
  }

  return { description, category };
}

/**
 * Parses markdown tables containing edge vocabulary definitions.
 *
 * Expected format:
 * | System-Typ (Canonical) | Inverser Typ | Erlaubte Aliasse (User) | Beschreibung ... |
 * | **`caused_by`** | `resulted_in` | `ausgelöst_durch`, `wegen`, ... | ... |
 *
 * Rules:
 * - `### Heading` lines set the category for all following rows; it takes
 *   precedence over a category found inside the row.
 * - Alignment rows and non-table lines are ignored.
 * - A row is a header (and ignored) when it contains a header keyword and its
 *   first cell holds no backticked token. Data rows whose description merely
 *   mentions such a keyword are therefore still parsed.
 * - Rows in the documented layout (backticked token in the first cell and at
 *   least three cells) are read by column: first token of cell 1 = canonical
 *   (may be wrapped in ** **), first token of cell 2 = inverse (optional),
 *   all tokens of cell 3 = aliases. An empty inverse cell no longer turns the
 *   first alias into the inverse, and backticked words in the description are
 *   not mistaken for aliases.
 * - Any other row falls back to row-wide token order: first token = canonical,
 *   second = inverse, remaining = aliases.
 * - Aliases are dropped when the alias cell (or, in the fallback, the row)
 *   contains "(Kein Alias)".
 * - Rows without any backticked token are skipped and counted; the count is
 *   reported once via `console.debug`.
 * - Canonical names are stored as-is; alias lookup keys are lower-cased and the
 *   first definition of an alias wins.
 *
 * @param md Markdown source containing one or more vocabulary tables.
 * @returns The canonical entries keyed by canonical name, plus the
 *          lower-cased alias → canonical lookup.
 */
export function parseEdgeVocabulary(md: string): EdgeVocabulary {
  const byCanonical = new Map();
  const aliasToCanonical = new Map();
  let skippedRows = 0;
  let currentCategory: string | null = null; // Category from the latest H3 heading

  for (const line of md.split(/\r?\n/)) {
    // H3 headings (###) act as category separators.
    const h3Match = line.match(/^###\s+(.+)$/);
    if (h3Match && h3Match[1]) {
      currentCategory = h3Match[1].trim();
      continue;
    }

    if (SEPARATOR_ROW_RE.test(line)) continue;
    if (!line.trim().startsWith("|")) continue;

    const cells = splitTableCells(line);
    const firstCellTokens = extractBacktickTokens(cells[0] ?? "");

    // Header rows carry column titles but no canonical token in the first cell.
    if (firstCellTokens.length === 0 && HEADER_KEYWORD_RE.test(line)) continue;

    const rowTokens = extractBacktickTokens(line);
    if (rowTokens.length < 1) {
      skippedRows++;
      continue;
    }

    let canonical: string | undefined;
    let inverse: string | undefined;
    let aliasCandidates: string[];
    let hasNoAliases: boolean;

    if (firstCellTokens.length > 0 && cells.length >= 3) {
      // Documented layout: read each value from its own column.
      const aliasCell = cells[2] ?? "";
      canonical = firstCellTokens[0];
      inverse = extractBacktickTokens(cells[1] ?? "")[0];
      aliasCandidates = extractBacktickTokens(aliasCell);
      hasNoAliases = NO_ALIAS_RE.test(aliasCell);
    } else {
      // Unknown layout: fall back to row-wide token order.
      canonical = rowTokens[0];
      inverse = rowTokens[1];
      aliasCandidates = rowTokens.slice(2);
      hasNoAliases = NO_ALIAS_RE.test(line);
    }

    if (!canonical) {
      skippedRows++;
      continue;
    }

    const aliases: string[] = hasNoAliases ? [] : aliasCandidates;
    const { description, category } = extractDescriptionAndCategory(cells);

    byCanonical.set(canonical, {
      canonical,
      inverse,
      aliases,
      description,
      // The H3 heading wins over a category cell inside the row.
      category: currentCategory || category,
    });

    // Case-insensitive alias lookup; the first definition of an alias wins.
    for (const alias of aliases) {
      const lowerAlias = alias.toLowerCase();
      if (!aliasToCanonical.has(lowerAlias)) {
        aliasToCanonical.set(lowerAlias, canonical);
      }
    }
  }

  if (skippedRows > 0) {
    // Header and separator rows are expected and never counted here; only
    // table rows without any backticked token are.
    console.debug(`parseEdgeVocabulary: Skipped ${skippedRows} data rows with insufficient tokens (this is normal if the file contains empty or malformed table rows)`);
  }

  return {
    byCanonical,
    aliasToCanonical,
  };
}