mindnet_obsidian/src/vocab/parseEdgeVocabulary.ts

import type { EdgeVocabulary } from "./types";

/** Matches one backtick-delimited token; capture group 1 is the token text. */
const BACKTICK_RE = /`([^`]+)`/g;

/** Keywords that identify a table header row (only consulted for rows without a canonical token). */
const HEADER_KEYWORD_RE = /canonical|system-typ|beschreibung|kategorie|category|description|inverser|aliasse/i;

/** Shape of a single parsed vocabulary row as stored in `byCanonical`. */
type ParsedEdgeEntry = {
  canonical: string;
  inverse?: string;
  aliases: string[];
  description?: string;
  category?: string;
};

/**
 * Returns every non-empty, trimmed backticked token found in `text`, in order.
 */
function extractBacktickTokens(text: string): string[] {
  const found: string[] = [];
  // The regex is global and shared, so reset its cursor before each scan.
  BACKTICK_RE.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = BACKTICK_RE.exec(text)) !== null) {
    const raw = match[1];
    if (raw) {
      const token = raw.trim();
      if (token) {
        found.push(token);
      }
    }
  }
  return found;
}

/**
 * Splits a table row (whose trimmed form starts with "|") into trimmed cells.
 * Empty cells are kept so that column positions stay stable; only the
 * artefacts produced by the outer pipes are removed.
 */
function splitTableRow(line: string): string[] {
  const parts = line.trim().split("|").map((part) => part.trim());
  // The leading pipe always yields an empty first element.
  parts.shift();
  // A trailing pipe yields an empty last element; a row without one keeps its last cell.
  if (parts.length > 0 && parts[parts.length - 1] === "") {
    parts.pop();
  }
  return parts;
}

/**
 * Picks description and category from the cells after the alias column (index 3+).
 * The first cell that looks like a category (short and bracketed, all caps, or
 * Title Case) becomes the category; the first other non-empty cell becomes the
 * description, with bold/italic/code markers stripped.
 */
function extractDescriptionAndCategory(cells: string[]): { description?: string; category?: string } {
  let description: string | undefined = undefined;
  let category: string | undefined = undefined;

  for (let i = 3; i < cells.length; i++) {
    const cell = cells[i];
    if (!cell) continue;

    const looksLikeCategory =
      cell.length < 40 && (
        /^\[.+\]$/.test(cell) || // [Category]
        cell === cell.toUpperCase() || // ALL CAPS
        /^[A-ZÄÖÜ][a-zäöüß]+(\s+[A-ZÄÖÜ][a-zäöüß]+)*$/.test(cell) // Title Case
      );

    if (looksLikeCategory && !category) {
      // Remove surrounding brackets if present
      category = cell.replace(/^\[|\]$/g, "");
    } else if (!description) {
      description = cell
        .replace(/\*\*/g, "") // Remove bold
        .replace(/\*/g, "") // Remove italic
        .replace(/`/g, "") // Remove code
        .trim();
    }
  }

  return { description, category };
}

/**
 * Parses markdown tables containing edge vocabulary definitions.
 *
 * Expected format:
 * | System-Typ (Canonical) | Inverser Typ | Erlaubte Aliasse (User) | Beschreibung ... |
 * | **`caused_by`** | `resulted_in` | `ausgelöst_durch`, `wegen`, ... | ... |
 *
 * Rules:
 * - Columns are read by position; empty cells keep their place
 * - Column 1: first backticked token = canonical (may be wrapped in ** **)
 * - Column 2: first backticked token = inverse (optional)
 * - Column 3: all backticked tokens = aliases (none if the cell contains "(Kein Alias)")
 * - Columns 4+: description and optional category (heuristic, see helper)
 * - A `### Heading` sets the category for all following rows and takes
 *   precedence over a category found in the row itself
 * - Rows without a canonical token are skipped; they are counted as skipped
 *   unless they look like a header row
 * - Canonical is stored as-is; alias lookup keys are lowercased, and the first
 *   canonical to claim an alias keeps it
 *
 * @param md Markdown source containing one or more vocabulary tables.
 * @returns The parsed vocabulary (`byCanonical` and `aliasToCanonical` maps).
 */
export function parseEdgeVocabulary(md: string): EdgeVocabulary {
  const byCanonical = new Map<string, ParsedEdgeEntry>();
  const aliasToCanonical = new Map<string, string>();

  let skippedRows = 0;
  let currentCategory: string | null = null; // Category from the most recent H3 heading

  for (const line of md.split(/\r?\n/)) {
    // H3 headings (###) act as category separators
    const h3Match = line.match(/^###\s+(.+)$/);
    if (h3Match && h3Match[1]) {
      currentCategory = h3Match[1].trim();
      continue;
    }

    // Skip header separator rows (e.g., "| :--- | :--- |")
    if (/^\s*\|[\s:|-]+\|\s*$/.test(line)) {
      continue;
    }

    // Only process table rows
    if (!line.trim().startsWith("|")) {
      continue;
    }

    const cells = splitTableRow(line);

    // A data row must carry its canonical token in the first column.
    const canonical = extractBacktickTokens(cells[0] || "")[0];
    if (!canonical) {
      // Header rows are expected and not counted; anything else is a malformed data row.
      // The keyword check is only applied here so that data rows whose text happens
      // to contain words like "description" are not dropped.
      if (!HEADER_KEYWORD_RE.test(line)) {
        skippedRows++;
      }
      continue;
    }

    const inverse = extractBacktickTokens(cells[1] || "")[0];

    const aliasCell = cells[2] || "";
    const aliases = /\(Kein Alias\)/i.test(aliasCell) ? [] : extractBacktickTokens(aliasCell);

    const { description, category } = extractDescriptionAndCategory(cells);

    // Use currentCategory from H3 heading if available, otherwise use extracted category
    const finalCategory = currentCategory || category;

    byCanonical.set(canonical, {
      canonical,
      inverse,
      aliases,
      description,
      category: finalCategory,
    });

    // Build alias-to-canonical mapping (case-insensitive keys, first definition wins)
    for (const alias of aliases) {
      const lowerAlias = alias.toLowerCase();
      if (!aliasToCanonical.has(lowerAlias)) {
        aliasToCanonical.set(lowerAlias, canonical);
      }
    }
  }

  if (skippedRows > 0) {
    // Header and separator rows are expected and are not included in this count
    console.debug(`parseEdgeVocabulary: Skipped ${skippedRows} data rows with insufficient tokens (this is normal if the file contains empty or malformed table rows)`);
  }

  return {
    byCanonical,
    aliasToCanonical,
  };
}