#!/usr/bin/env node import { execSync } from 'child_process'; import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; import { join, dirname, basename } from 'path'; import { fileURLToPath } from 'url'; import { cleanBibliography } from './bib-cleaner.mjs'; import { postProcessMarkdown } from './post-processor.mjs'; import { preprocessLatexReferences } from './reference-preprocessor.mjs'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); // Configuration const DEFAULT_INPUT = join(__dirname, 'input', 'main.tex'); const DEFAULT_OUTPUT = join(__dirname, 'output'); function parseArgs() { const args = process.argv.slice(2); const config = { input: DEFAULT_INPUT, output: DEFAULT_OUTPUT, clean: false }; for (const arg of args) { if (arg.startsWith('--input=')) { config.input = arg.split('=')[1]; } else if (arg.startsWith('--output=')) { config.output = arg.split('=')[1]; } else if (arg === '--clean') { config.clean = true; } } return config; } function ensureDirectory(dir) { if (!existsSync(dir)) { mkdirSync(dir, { recursive: true }); } } function cleanDirectory(dir) { if (existsSync(dir)) { execSync(`rm -rf "${dir}"/*`, { stdio: 'inherit' }); } } function preprocessLatexFile(inputFile, outputDir) { const inputDir = dirname(inputFile); const tempFile = join(outputDir, 'temp_main.tex'); console.log('๐Ÿ”„ Preprocessing LaTeX file to resolve \\input commands...'); let content = readFileSync(inputFile, 'utf8'); // Remove problematic commands that break pandoc console.log('๐Ÿงน Cleaning problematic LaTeX constructs...'); // Fix citation issues - but not in citation keys content = content.replace(/\$p_0\$(?![A-Za-z])/g, 'p0'); // Convert complex math environments to simple delimiters content = content.replace(/\$\$\\begin\{equation\*\}/g, '$$'); content = content.replace(/\\end\{equation\*\}\$\$/g, '$$'); content = content.replace(/\\begin\{equation\*\}/g, '$$'); content = content.replace(/\\end\{equation\*\}/g, '$$'); // Keep align environments intact for KaTeX support // Protect align environments by temporarily replacing them before cleaning & operators const alignBlocks = []; content = content.replace(/\\begin\{align\}([\s\S]*?)\\end\{align\}/g, (match, alignContent) => { alignBlocks.push(match); return `__ALIGN_BLOCK_${alignBlocks.length - 1}__`; }); // Now remove & operators from non-align content (outside align environments) content = content.replace(/&=/g, '='); content = content.replace(/&/g, ''); // Restore align blocks with their & operators intact alignBlocks.forEach((block, index) => { content = content.replace(`__ALIGN_BLOCK_${index}__`, block); }); // Convert LaTeX citations to Pandoc format content = content.replace(/\\cite[tp]?\{([^}]+)\}/g, (match, citations) => { // Handle multiple citations separated by commas - all become simple @citations return citations.split(',').map(cite => `@${cite.trim()}`).join(', '); }); // Handle complex \textsc with nested math - extract and simplify (but not in command definitions) content = content.replace(/\\textsc\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, content_inside, offset) => { // Skip if this is inside a \newcommand or similar definition const before = content.substring(Math.max(0, offset - 50), offset); if (before.includes('\\newcommand') || before.includes('\\renewcommand') || before.includes('\\def')) { return match; // Keep original } // Remove math delimiters inside textsc for simplification const simplified = content_inside.replace(/\\\([^)]+\\\)/g, 'MATHEXPR'); return `\\text{${simplified}}`; }); // Remove complex custom commands that pandoc can't handle content = content.replace(/\\input\{snippets\/[^}]+\}/g, '% Code snippet removed'); // Find all \input{} commands (but skip commented ones) const inputRegex = /^([^%]*?)\\input\{([^}]+)\}/gm; let match; while ((match = inputRegex.exec(content)) !== null) { const beforeInput = match[1]; const inputPath = match[2]; // Skip if the \input is commented (% appears before \input on the line) if (beforeInput.includes('%')) { continue; } let fullPath; // Skip only problematic files, let Pandoc handle macros if (inputPath.includes('snippets/')) { console.log(` Skipping: ${inputPath}`); content = content.replace(`\\input{${inputPath}}`, `% Skipped: ${inputPath}`); continue; } // Handle paths with or without .tex extension if (inputPath.endsWith('.tex')) { fullPath = join(inputDir, inputPath); } else { fullPath = join(inputDir, inputPath + '.tex'); } if (existsSync(fullPath)) { console.log(` Including: ${inputPath}`); let includedContent = readFileSync(fullPath, 'utf8'); // Clean included content too includedContent = includedContent.replace(/\$p_0\$/g, 'p0'); includedContent = includedContent.replace(/\\input\{snippets\/[^}]+\}/g, '% Code snippet removed'); // Handle complex \textsc in included content includedContent = includedContent.replace(/\\textsc\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}/g, (match, content_inside, offset) => { // Skip if this is inside a \newcommand or similar definition const before = includedContent.substring(Math.max(0, offset - 50), offset); if (before.includes('\\newcommand') || before.includes('\\renewcommand') || before.includes('\\def')) { return match; // Keep original } const simplified = content_inside.replace(/\\\([^)]+\\\)/g, 'MATHEXPR'); return `\\text{${simplified}}`; }); // Apply same align-preserving logic to included content const alignBlocksIncluded = []; includedContent = includedContent.replace(/\\begin\{align\}([\s\S]*?)\\end\{align\}/g, (match, alignContent) => { alignBlocksIncluded.push(match); return `__ALIGN_BLOCK_${alignBlocksIncluded.length - 1}__`; }); // Remove alignment operators from non-align content in included files includedContent = includedContent.replace(/&=/g, '='); includedContent = includedContent.replace(/&/g, ''); // Restore align blocks with their & operators intact alignBlocksIncluded.forEach((block, index) => { includedContent = includedContent.replace(`__ALIGN_BLOCK_${index}__`, block); }); // Convert math environments in included content includedContent = includedContent.replace(/\$\$\\begin\{equation\*\}/g, '$$'); includedContent = includedContent.replace(/\\end\{equation\*\}\$\$/g, '$$'); includedContent = includedContent.replace(/\\begin\{equation\*\}/g, '$$'); includedContent = includedContent.replace(/\\end\{equation\*\}/g, '$$'); // Convert citations in included content includedContent = includedContent.replace(/\\cite[tp]?\{([^}]+)\}/g, (match, citations) => { return citations.split(',').map(cite => `@${cite.trim()}`).join(', '); }); content = content.replace(`\\input{${inputPath}}`, includedContent); } else { console.log(` โš ๏ธ File not found: ${fullPath} (skipping)`); content = content.replace(`\\input{${inputPath}}`, `% File not found: ${inputPath}`); } } // Apply reference preprocessing AFTER input inclusion to ensure all references are captured console.log('๐Ÿ”ง Preprocessing LaTeX references for MDX compatibility...'); const referenceResult = preprocessLatexReferences(content); content = referenceResult.content; // Write the preprocessed file writeFileSync(tempFile, content); return tempFile; } function processBibliography(inputFile, outputDir) { const bibFile = join(dirname(inputFile), 'main.bib'); const outputBibFile = join(outputDir, 'main.bib'); if (!existsSync(bibFile)) { console.log(' โš ๏ธ No bibliography file found'); return null; } const success = cleanBibliography(bibFile, outputBibFile); return success ? outputBibFile : null; } export function convertLatexToMarkdown(inputFile, outputDir) { console.log('๐Ÿš€ Simple LaTeX to Markdown Converter'); console.log(`๐Ÿ“ Input: ${inputFile}`); console.log(`๐Ÿ“ Output: ${outputDir}`); // Check if input file exists if (!existsSync(inputFile)) { console.error(`โŒ Input file not found: ${inputFile}`); process.exit(1); } // Ensure output directory exists ensureDirectory(outputDir); try { // Check if pandoc is available execSync('pandoc --version', { stdio: 'pipe' }); } catch (error) { console.error('โŒ Pandoc not found. Please install it: brew install pandoc'); process.exit(1); } // Clean and copy bibliography const cleanBibFile = processBibliography(inputFile, outputDir); // Preprocess the LaTeX file to resolve \input commands const preprocessedFile = preprocessLatexFile(inputFile, outputDir); const inputFileName = basename(inputFile, '.tex'); const outputFile = join(outputDir, `${inputFileName}.md`); try { console.log('๐Ÿ“„ Converting with Pandoc...'); // Enhanced pandoc conversion - use tex_math_dollars for KaTeX compatibility const bibOption = cleanBibFile ? `--bibliography="${cleanBibFile}"` : ''; // Use gfm+tex_math_dollars for simple $ delimiters compatible with KaTeX const mediaDir = join(outputDir, 'assets', 'image'); ensureDirectory(mediaDir); const inputDir = dirname(inputFile); const equationFilterPath = join(__dirname, 'filters', 'equation-ids.lua'); const pandocCommand = `pandoc "${preprocessedFile}" -f latex+latex_macros -t gfm+tex_math_dollars+raw_html --shift-heading-level-by=1 --wrap=none ${bibOption} --extract-media="${mediaDir}" --resource-path="${inputDir}" --lua-filter="${equationFilterPath}" -o "${outputFile}"`; console.log(` Running: ${pandocCommand}`); execSync(pandocCommand, { stdio: 'pipe' }); // Clean up temp file execSync(`rm "${preprocessedFile}"`, { stdio: 'pipe' }); // Post-processing to fix KaTeX incompatible constructions let markdownContent = readFileSync(outputFile, 'utf8'); // Use modular post-processor with code injection markdownContent = postProcessMarkdown(markdownContent, inputDir); writeFileSync(outputFile, markdownContent); console.log(`โœ… Conversion completed: ${outputFile}`); // Show file size const stats = execSync(`wc -l "${outputFile}"`, { encoding: 'utf8' }); const lines = stats.trim().split(' ')[0]; console.log(`๐Ÿ“Š Result: ${lines} lines written`); } catch (error) { console.error('โŒ Pandoc conversion failed:'); console.error(error.message); // Clean up temp file on error try { execSync(`rm "${preprocessedFile}"`, { stdio: 'pipe' }); } catch { } process.exit(1); } } function main() { const config = parseArgs(); if (config.clean) { console.log('๐Ÿงน Cleaning output directory...'); cleanDirectory(config.output); } convertLatexToMarkdown(config.input, config.output); console.log('๐ŸŽ‰ Simple conversion completed!'); } // Show help if requested if (process.argv.includes('--help') || process.argv.includes('-h')) { console.log(` ๐Ÿš€ Simple LaTeX to Markdown Converter Usage: node scripts/simple-latex-to-markdown.mjs [options] Options: --input=PATH Input LaTeX file (default: latex-converter/input-example/main.tex) --output=PATH Output directory (default: output/) --clean Clean output directory before conversion --help, -h Show this help Examples: # Basic conversion node scripts/simple-latex-to-markdown.mjs # Custom paths node scripts/simple-latex-to-markdown.mjs --input=my-paper.tex --output=converted/ # Clean output first node scripts/simple-latex-to-markdown.mjs --clean `); process.exit(0); } main();