Loading...
No commits yet
Not committed History
Blame
csv_to_latex.py • 8.6 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2024-09-28 18:20:00 (ywatanabe)"
# File: csv_to_latex.py

"""
Robust CSV to LaTeX table converter with proper escaping and formatting.

Dependencies:
    - pandas
    - numpy

Usage:
    python csv_to_latex.py input.csv output.tex [--caption "caption text"]
"""

import argparse
import re
import sys
from pathlib import Path

import pandas as pd


def escape_latex(text):
    """Properly escape special LaTeX characters."""
    if pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Order matters - backslash must be first
    replacements = [
        ("\\", r"\textbackslash{}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("{", r"\{"),
        ("}", r"\}"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
        ("|", r"\textbar{}"),
        ("<", r"\textless{}"),
        (">", r"\textgreater{}"),
    ]

    for old, new in replacements:
        text = text.replace(old, new)

    return text


def format_number(val):
    """Format numbers appropriately for LaTeX."""
    try:
        # Try to convert to float
        num = float(val)

        # Check if it's actually an integer
        if num.is_integer():
            return str(int(num))
        else:
            # Format with appropriate decimal places
            if abs(num) < 0.01 and num != 0:
                # Scientific notation for very small numbers
                return f"{num:.2e}"
            else:
                # Regular decimal notation
                return f"{num:.3f}".rstrip("0").rstrip(".")
    except (ValueError, TypeError):
        # Not a number, return as is
        return val


def csv_to_latex(csv_file, output_file, caption=None, label=None, max_rows=30):
    """Convert CSV to LaTeX table with proper formatting.

    Args:
        csv_file: Input CSV file path
        output_file: Output LaTeX file path
        caption: Optional table caption
        label: Optional table label for references
        max_rows: Maximum number of data rows to display (default: 30)
    """

    # Read CSV with pandas for robust parsing
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV: {e}", file=sys.stderr)
        return False

    # Store original row count for truncation message
    original_rows = len(df)
    truncated = False

    # Truncate if necessary
    if len(df) > max_rows:
        truncated = True
        # Keep first N-3 rows and last 2 rows with separator
        if max_rows > 5:
            df_top = df.head(max_rows - 2)
            df_bottom = df.tail(2)
            # Create separator row with "..." in each column
            separator = pd.DataFrame([["..." for _ in df.columns]], columns=df.columns)
            df = pd.concat([df_top, separator, df_bottom], ignore_index=True)
        else:
            df = df.head(max_rows)

    # Extract metadata from filename
    csv_path = Path(csv_file)
    base_name = csv_path.stem

    # Extract table number if present
    table_number = ""
    table_name = base_name
    match = re.match(r"^(\d+)_(.*)$", base_name)
    if match:
        table_number = match.group(1).lstrip("0")
        table_name = match.group(2).replace("_", " ")

    # Determine column alignment
    alignments = []
    for col in df.columns:
        # Check if column is numeric
        try:
            pd.to_numeric(df[col], errors="raise")
            alignments.append("r")  # Right align for numbers
        except:
            alignments.append("l")  # Left align for text

    # Start building LaTeX
    lines = []

    # Table environment
    lines.append(f"\\pdfbookmark[2]{{Table {table_number}}}{{table_{base_name}}}")
    lines.append("\\begin{table}[htbp]")
    lines.append("\\centering")

    # Use standard font size for tables
    # Standard academic paper convention: \footnotesize (8pt) for tables
    lines.append("\\footnotesize")

    # Adjust tabcolsep based on number of columns to fit width
    num_columns = len(df.columns)
    if num_columns > 8:
        lines.append("\\setlength{\\tabcolsep}{2pt}")  # Very tight for many columns
    elif num_columns > 6:
        lines.append("\\setlength{\\tabcolsep}{3pt}")  # Tight spacing
    elif num_columns > 4:
        lines.append("\\setlength{\\tabcolsep}{4pt}")  # Medium spacing
    else:
        lines.append("\\setlength{\\tabcolsep}{6pt}")  # Normal spacing

    # Use resizebox to ensure table fits within text width
    lines.append("\\resizebox{\\textwidth}{!}{%")

    # Begin tabular
    tabular_spec = "".join(alignments)
    lines.append(f"\\begin{{tabular}}{{{tabular_spec}}}")
    lines.append("\\toprule")

    # Header row
    headers = []
    for col in df.columns:
        # Format header
        header = escape_latex(col)
        # Remove underscores and capitalize
        header = header.replace("\\_", " ").title()
        headers.append(f"\\textbf{{{header}}}")
    lines.append(" & ".join(headers) + " \\\\")
    lines.append("\\midrule")

    # Data rows
    for idx, row in df.iterrows():
        values = []
        is_separator = False

        for col in df.columns:
            val = row[col]

            # Check if this is the separator row
            if str(val) == "...":
                is_separator = True

            # Format the value
            if pd.notna(val):
                if not is_separator:
                    val = format_number(val)
                    val = escape_latex(val)
            else:
                val = "--"  # Display for missing values

            values.append(val)

        # Don't add row coloring for separator
        if is_separator:
            lines.append("\\midrule")
            lines.append(
                "\\multicolumn{"
                + str(len(df.columns))
                + "}{c}{\\textit{... "
                + f"{original_rows - max_rows + 1} rows omitted ..."
                + "}} \\\\"
            )
            lines.append("\\midrule")
        else:
            # Add row coloring for readability (skip separator in count)
            if idx % 2 == 1:
                lines.append("\\rowcolor{gray!10}")
            lines.append(" & ".join(values) + " \\\\")

    lines.append("\\bottomrule")
    lines.append("\\end{tabular}")
    lines.append("}")  # Close resizebox

    # Caption
    lines.append("\\captionsetup{width=\\textwidth}")
    if caption:
        # Add truncation note to existing caption if needed
        if truncated:
            caption = caption.rstrip("}")
            caption += f" \\textit{{Note: Table truncated to {max_rows} rows from {original_rows} total rows for display purposes.}}"
            caption += "}"
        lines.append(caption)
    else:
        # Generate default caption
        if table_number:
            lines.append(
                f"\\caption{{\\textbf{{Table {table_number}: {table_name.title()}}}"
            )
        else:
            lines.append(f"\\caption{{\\textbf{{{table_name.title()}}}")
        lines.append("\\\\")
        if truncated:
            lines.append(
                f"\\textit{{Note: Table truncated to {max_rows} rows from {original_rows} total rows for display purposes.}}"
            )
        else:
            lines.append("Data table generated from CSV file.")
        lines.append("}")

    # Label
    if label:
        lines.append(f"\\label{{{label}}}")
    else:
        lines.append(f"\\label{{tab:{base_name}}}")

    lines.append("\\end{table}")
    lines.append("")
    lines.append("\\restoregeometry")

    # Write to file
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        return True
    except Exception as e:
        print(f"Error writing output: {e}", file=sys.stderr)
        return False


def main():
    parser = argparse.ArgumentParser(description="Convert CSV to LaTeX table")
    parser.add_argument("input_csv", help="Input CSV file")
    parser.add_argument("output_tex", help="Output LaTeX file")
    parser.add_argument("--caption", help="Custom caption text")
    parser.add_argument("--caption-file", help="File containing caption text")
    parser.add_argument("--label", help="Custom label for referencing")

    args = parser.parse_args()

    # Read caption from file if provided
    caption = args.caption
    if args.caption_file and Path(args.caption_file).exists():
        with open(args.caption_file, "r", encoding="utf-8") as f:
            caption = f.read().strip()

    success = csv_to_latex(
        args.input_csv, args.output_tex, caption=caption, label=args.label
    )

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()