#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2024-09-28 18:20:00 (ywatanabe)"
# File: csv_to_latex.py
"""
Robust CSV to LaTeX table converter with proper escaping and formatting.
Dependencies:
- pandas
- numpy
Usage:
    python csv_to_latex.py input.csv output.tex [--caption "caption text"] [--caption-file FILE] [--label LABEL]
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
# Single-pass translation table: each special character maps directly to its
# final LaTeX form. Chained str.replace() calls would re-escape the braces of
# an earlier replacement (e.g. "\" -> "\textbackslash{}" -> "\textbackslash\{\}").
_LATEX_ESCAPE_TABLE = str.maketrans(
    {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
        "|": r"\textbar{}",
        "<": r"\textless{}",
        ">": r"\textgreater{}",
    }
)


def escape_latex(text):
    """Escape LaTeX special characters in *text*.

    Args:
        text: Any value; it is converted with ``str()`` before escaping.

    Returns:
        The escaped string, or ``""`` when *text* is missing according to
        ``pd.isna`` (``None``, ``NaN``, ``pd.NA``, ...).
    """
    if pd.isna(text):
        return ""
    # One pass over the input, so replacement output is never escaped again.
    return str(text).translate(_LATEX_ESCAPE_TABLE)
def format_number(val):
    """Format a numeric value compactly for display in a LaTeX table.

    Rules:
        * Integers (Python or NumPy) are rendered exactly.
        * Floats with an integral value are rendered without a decimal part.
        * Non-zero values with magnitude below 0.01 use scientific notation
          with two decimals (e.g. ``5.00e-03``).
        * Other floats use up to three decimals with trailing zeros removed.

    Args:
        val: The value to format; numeric strings are accepted.

    Returns:
        The formatted string, or *val* unchanged when it cannot be
        interpreted as a number.
    """
    import numbers  # local import keeps this block self-contained

    # Render integers directly: a round trip through float corrupts values
    # above 2**53 and raises OverflowError for very large Python ints.
    if isinstance(val, numbers.Integral):
        return str(int(val))

    try:
        num = float(val)
    except (ValueError, TypeError, OverflowError):
        # Not a number, return as is
        return val

    if num.is_integer():
        return str(int(num))
    if num != 0 and abs(num) < 0.01:
        # Scientific notation for very small numbers
        return f"{num:.2e}"
    # Regular decimal notation without trailing zeros or a dangling point
    return f"{num:.3f}".rstrip("0").rstrip(".")
def _column_alignments(df):
    """Return one tabular alignment letter per column ("r" numeric, "l" otherwise)."""
    alignments = []
    for col in df.columns:
        try:
            pd.to_numeric(df[col], errors="raise")
        except (ValueError, TypeError):
            alignments.append("l")  # Left align for text
        else:
            alignments.append("r")  # Right align for numbers
    return alignments


def _format_header(col):
    """Return a bold, title-cased, LaTeX-escaped header cell for a column name."""
    # Title-case BEFORE escaping: str.title() applied to escaped text would
    # capitalise macro names (e.g. \textless -> \Textless) and break LaTeX.
    title = "" if pd.isna(col) else str(col).replace("_", " ").title()
    return f"\\textbf{{{escape_latex(title)}}}"


def _format_cell(val):
    """Return the LaTeX text for one data cell ("--" for missing values)."""
    if pd.isna(val):
        return "--"  # Display for missing values
    return escape_latex(format_number(val))


def _caption_lines(caption, table_number, table_name, note):
    """Return the caption lines; *note* is the truncation note or None."""
    if caption:
        # Accept both a full "\caption{...}" command and plain caption text.
        text = caption if "\\caption" in caption else f"\\caption{{{caption}}}"
        if note is None:
            return [text]
        stripped = text.rstrip()
        if stripped.endswith("}"):
            # Remove exactly ONE closing brace so nested groups stay balanced,
            # then re-close the caption after the note.
            return [f"{stripped[:-1]} {note}}}"]
        # Caption does not end with its closing brace (e.g. trailing \label):
        # leave it untouched and emit the note on its own line.
        return [text, note]

    # Default caption derived from the file name (escaped: it may contain "_").
    title = escape_latex(table_name.title())
    heading = f"Table {table_number}: {title}" if table_number else title
    return [
        f"\\caption{{\\textbf{{{heading}}}",
        "\\\\",
        note if note is not None else "Data table generated from CSV file.",
        "}",
    ]


def csv_to_latex(csv_file, output_file, caption=None, label=None, max_rows=30):
    """Convert a CSV file to a LaTeX ``table`` environment written to a file.

    When the CSV has more than ``max_rows`` data rows the table is truncated.
    For ``max_rows > 5`` the first ``max_rows - 2`` and last 2 rows are shown
    with an "N rows omitted" separator between them; otherwise only the first
    ``max_rows`` rows are shown. A note is added to the caption either way.

    Args:
        csv_file: Input CSV file path
        output_file: Output LaTeX file path
        caption: Optional table caption, either a complete ``\\caption{...}``
            command or plain text (which is wrapped in ``\\caption{...}``)
        label: Optional table label for references (default: ``tab:<file stem>``)
        max_rows: Maximum number of data rows to display (default: 30)

    Returns:
        True on success; False if the CSV could not be read or the output
        could not be written (the error is printed to stderr).
    """
    # Read CSV with pandas for robust parsing
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV: {e}", file=sys.stderr)
        return False

    # Select the rows to display. The separator is tracked by position rather
    # than stored as a "..." row, so real cells containing "..." are kept and
    # numeric columns keep their dtype for alignment detection.
    original_rows = len(df)
    truncated = original_rows > max_rows
    separator_pos = None
    if truncated and max_rows > 5:
        top = df.head(max_rows - 2)
        shown = pd.concat([top, df.tail(2)], ignore_index=True)
        separator_pos = len(top)
    elif truncated:
        shown = df.head(max_rows)
    else:
        shown = df
    omitted = original_rows - len(shown)

    # Extract metadata from filename ("<number>_<name>.csv" convention)
    base_name = Path(csv_file).stem
    table_number = ""
    table_name = base_name
    match = re.match(r"^(\d+)_(.*)$", base_name)
    if match:
        table_number = match.group(1).lstrip("0")
        table_name = match.group(2).replace("_", " ")

    alignments = _column_alignments(shown)
    num_columns = len(shown.columns)

    # Table environment; \footnotesize follows common table conventions
    lines = [
        f"\\pdfbookmark[2]{{Table {table_number}}}{{table_{base_name}}}",
        "\\begin{table}[htbp]",
        "\\centering",
        "\\footnotesize",
    ]

    # Adjust tabcolsep based on number of columns to fit width
    if num_columns > 8:
        lines.append("\\setlength{\\tabcolsep}{2pt}")  # Very tight for many columns
    elif num_columns > 6:
        lines.append("\\setlength{\\tabcolsep}{3pt}")  # Tight spacing
    elif num_columns > 4:
        lines.append("\\setlength{\\tabcolsep}{4pt}")  # Medium spacing
    else:
        lines.append("\\setlength{\\tabcolsep}{6pt}")  # Normal spacing

    # Use resizebox to ensure table fits within text width
    lines.append("\\resizebox{\\textwidth}{!}{%")
    lines.append(f"\\begin{{tabular}}{{{''.join(alignments)}}}")
    lines.append("\\toprule")

    # Header row
    lines.append(" & ".join(_format_header(col) for col in shown.columns) + " \\\\")
    lines.append("\\midrule")

    # Data rows. itertuples keeps each column's own type (iterrows would
    # upcast integers to float in rows that also contain floats).
    for pos, row in enumerate(shown.itertuples(index=False, name=None)):
        if pos == separator_pos:
            lines.append("\\midrule")
            lines.append(
                f"\\multicolumn{{{num_columns}}}{{c}}"
                f"{{\\textit{{... {omitted} rows omitted ...}}}} \\\\"
            )
            lines.append("\\midrule")
        # Stripe every second data row; the separator is not counted.
        if pos % 2 == 1:
            lines.append("\\rowcolor{gray!10}")
        lines.append(" & ".join(_format_cell(val) for val in row) + " \\\\")

    lines.append("\\bottomrule")
    lines.append("\\end{tabular}")
    lines.append("}")  # Close resizebox

    # Caption
    lines.append("\\captionsetup{width=\\textwidth}")
    note = None
    if truncated:
        note = (
            f"\\textit{{Note: Table truncated to {max_rows} rows from "
            f"{original_rows} total rows for display purposes.}}"
        )
    lines.extend(_caption_lines(caption, table_number, table_name, note))

    # Label
    if label:
        lines.append(f"\\label{{{label}}}")
    else:
        lines.append(f"\\label{{tab:{base_name}}}")
    lines.append("\\end{table}")
    lines.append("")
    lines.append("\\restoregeometry")

    # Write to file
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        return True
    except Exception as e:
        print(f"Error writing output: {e}", file=sys.stderr)
        return False
def main():
    """Parse command-line arguments, run the conversion, and exit.

    The process exits with status 0 when ``csv_to_latex`` succeeds and 1
    otherwise. A caption read from ``--caption-file`` takes precedence over
    ``--caption``; if the file cannot be read a warning is printed to stderr
    and ``--caption`` (if any) is used instead.
    """
    parser = argparse.ArgumentParser(description="Convert CSV to LaTeX table")
    parser.add_argument("input_csv", help="Input CSV file")
    parser.add_argument("output_tex", help="Output LaTeX file")
    parser.add_argument("--caption", help="Custom caption text")
    parser.add_argument("--caption-file", help="File containing caption text")
    parser.add_argument("--label", help="Custom label for referencing")
    parser.add_argument(
        "--max-rows",
        type=int,
        default=30,
        help="Maximum number of data rows to display (default: %(default)s)",
    )
    args = parser.parse_args()

    # Read caption from file if provided
    caption = args.caption
    if args.caption_file:
        try:
            caption = Path(args.caption_file).read_text(encoding="utf-8").strip()
        except OSError as e:
            # Best effort: keep going with --caption, but tell the user why
            # the caption file was not used instead of ignoring it silently.
            print(
                f"Warning: could not read caption file {args.caption_file}: {e}",
                file=sys.stderr,
            )

    success = csv_to_latex(
        args.input_csv,
        args.output_tex,
        caption=caption,
        label=args.label,
        max_rows=args.max_rows,
    )
    sys.exit(0 if success else 1)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()