Loading...
No commits yet
Not committed History
Blame
check_float_order.py • 13.2 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: scripts/python/check_float_order.py
# Purpose: Validate and auto-renumber figure/table reference ordering in LaTeX manuscripts
# Usage:
#   python check_float_order.py [project_dir] [--fix] [--dry-run] [--doc-type manuscript|supplementary|all]
#
# Checks that figures and tables are referenced in numerical order in the text.
# With --fix, renumbers files and updates all \ref{} and \label{} to match appearance order.

import argparse
import os
import re
import shutil
import sys
from collections import OrderedDict
from pathlib import Path

# ANSI colors
# Escape sequences interpolated into the status messages printed below.
GREEN = "\033[0;32m"  # green: [PASS] lines and success summaries
YELLOW = "\033[1;33m"  # bold yellow: [WARN] lines and hints
RED = "\033[0;31m"  # red: [FAIL] lines and fatal errors
DIM = "\033[0;90m"  # bright black (grey): per-reference detail lines
BOLD = "\033[1m"  # bold: section headings
NC = "\033[0m"  # "no color": resets all attributes


def find_references(content_dir, float_type):
    """Find all \\ref{fig:*} or \\ref{tab:*} in text files, in order of appearance.

    Files are scanned in a fixed section order first, then any remaining
    ``*.tex`` files in ``content_dir`` in sorted order. Only the first
    occurrence of each key is reported. References that sit inside a LaTeX
    comment (after an unescaped ``%``) are ignored, so commented-out text
    does not influence the detected order.

    Parameters
    ----------
    content_dir : Path
        Directory containing .tex content files.
    float_type : str
        'fig' or 'tab'.

    Returns
    -------
    list of tuple
        (label_key, file, line_number) in order of first appearance.
    """
    # Read order: follow typical IMRAD structure
    section_order = [
        "abstract",
        "introduction",
        "results",
        "methods",
        "star_methods",
        "discussion",
        "additional_info",
        "data_availability",
        "bigger_picture",
        "highlights",
        "graphical_abstract",
    ]

    candidates = (content_dir / f"{name}.tex" for name in section_order)
    tex_files = [path for path in candidates if path.exists()]

    # Add any remaining .tex files not in the order list
    ordered = set(tex_files)
    tex_files.extend(
        path for path in sorted(content_dir.glob("*.tex")) if path not in ordered
    )

    ref_pattern = re.compile(r"\\ref\{" + re.escape(float_type) + r":([^}]+)\}")
    # A '%' preceded by an even number of backslashes (including zero) starts
    # a comment; '\%' is a literal percent sign and must be kept.
    comment_pattern = re.compile(r"(?<!\\)(?:\\\\)*%.*")

    seen = set()
    refs = []

    for tex_file in tex_files:
        text = tex_file.read_text(encoding="utf-8")
        for line_no, line in enumerate(text.splitlines(), 1):
            visible = comment_pattern.sub("", line, count=1)
            for m in ref_pattern.finditer(visible):
                key = m.group(1)
                if key in seen:
                    continue
                seen.add(key)
                refs.append((key, tex_file.name, line_no))

    return refs


def find_labels(content_dir, float_type):
    """Find all \\label{fig:*} or \\label{tab:*} definitions.

    Searches both caption files in figures/tables subdirs and inline in content .tex files.
    Every label on a line is recorded, not just the first one. Files are
    visited in sorted order so the result is deterministic.

    Parameters
    ----------
    content_dir : Path
        Directory containing .tex content files and the ``figures`` /
        ``tables`` subdirectories.
    float_type : str
        'fig' or 'tab'.

    Returns
    -------
    dict
        label_key -> {'file': Path, 'line': int, 'source': 'caption_file'|'inline'}
        A label found in a caption file takes precedence over an inline one.
    """
    label_pattern = re.compile(r"\\label\{" + re.escape(float_type) + r":([^}]+)\}")

    def scan(tex_file):
        """Yield (label_key, line_number) for every matching label in a file."""
        text = tex_file.read_text(encoding="utf-8")
        for line_no, line in enumerate(text.splitlines(), 1):
            for m in label_pattern.finditer(line):
                yield m.group(1), line_no

    labels = {}

    # Search caption files
    subdir = "figures" if float_type == "fig" else "tables"
    media_dir = content_dir / subdir / "caption_and_media"

    if media_dir.exists():
        for tex_file in sorted(media_dir.glob("[0-9]*.tex")):
            for key, line_no in scan(tex_file):
                # Later caption definitions overwrite earlier ones.
                labels[key] = {
                    "file": tex_file,
                    "line": line_no,
                    "source": "caption_file",
                }

    # Search inline in content .tex files
    for tex_file in sorted(content_dir.glob("*.tex")):
        for key, line_no in scan(tex_file):
            # Keep the first definition; never override a caption-file entry.
            if key not in labels:
                labels[key] = {
                    "file": tex_file,
                    "line": line_no,
                    "source": "inline",
                }

    return labels


def extract_number_and_name(key):
    """Split a label key into its leading number and descriptive remainder.

    A key such as '04_modules' yields (4, 'modules'); a purely numeric key
    such as '04' yields (4, ''). Any key that does not start with digits
    followed by either nothing or '_<text>' yields (None, key).
    """
    parsed = re.match(r"^(\d+)(?:_(.+))?$", key)
    if parsed is None:
        return None, key
    digits, description = parsed.groups()
    # The optional group is None for a bare number; report it as "".
    return int(digits), description or ""


def check_order(content_dir, float_type, label):
    """Report whether numbered references first appear in ascending order.

    Prints a PASS/WARN/FAIL line for the given float type. On failure it
    also lists each numbered reference with the number it should carry and
    warns about labels that are defined but never referenced.

    Parameters
    ----------
    content_dir : Path
        Directory containing .tex content files.
    float_type : str
        'fig' or 'tab'.
    label : str
        Human-readable name used in the printed messages.

    Returns
    -------
    tuple
        (is_ok, refs, desired_mapping)
        refs: list of (key, file, line)
        desired_mapping: dict of old_key -> new_key if reordering needed
    """
    refs = find_references(content_dir, float_type)
    labels = find_labels(content_dir, float_type)

    if not refs:
        print(f"  {GREEN}[PASS]{NC} {label} references - none found")
        return True, refs, {}

    # Keep only the references whose key carries a numeric prefix.
    parsed = (
        (extract_number_and_name(key), key, fname, line) for key, fname, line in refs
    )
    numbered_refs = [
        (num, name, key, fname, line)
        for (num, name), key, fname, line in parsed
        if num is not None
    ]

    if not numbered_refs:
        print(f"  {GREEN}[PASS]{NC} {label} references - no numbered references")
        return True, refs, {}

    numbers = [entry[0] for entry in numbered_refs]
    strictly_ascending = all(
        earlier < later for earlier, later in zip(numbers, numbers[1:])
    )

    if strictly_ascending:
        # Ascending is enough to pass; gaps or a start other than 1 only warn.
        if numbers == list(range(1, len(numbered_refs) + 1)):
            print(
                f"  {GREEN}[PASS]{NC} {label} reference order (1..{len(numbered_refs)})"
            )
        else:
            print(
                f"  {YELLOW}[WARN]{NC} {label} references sequential but not contiguous: {numbers}"
            )
        return True, refs, {}

    # Out of order: number every reference by its position of first appearance.
    print(f"  {RED}[FAIL]{NC} {label} reference order")
    desired_mapping = {}
    for new_num, (_, name, old_key, fname, line) in enumerate(numbered_refs, start=1):
        new_key = f"{new_num:02d}_{name}" if name else f"{new_num:02d}"
        if new_key != old_key:
            desired_mapping[old_key] = new_key
        print(
            f"    {DIM}{fname}:{line}: "
            f"\\ref{{{float_type}:{old_key}}} -> should be {new_num:02d}{NC}"
        )

    # Labels that exist but are never referenced in the text.
    referenced = {key for key, _, _ in refs}
    for label_key, info in labels.items():
        if label_key in referenced:
            continue
        print(
            f"    {YELLOW}[WARN]{NC} \\label{{{float_type}:{label_key}}} "
            f"defined in {info['file'].name}:{info['line']} but never referenced"
        )

    return False, refs, desired_mapping


def _label_pattern(prefix, mapping):
    """Build a regex matching '<prefix>:<old_key>' for exactly the mapped keys."""
    # Longest keys first so that '10' wins over '1' inside the alternation.
    keys = sorted(mapping, key=len, reverse=True)
    alternatives = "|".join(re.escape(key) for key in keys)
    # The look-arounds stop a key from matching inside a longer label
    # (e.g. 'fig:1' inside 'fig:10' or 'fig:1_extra', 'fig:' inside 'subfig:').
    return re.compile(
        r"(?<![\w-])" + re.escape(prefix) + ":(" + alternatives + r")(?![\w-])"
    )


def _plan_renames(media_dir, mapping):
    """Return (old_path, temp_path, new_path) triples for files named after old keys."""
    search_dirs = [(media_dir, True)]
    # Also check symlinks in jpg_for_compilation
    jpg_dir = media_dir / "jpg_for_compilation"
    if jpg_dir.exists():
        search_dirs.append((jpg_dir, False))

    renames = []
    for directory, skip_dirs in search_dirs:
        for entry in sorted(directory.iterdir()):
            if skip_dirs and entry.is_dir():
                continue
            fname = entry.name
            for old_key, new_key in mapping.items():
                if entry.stem == old_key or fname.startswith(old_key + "."):
                    # Either condition implies fname starts with old_key.
                    remainder = fname[len(old_key):]
                    renames.append(
                        (
                            entry,
                            directory / f"__TEMP_{old_key}__{remainder}",
                            directory / (new_key + remainder),
                        )
                    )
                    break  # one rename per file
    return renames


def apply_fix(content_dir, float_type, mapping, dry_run=False):
    """Rename files and update all \\ref{} and \\label{} in place.

    Label text is rewritten in every ``*.tex`` file of ``content_dir``, of the
    matching ``caption_and_media`` directory and of ``content_dir``'s parent.
    Only exact keys from ``mapping`` are rewritten; a label that merely starts
    with a mapped key is left alone. Media files whose name is an old key plus
    an extension are then renamed through temporary names, so swaps such as
    01 <-> 02 cannot collide.

    Parameters
    ----------
    content_dir : Path
    float_type : str
        'fig' or 'tab'
    mapping : dict
        old_key -> new_key
    dry_run : bool
        If True, only print what would be done.

    Raises
    ------
    FileExistsError
        If a rename target already exists and is not itself being renamed.
        Raised before any file is modified, so nothing is overwritten.
    """
    if not mapping:
        return

    prefix = float_type
    subdir = "figures" if float_type == "fig" else "tables"
    media_dir = content_dir / subdir / "caption_and_media"
    has_media = media_dir.exists()

    # 1. Plan the file renames first so conflicts are detected before any edit.
    renames = _plan_renames(media_dir, mapping) if has_media else []
    moving = {old_path for old_path, _, _ in renames}
    blocked = sorted(
        str(new_path)
        for _, _, new_path in renames
        # lexists: a dangling symlink still occupies the name.
        if new_path not in moving and os.path.lexists(new_path)
    )
    if blocked:
        raise FileExistsError(
            "Refusing to renumber: target file(s) already exist: " + ", ".join(blocked)
        )

    # 2. Collect all .tex files that may contain references
    tex_files = set(content_dir.glob("*.tex"))
    if has_media:
        tex_files.update(media_dir.glob("*.tex"))
    # Also check the parent manuscript .tex files
    tex_files.update(content_dir.parent.glob("*.tex"))

    # 3. Text replacements: one regex pass maps every old key straight to its
    # new key, so chained mappings (A -> B, B -> C) need no intermediate names.
    pattern = _label_pattern(prefix, mapping)

    def substitute(match):
        return f"{prefix}:{mapping[match.group(1)]}"

    for tex_file in sorted(tex_files):
        original = tex_file.read_text(encoding="utf-8")
        text = pattern.sub(substitute, original)
        if text != original:
            action = "Would update" if dry_run else "Updated"
            print(f"    {action} references in {tex_file.name}")
            if not dry_run:
                tex_file.write_text(text, encoding="utf-8")

    # 4. Rename media files (caption .tex, images, CSV, etc.)
    if dry_run:
        for old_path, _, new_path in renames:
            print(f"    Would rename {old_path.parent.name}/{new_path.name}")
        return

    # Pass 1: old -> temp. lexists (not exists) so symlinks whose target was
    # just moved to its temp name are still renamed.
    # NOTE(review): symlink targets are not rewritten here; links pointing at
    # renamed media files may need to be regenerated afterwards.
    staged = []
    for old_path, temp_path, new_path in renames:
        if os.path.lexists(old_path):
            old_path.rename(temp_path)
            staged.append((temp_path, new_path))

    # Pass 2: temp -> new
    for temp_path, new_path in staged:
        print(f"    Renamed {temp_path.parent.name}/{new_path.name}")
        temp_path.rename(new_path)


def main():
    """Command-line entry point: check float reference order and optionally fix it.

    Returns
    -------
    int
        0 when everything is in order or when --fix/--dry-run was processed,
        1 when problems were found and no fix mode was requested. Exits with
        status 1 directly if no content directory exists.
    """
    parser = argparse.ArgumentParser(
        description="Check and fix figure/table reference ordering in LaTeX manuscripts"
    )
    parser.add_argument(
        "project_dir",
        nargs="?",
        default=".",
        help="Project root directory (default: current directory)",
    )
    parser.add_argument(
        "--fix",
        action="store_true",
        help="Auto-renumber files and update references to match appearance order",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what --fix would do without making changes",
    )
    parser.add_argument(
        "--doc-type",
        choices=["manuscript", "supplementary", "all"],
        default="all",
        help="Which document type to check (default: all)",
    )
    args = parser.parse_args()

    project_dir = Path(args.project_dir).resolve()

    # Select the content directories that were requested and actually exist.
    known_docs = (
        ("manuscript", "01_manuscript"),
        ("supplementary", "02_supplementary"),
    )
    doc_types = []
    for doc_name, folder in known_docs:
        if args.doc_type not in (doc_name, "all"):
            continue
        candidate = project_dir / folder / "contents"
        if candidate.exists():
            doc_types.append((doc_name, candidate))

    if not doc_types:
        print(f"{RED}No content directories found in {project_dir}{NC}")
        sys.exit(1)

    print(f"\n{BOLD}=== Float Reference Order Check ==={NC}\n")

    has_issues = False
    all_mappings = []
    float_kinds = [("fig", "Figure"), ("tab", "Table")]

    for doc_label, content_dir in doc_types:
        for float_type, float_label in float_kinds:
            label = f"{float_label} ({doc_label})"
            ok, _refs, mapping = check_order(content_dir, float_type, label)
            if ok:
                continue
            has_issues = True
            if mapping:
                all_mappings.append((content_dir, float_type, mapping, label))

    print()

    if not has_issues:
        print(f"{GREEN}All float references are in order.{NC}")
        return 0

    if not (args.fix or args.dry_run):
        print(f"{YELLOW}Run with --fix to auto-renumber, or --dry-run to preview.{NC}")
        return 1

    # --dry-run takes precedence when both flags are given.
    mode = "DRY RUN" if args.dry_run else "FIXING"
    print(f"{BOLD}--- {mode} ---{NC}\n")
    for content_dir, float_type, mapping, label in all_mappings:
        print(f"  {label}: renumbering {len(mapping)} items")
        for old, new in mapping.items():
            print(f"    {old} -> {new}")
        apply_fix(content_dir, float_type, mapping, dry_run=args.dry_run)
        print()

    if not args.dry_run:
        print(f"{GREEN}Renumbering complete. Re-run to verify.{NC}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())