Writer

ywatanabe / test-project / / / / /
No commits yet
Blame
check_cited_states.py • 15.1 KB
Raw
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-11-11 (ywatanabe)"


r"""
Check citation states in manuscript - find uncited and missing references

Functionalities:
  - Extracts citation keys from .bib files
  - Finds all \cite commands in .tex files
  - Reports uncited references (in .bib but not cited)
  - Reports missing references (cited but not in .bib)
  - Reports successfully cited references
  - Supports various citation commands (\cite, \citep, \citet, etc.)
  - JSON output option for programmatic use
  - Proper logging with color-coded output

Dependencies:
  - packages:
    - pathlib
    - re
    - argparse
    - yaml
    - logging
    - json

IO:
  - input-files:
    - All .tex files in manuscript directory
    - All .bib files in bibliography directory

  - output-files:
    - Report to stdout or file (text or JSON format)
"""

import argparse
import json
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Set

import yaml
from _logging import getLogger


def load_config(config_path: Path = None) -> Dict[str, Any]:
    """Load configuration from a YAML file.

    Args:
        config_path: Path to config file. If None, falls back to
            ``config/config_manuscript.yaml`` located two directory levels
            above the directory containing this script.

    Returns:
        Configuration dictionary. An empty YAML document yields ``{}``.

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    if config_path is None:
        # Default: <script_dir>/../../config/config_manuscript.yaml
        script_dir = Path(__file__).resolve().parent
        config_path = script_dir.parent.parent / "config" / "config_manuscript.yaml"

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty document; keep the declared
    # dictionary return type in that case.
    return {} if config is None else config


def extract_bib_keys(bib_path: Path) -> Set[str]:
    """Extract all citation keys from a .bib file.

    ``@string``, ``@preamble`` and ``@comment`` blocks use the same
    ``@type{...}`` syntax as real entries but do not define citable
    references, so they are skipped.

    Args:
        bib_path: Path to .bib file

    Returns:
        Set of citation keys found in the file (empty if the file does
        not exist)
    """
    if not bib_path.exists():
        return set()

    content = bib_path.read_text(encoding="utf-8")

    # Entry types that never define a citation key (compared case-insensitively).
    non_entry_types = {"comment", "preamble", "string"}

    # Match @article{key, @book{key, etc. The key stops at a comma,
    # whitespace, or a closing brace (entry without fields: @misc{key}).
    pattern = r"@(\w+)\s*\{\s*([^,\s}]+)"

    return {
        key
        for entry_type, key in re.findall(pattern, content)
        if entry_type.lower() not in non_entry_types
    }


def extract_citations_from_tex(tex_path: Path) -> Set[str]:
    r"""Extract all citation keys from \cite commands in a .tex file.

    Comments are removed before matching, so commented-out citations are
    ignored. An escaped percent sign (``\%``) is literal text and does not
    start a comment.

    Args:
        tex_path: Path to .tex file

    Returns:
        Set of citation keys found in cite commands (empty if the path
        does not exist or is not a regular file)
    """
    if not tex_path.exists() or not tex_path.is_file():
        return set()

    content = tex_path.read_text(encoding="utf-8")

    # Strip comments up to the end of each line. "%" starts a comment only
    # when preceded by an even number of backslashes: "\%" is a literal
    # percent sign, while "\\%" is a line break followed by a comment (the
    # backslashes are kept via group 1).
    content = re.sub(r"(?<!\\)((?:\\\\)*)%.*", r"\1", content)

    # Match any command starting with \cite (cite, citep, citet, citealt,
    # citealp, citeauthor, citeyear, ...), an optional "*" (starred natbib
    # variants), up to two optional [..] arguments, then the {key list}.
    pattern = r"\\cite\w*\*?\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}"

    citations = set()
    for key_list in re.findall(pattern, content):
        # Split "key1, key2" and drop empty items left by stray commas.
        stripped = (key.strip() for key in key_list.split(","))
        citations.update(key for key in stripped if key)

    return citations


def find_all_tex_files(manuscript_dir: Path) -> List[Path]:
    """Collect every regular ``.tex`` file beneath the manuscript directory.

    Args:
        manuscript_dir: Directory searched recursively

    Returns:
        List of paths matching ``*.tex`` that are regular files, in the
        order produced by the recursive glob
    """
    tex_files = []
    for candidate in manuscript_dir.rglob("*.tex"):
        # A directory may also be named "*.tex"; keep regular files only.
        if candidate.is_file():
            tex_files.append(candidate)
    return tex_files


def find_all_bib_files(bib_dir: Path) -> List[Path]:
    """Collect every regular ``.bib`` file beneath the bibliography directory.

    Args:
        bib_dir: Directory searched recursively

    Returns:
        List of paths matching ``*.bib`` that are regular files, in the
        order produced by the recursive glob
    """
    matches = bib_dir.rglob("*.bib")
    # Drop anything that merely has a ".bib" name but is not a regular file.
    return list(filter(lambda path: path.is_file(), matches))


def generate_citation_data(
    all_bib_keys: Set[str],
    all_citations: Set[str],
    bib_files: List[Path],
    tex_files: List[Path],
) -> Dict[str, Any]:
    """Build the structured citation report from the collected key sets.

    Args:
        all_bib_keys: Keys defined across the .bib files
        all_citations: Keys cited across the .tex files
        bib_files: The .bib files that were processed
        tex_files: The .tex files that were processed

    Returns:
        Dictionary with three sections: ``summary`` (counts), ``details``
        (sorted key lists) and ``files`` (sorted file paths as strings)
    """
    # Keys in both sets are cited; the two set differences are the problems.
    cited_keys = sorted(all_bib_keys & all_citations)
    uncited_keys = sorted(all_bib_keys - all_citations)
    missing_keys = sorted(all_citations - all_bib_keys)

    report: Dict[str, Any] = {}
    report["summary"] = {
        "total_references": len(all_bib_keys),
        "total_citations": len(all_citations),
        "successfully_cited": len(cited_keys),
        "uncited": len(uncited_keys),
        "missing": len(missing_keys),
    }
    report["details"] = {
        "successfully_cited": cited_keys,
        "uncited_references": uncited_keys,
        "missing_references": missing_keys,
    }
    report["files"] = {
        "bib_files": [str(path) for path in sorted(bib_files)],
        "tex_files": [str(path) for path in sorted(tex_files)],
    }
    return report


def print_text_report(
    data: Dict[str, Any],
    logger: logging.Logger,
    show_details: bool = True,
    show_sections: Dict[str, bool] = None,
):
    """Render the citation report on the console.

    Headings and rules go through ``print``; counts and individual keys go
    through the logger, with the level chosen by severity.

    Args:
        data: Citation data dictionary (uses its ``summary`` and ``details``)
        logger: Logger instance; must also provide a ``success`` method
        show_details: When False, only the summary is printed
        show_sections: Maps "cited" / "uncited" / "missing" to whether that
            detail section is printed. None enables all three.
    """
    if show_sections is None:
        show_sections = {"cited": True, "uncited": True, "missing": True}

    summary = data["summary"]
    details = data["details"]
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("CITATION STATUS REPORT")
    print(heavy_rule + "\n")

    # Summary (always shown)
    print("SUMMARY")
    print(light_rule)
    logger.info(f"Total references in .bib files: {summary['total_references']}")
    logger.info(f"Total citations in .tex files:  {summary['total_citations']}")
    logger.success(f"Successfully cited:              {summary['successfully_cited']}")

    # Non-zero problem counts are escalated; zero counts stay informational.
    report_uncited = logger.warning if summary["uncited"] > 0 else logger.info
    report_uncited(f"Uncited references:              {summary['uncited']}")

    report_missing = logger.error if summary["missing"] > 0 else logger.info
    report_missing(f"Missing references:              {summary['missing']}")

    print()

    if show_details:
        # One row per detail section: (flag, heading, details key,
        # logger method for items, item marker, logger method and text
        # used when the list is empty).
        section_specs = (
            (
                "cited",
                "SUCCESSFULLY CITED REFERENCES",
                "successfully_cited",
                "success",
                "✓",
                "info",
                "  (none)",
            ),
            (
                "uncited",
                "UNCITED REFERENCES (in .bib but not cited in .tex)",
                "uncited_references",
                "warning",
                "⚠",
                "success",
                "  ✓ All references are cited",
            ),
            (
                "missing",
                "MISSING REFERENCES (cited in .tex but not in .bib)",
                "missing_references",
                "error",
                "✗",
                "success",
                "  ✓ All citations have references",
            ),
        )
        for flag, heading, detail_key, item_level, marker, empty_level, empty_text in section_specs:
            if not show_sections.get(flag, False):
                continue
            print(heading)
            print(light_rule)
            keys = details[detail_key]
            if keys:
                emit = getattr(logger, item_level)
                for key in keys:
                    emit(f"  {marker} {key}")
            else:
                getattr(logger, empty_level)(empty_text)
            print()

    print(heavy_rule + "\n")


def _write_text_report(
    output_path: Path,
    data: Dict[str, Any],
    show_details: bool,
    show_sections: Dict[str, bool],
) -> None:
    """Write the citation report to ``output_path`` as plain text (no colors).

    Args:
        output_path: Destination file (overwritten, UTF-8)
        data: Citation data dictionary (uses its ``summary`` and ``details``)
        show_details: When False, only the summary is written
        show_sections: Maps "cited" / "uncited" / "missing" to whether that
            detail section is written
    """
    summary = data["summary"]
    details = data["details"]
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    lines = [
        heavy_rule,
        "CITATION STATUS REPORT",
        heavy_rule,
        "",
        "SUMMARY",
        light_rule,
        f"Total references in .bib files: {summary['total_references']}",
        f"Total citations in .tex files:  {summary['total_citations']}",
        f"Successfully cited:              {summary['successfully_cited']}",
        f"Uncited references:              {summary['uncited']}",
        f"Missing references:              {summary['missing']}",
        "",
    ]

    if show_details:
        # (flag, heading, details key, item marker) for each detail section.
        sections = (
            ("cited", "SUCCESSFULLY CITED REFERENCES", "successfully_cited", "✓"),
            (
                "uncited",
                "UNCITED REFERENCES (in .bib but not cited in .tex)",
                "uncited_references",
                "⚠",
            ),
            (
                "missing",
                "MISSING REFERENCES (cited in .tex but not in .bib)",
                "missing_references",
                "✗",
            ),
        )
        for flag, heading, detail_key, marker in sections:
            if not show_sections.get(flag, False):
                continue
            lines.append(heading)
            lines.append(light_rule)
            lines.extend(f"  {marker} {key}" for key in details[detail_key])
            lines.append("")

    lines.append(heavy_rule)
    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main():
    """Command-line entry point: scan the project and report citation states.

    Parses the command line, locates the project root (from ``--config`` or
    from this script's location), collects keys from the .bib files under
    ``00_shared`` and citations from the .tex files under ``01_manuscript``,
    then emits the report:

      - without ``--output``: the JSON data is saved to
        ``00_shared/bib_files/cited_states.json`` and either the JSON
        (``--json``) or the text report is shown on the console;
      - with ``--output``: only that file is written, as JSON (``--json``)
        or plain text. The text file honors ``--summary-only`` and the
        ``--bib-*`` section filters, like the console report.

    Returns:
        1 if the config file is not found or any cited key is missing from
        the .bib files, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Check citation states in manuscript",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check all citation states (default: show all)
  %(prog)s

  # Show only successfully cited references
  %(prog)s --bib-cited

  # Show only uncited references (in .bib but not cited)
  %(prog)s --bib-uncited

  # Show only missing references (cited but not in .bib)
  %(prog)s --bib-missing

  # Show summary only (no detailed lists)
  %(prog)s --summary-only

  # Output as JSON
  %(prog)s --json

  # Save to custom file
  %(prog)s --output citation_report.txt

  # Verbose logging
  %(prog)s --verbose
        """,
    )

    # Filter options (if none specified, show all)
    parser.add_argument(
        "--bib-cited",
        action="store_true",
        help="Show only successfully cited references",
    )
    parser.add_argument(
        "--bib-uncited",
        action="store_true",
        help="Show only uncited references (in .bib but not cited in .tex)",
    )
    parser.add_argument(
        "--bib-missing",
        action="store_true",
        help="Show only missing references (cited in .tex but not in .bib)",
    )

    # Output options
    parser.add_argument(
        "--summary-only",
        action="store_true",
        help="Show only summary statistics (no detailed lists)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format (default output: 00_shared/bib_files/cited_states.json)",
    )
    parser.add_argument(
        "--config",
        type=Path,
        help="Path to config YAML file (default: auto-detect from script location)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Output file path (default: 00_shared/bib_files/cited_states.json for JSON, stdout for text)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    # Setup logger
    logger = getLogger(__name__, args.verbose)

    # If any filter flag is given, show only the selected sections;
    # otherwise show all of them.
    requested = {
        "cited": args.bib_cited,
        "uncited": args.bib_uncited,
        "missing": args.bib_missing,
    }
    if any(requested.values()):
        show_sections = {name: True for name, enabled in requested.items() if enabled}
    else:
        show_sections = {"cited": True, "uncited": True, "missing": True}

    # Load configuration. The parsed content is not used below; the call
    # verifies that the config file exists.
    try:
        load_config(args.config)
    except FileNotFoundError as e:
        logger.error(str(e))
        return 1

    # Project root is two levels above the config file (or this script).
    # The config path is made absolute first: for a bare relative path such
    # as "cfg.yaml", ".parent.parent" would otherwise collapse to ".".
    if args.config:
        project_root = args.config.absolute().parent.parent
    else:
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent

    # Find all .bib and .tex files
    bib_dir = project_root / "00_shared"
    manuscript_dir = project_root / "01_manuscript"

    logger.info("Scanning files...")

    bib_files = find_all_bib_files(bib_dir)
    tex_files = find_all_tex_files(manuscript_dir)

    if not bib_files:
        logger.warning(f"No .bib files found in {bib_dir}")
    if not tex_files:
        logger.warning(f"No .tex files found in {manuscript_dir}")

    logger.debug(f"Found {len(bib_files)} .bib files")
    logger.debug(f"Found {len(tex_files)} .tex files")

    # Extract all citation keys from .bib files
    all_bib_keys = set()
    for bib_file in bib_files:
        keys = extract_bib_keys(bib_file)
        all_bib_keys.update(keys)
        logger.debug(f"  {bib_file.name}: {len(keys)} references")

    # Extract all citations from .tex files
    all_citations = set()
    for tex_file in tex_files:
        citations = extract_citations_from_tex(tex_file)
        all_citations.update(citations)
        if citations:
            logger.debug(f"  {tex_file.name}: {len(citations)} citations")

    # Generate citation data
    data = generate_citation_data(all_bib_keys, all_citations, bib_files, tex_files)
    json_str = json.dumps(data, indent=2, ensure_ascii=False)

    if args.output:
        # Custom destination: write only that file, in the requested format.
        args.output.parent.mkdir(parents=True, exist_ok=True)
        if args.json:
            args.output.write_text(json_str, encoding="utf-8")
        else:
            _write_text_report(
                args.output,
                data,
                show_details=not args.summary_only,
                show_sections=show_sections,
            )
        logger.success(f"Saved to: {args.output}")
    else:
        # No custom destination: always persist the JSON data to the
        # default location, then show the report on the console.
        default_json_path = (
            project_root / "00_shared" / "bib_files" / "cited_states.json"
        )
        default_json_path.parent.mkdir(parents=True, exist_ok=True)
        default_json_path.write_text(json_str, encoding="utf-8")

        if args.json:
            print(json_str)
        else:
            print_text_report(
                data,
                logger,
                show_details=not args.summary_only,
                show_sections=show_sections,
            )
        logger.success(f"Saved to: {default_json_path}")

    # Return exit code based on results
    if data["summary"]["missing"] > 0:
        return 1  # Error: missing references
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status so callers
    # (shells, CI) can detect failures; a bare main() call always exits 0.
    raise SystemExit(main())

# EOF