#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-11-11 (ywatanabe)"
r"""
Check citation states in manuscript - find uncited and missing references
Functionalities:
- Extracts citation keys from .bib files
- Finds all \cite commands in .tex files
- Reports uncited references (in .bib but not cited)
- Reports missing references (cited but not in .bib)
- Reports successfully cited references
- Supports various citation commands (\cite, \cite, \citet, etc.)
- JSON output option for programmatic use
- Proper logging with color-coded output
Dependencies:
- packages:
- pathlib
- re
- argparse
- yaml
- logging
- json
IO:
- input-files:
- All .tex files in manuscript directory
- All .bib files in bibliography directory
- output-files:
- Report to stdout or file (text or JSON format)
"""
import argparse
import json
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Set
import yaml
from _logging import getLogger
def load_config(config_path: Path = None) -> Dict[str, Any]:
"""Load configuration from YAML file.
Args:
config_path: Path to config file. If None, uses default location.
Returns:
Configuration dictionary
"""
if config_path is None:
# Default to ../config/config_manuscript.yaml relative to script location
script_dir = Path(__file__).resolve().parent
config_path = script_dir.parent.parent / "config" / "config_manuscript.yaml"
if not config_path.exists():
raise FileNotFoundError(f"Config file not found: {config_path}")
with open(config_path, "r") as f:
config = yaml.safe_load(f)
return config
def extract_bib_keys(bib_path: Path) -> Set[str]:
"""Extract all citation keys from a .bib file.
Args:
bib_path: Path to .bib file
Returns:
Set of citation keys found in the file
"""
if not bib_path.exists():
return set()
content = bib_path.read_text(encoding="utf-8")
# Match @article{key, @book{key, etc.
pattern = r"@\w+\s*\{\s*([^,\s]+)"
keys = re.findall(pattern, content)
return set(keys)
def extract_citations_from_tex(tex_path: Path) -> Set[str]:
r"""Extract all citation keys from \cite commands in a .tex file.
Args:
tex_path: Path to .tex file
Returns:
Set of citation keys found in cite commands
"""
if not tex_path.exists() or not tex_path.is_file():
return set()
content = tex_path.read_text(encoding="utf-8")
# Remove comment lines
lines = content.split("\n")
lines = [line.split("%")[0] for line in lines] # Remove inline comments
content = "\n".join(lines)
# Match various cite commands: \cite{key}, \cite{key1,key2}, etc.
# Supports: cite, cite, citet, citealt, citealp, citeauthor, citeyear, etc.
pattern = r"\\cite\w*\s*(?:\[[^\]]*\])?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}"
matches = re.findall(pattern, content)
# Split multiple citations and clean whitespace
citations = set()
for match in matches:
keys = [k.strip() for k in match.split(",")]
citations.update(keys)
return citations
def find_all_tex_files(manuscript_dir: Path) -> List[Path]:
"""Find all .tex files in manuscript directory recursively.
Args:
manuscript_dir: Path to manuscript directory
Returns:
List of .tex file paths (excludes directories)
"""
return [f for f in manuscript_dir.rglob("*.tex") if f.is_file()]
def find_all_bib_files(bib_dir: Path) -> List[Path]:
"""Find all .bib files in bibliography directory recursively.
Args:
bib_dir: Path to bibliography directory
Returns:
List of .bib file paths (excludes directories)
"""
return [f for f in bib_dir.rglob("*.bib") if f.is_file()]
def generate_citation_data(
all_bib_keys: Set[str],
all_citations: Set[str],
bib_files: List[Path],
tex_files: List[Path],
) -> Dict[str, Any]:
"""Generate structured citation data.
Args:
all_bib_keys: All citation keys found in .bib files
all_citations: All citations found in .tex files
bib_files: List of .bib files processed
tex_files: List of .tex files processed
Returns:
Dictionary with citation statistics and details
"""
cited = sorted(all_bib_keys & all_citations)
uncited = sorted(all_bib_keys - all_citations)
missing = sorted(all_citations - all_bib_keys)
return {
"summary": {
"total_references": len(all_bib_keys),
"total_citations": len(all_citations),
"successfully_cited": len(cited),
"uncited": len(uncited),
"missing": len(missing),
},
"details": {
"successfully_cited": cited,
"uncited_references": uncited,
"missing_references": missing,
},
"files": {
"bib_files": [str(f) for f in sorted(bib_files)],
"tex_files": [str(f) for f in sorted(tex_files)],
},
}
def print_text_report(
data: Dict[str, Any],
logger: logging.Logger,
show_details: bool = True,
show_sections: Dict[str, bool] = None,
):
"""Print citation report to console using logger.
Args:
data: Citation data dictionary
logger: Logger instance
show_details: Show detailed lists of citations
show_sections: Dictionary indicating which sections to show
"""
if show_sections is None:
show_sections = {"cited": True, "uncited": True, "missing": True}
summary = data["summary"]
details = data["details"]
print("\n" + "=" * 80)
print("CITATION STATUS REPORT")
print("=" * 80 + "\n")
# Summary (always show)
print("SUMMARY")
print("-" * 80)
logger.info(f"Total references in .bib files: {summary['total_references']}")
logger.info(f"Total citations in .tex files: {summary['total_citations']}")
logger.success(f"Successfully cited: {summary['successfully_cited']}")
if summary["uncited"] > 0:
logger.warning(f"Uncited references: {summary['uncited']}")
else:
logger.info(f"Uncited references: {summary['uncited']}")
if summary["missing"] > 0:
logger.error(f"Missing references: {summary['missing']}")
else:
logger.info(f"Missing references: {summary['missing']}")
print()
if not show_details:
print("=" * 80 + "\n")
return
# Successfully cited
if show_sections.get("cited", False):
print("SUCCESSFULLY CITED REFERENCES")
print("-" * 80)
if details["successfully_cited"]:
for key in details["successfully_cited"]:
logger.success(f" ✓ {key}")
else:
logger.info(" (none)")
print()
# Uncited references
if show_sections.get("uncited", False):
print("UNCITED REFERENCES (in .bib but not cited in .tex)")
print("-" * 80)
if details["uncited_references"]:
for key in details["uncited_references"]:
logger.warning(f" ⚠ {key}")
else:
logger.success(" ✓ All references are cited")
print()
# Missing references
if show_sections.get("missing", False):
print("MISSING REFERENCES (cited in .tex but not in .bib)")
print("-" * 80)
if details["missing_references"]:
for key in details["missing_references"]:
logger.error(f" ✗ {key}")
else:
logger.success(" ✓ All citations have references")
print()
print("=" * 80 + "\n")
def main():
parser = argparse.ArgumentParser(
description="Check citation states in manuscript",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Check all citation states (default: show all)
%(prog)s
# Show only successfully cited references
%(prog)s --bib-cited
# Show only uncited references (in .bib but not cited)
%(prog)s --bib-uncited
# Show only missing references (cited but not in .bib)
%(prog)s --bib-missing
# Show summary only (no detailed lists)
%(prog)s --summary-only
# Output as JSON
%(prog)s --json
# Save to custom file
%(prog)s --output citation_report.txt
# Verbose logging
%(prog)s --verbose
""",
)
# Filter options (if none specified, show all)
parser.add_argument(
"--bib-cited",
action="store_true",
help="Show only successfully cited references",
)
parser.add_argument(
"--bib-uncited",
action="store_true",
help="Show only uncited references (in .bib but not cited in .tex)",
)
parser.add_argument(
"--bib-missing",
action="store_true",
help="Show only missing references (cited in .tex but not in .bib)",
)
# Output options
parser.add_argument(
"--summary-only",
action="store_true",
help="Show only summary statistics (no detailed lists)",
)
parser.add_argument(
"--json",
action="store_true",
help="Output in JSON format (default output: 00_shared/bib_files/cited_states.json)",
)
parser.add_argument(
"--config",
type=Path,
help="Path to config YAML file (default: auto-detect from script location)",
)
parser.add_argument(
"--output",
"-o",
type=Path,
help="Output file path (default: 00_shared/bib_files/cited_states.json for JSON, stdout for text)",
)
parser.add_argument(
"--verbose",
"-v",
action="store_true",
help="Enable verbose logging",
)
args = parser.parse_args()
# Setup logger
logger = getLogger(__name__, args.verbose)
# Determine what to show based on filter flags
filter_flags = {
"cited": args.bib_cited,
"uncited": args.bib_uncited,
"missing": args.bib_missing,
}
# If any filter is specified, show only those
# Otherwise, show all
if any(filter_flags.values()):
show_sections = {
name: enabled for name, enabled in filter_flags.items() if enabled
}
else:
show_sections = {"cited": True, "uncited": True, "missing": True}
# Load configuration
try:
load_config(args.config)
except FileNotFoundError as e:
logger.error(str(e))
return 1
# Get project root
if args.config:
project_root = args.config.parent.parent
else:
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parent.parent
# Find all .bib and .tex files
bib_dir = project_root / "00_shared"
manuscript_dir = project_root / "01_manuscript"
logger.info("Scanning files...")
bib_files = find_all_bib_files(bib_dir)
tex_files = find_all_tex_files(manuscript_dir)
if not bib_files:
logger.warning(f"No .bib files found in {bib_dir}")
if not tex_files:
logger.warning(f"No .tex files found in {manuscript_dir}")
logger.debug(f"Found {len(bib_files)} .bib files")
logger.debug(f"Found {len(tex_files)} .tex files")
# Extract all citation keys from .bib files
all_bib_keys = set()
for bib_file in bib_files:
keys = extract_bib_keys(bib_file)
all_bib_keys.update(keys)
logger.debug(f" {bib_file.name}: {len(keys)} references")
# Extract all citations from .tex files
all_citations = set()
for tex_file in tex_files:
citations = extract_citations_from_tex(tex_file)
all_citations.update(citations)
if citations:
logger.debug(f" {tex_file.name}: {len(citations)} citations")
# Generate citation data
data = generate_citation_data(all_bib_keys, all_citations, bib_files, tex_files)
# Always save JSON to default location unless custom output specified
default_json_path = project_root / "00_shared" / "bib_files" / "cited_states.json"
default_json_path.parent.mkdir(parents=True, exist_ok=True)
# Save JSON data to default location (always)
if not args.output:
json_str = json.dumps(data, indent=2, ensure_ascii=False)
default_json_path.write_text(json_str, encoding="utf-8")
# Output
if args.json:
# JSON output to stdout or custom file
if args.output:
json_str = json.dumps(data, indent=2, ensure_ascii=False)
args.output.write_text(json_str, encoding="utf-8")
logger.success(f"Saved to: {args.output}")
else:
# Already saved to default location, just print
print(json.dumps(data, indent=2, ensure_ascii=False))
logger.success(f"Saved to: {default_json_path}")
else:
# Text output
if args.output:
# For text file output, generate plain text without colors
with open(args.output, "w", encoding="utf-8") as f:
f.write("=" * 80 + "\n")
f.write("CITATION STATUS REPORT\n")
f.write("=" * 80 + "\n\n")
f.write("SUMMARY\n")
f.write("-" * 80 + "\n")
f.write(
f"Total references in .bib files: {data['summary']['total_references']}\n"
)
f.write(
f"Total citations in .tex files: {data['summary']['total_citations']}\n"
)
f.write(
f"Successfully cited: {data['summary']['successfully_cited']}\n"
)
f.write(
f"Uncited references: {data['summary']['uncited']}\n"
)
f.write(
f"Missing references: {data['summary']['missing']}\n\n"
)
if not args.summary_only:
f.write("SUCCESSFULLY CITED REFERENCES\n")
f.write("-" * 80 + "\n")
for key in data["details"]["successfully_cited"]:
f.write(f" ✓ {key}\n")
f.write("\n")
f.write("UNCITED REFERENCES (in .bib but not cited in .tex)\n")
f.write("-" * 80 + "\n")
for key in data["details"]["uncited_references"]:
f.write(f" ⚠ {key}\n")
f.write("\n")
f.write("MISSING REFERENCES (cited in .tex but not in .bib)\n")
f.write("-" * 80 + "\n")
for key in data["details"]["missing_references"]:
f.write(f" ✗ {key}\n")
f.write("\n")
f.write("=" * 80 + "\n")
logger.success(f"Saved to: {args.output}")
else:
# Console output with colors (JSON already saved to default location)
print_text_report(
data,
logger,
show_details=not args.summary_only,
show_sections=show_sections,
)
logger.success(f"Saved to: {default_json_path}")
# Return exit code based on results
if data["summary"]["missing"] > 0:
return 1 # Error: missing references
return 0
if __name__ == "__main__":
main()
# EOF