#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: scripts/python/check_references.py
# Purpose: Validate all cross-references, citations, and labels in LaTeX manuscripts
# Usage:
#   python check_references.py [project_dir] [--doc-type manuscript|supplementary|all]
#   python check_references.py [project_dir] --log   # Also parse .log for LaTeX warnings
#
# Checks:
#   1. Undefined references: \ref{X} where \label{X} doesn't exist
#   2. Undefined citations: \cite{X} where X not in any .bib file
#   3. Multiply defined labels: \label{X} defined more than once
#   4. Orphan labels: \label{X} never referenced
#   5. Orphan bib entries: @article{X,...} never cited (info only)
#   6. LaTeX log warnings (optional --log)

import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path

# ANSI colors
GREEN = "\033[0;32m"
YELLOW = "\033[1;33m"
RED = "\033[0;31m"
DIM = "\033[0;90m"
BOLD = "\033[1m"
NC = "\033[0m"

PASS_COUNT = 0
WARN_COUNT = 0
FAIL_COUNT = 0

# A '%' starts a LaTeX comment UNLESS it is escaped as '\%' (a literal percent
# sign, common in scientific prose, e.g. "95\%").  The previous
# line.split("%")[0] approach truncated such lines and silently hid every
# \ref/\label/\cite that followed.  NOTE: the rare sequence '\\%' (forced line
# break followed by a comment) is still mishandled by the lookbehind, but that
# is far rarer than \% in body text.
_COMMENT_RE = re.compile(r"(?<!\\)%.*")

# BibTeX directives that are not citable entries and must not be treated as keys.
_BIB_DIRECTIVES = frozenset({"comment", "string", "preamble"})


def log_pass(msg):
    """Print a green [PASS] line and bump the global pass counter."""
    global PASS_COUNT
    print(f"  {GREEN}[PASS]{NC} {msg}")
    PASS_COUNT += 1


def log_warn(msg):
    """Print a yellow [WARN] line and bump the global warning counter."""
    global WARN_COUNT
    print(f"  {YELLOW}[WARN]{NC} {msg}")
    WARN_COUNT += 1


def log_fail(msg):
    """Print a red [FAIL] line and bump the global failure counter."""
    global FAIL_COUNT
    print(f"  {RED}[FAIL]{NC} {msg}")
    FAIL_COUNT += 1


def log_detail(msg):
    """Print an indented, dimmed detail line under a PASS/WARN/FAIL entry."""
    print(f"    {DIM}{msg}{NC}")


def collect_tex_files(doc_dir):
    """Collect SOURCE .tex files only (not generated/compiled files).

    Scans: contents/*.tex, caption_and_media/*.tex, base.tex
    Skips: versioned/diff artifacts (foo_v2.tex, foo_diff.tex) and anything
    outside the scanned directories (manuscript.tex, supplementary.tex, ...).

    Returns a sorted, de-duplicated list of Path objects (sorted so that
    report output is deterministic between runs).
    """
    # Patterns for generated/archived files to skip
    skip_patterns = re.compile(r"_v\d+\.tex$|_diff\.tex$")
    files = []
    content_dir = doc_dir / "contents"
    if content_dir.exists():
        for f in content_dir.glob("*.tex"):
            if not skip_patterns.search(f.name):
                files.append(f)
        for subdir in ["figures/caption_and_media", "tables/caption_and_media"]:
            d = content_dir / subdir
            if d.exists():
                files.extend(d.glob("*.tex"))
    # Include base.tex (structural template) but NOT generated files
    base = doc_dir / "base.tex"
    if base.exists():
        files.append(base)
    return sorted(set(files))


def _iter_tex_matches(tex_files, pattern):
    """Yield (file, line_no, match) for every *pattern* match in *tex_files*.

    Lines are stripped of unescaped '%' comments before matching, so commands
    inside comments are ignored while text after a literal '\\%' is kept.
    Shared by extract_refs / extract_labels / extract_citations.
    """
    for f in tex_files:
        text = f.read_text(encoding="utf-8", errors="replace")
        for line_no, line in enumerate(text.splitlines(), 1):
            for m in pattern.finditer(_COMMENT_RE.sub("", line)):
                yield f, line_no, m


def extract_refs(tex_files):
    """Extract all \\ref{...} from tex files.

    Returns dict: ref_key -> [(file, line_no), ...]
    """
    refs = defaultdict(list)
    pattern = re.compile(r"\\ref\{([^}]+)\}")
    for f, line_no, m in _iter_tex_matches(tex_files, pattern):
        key = m.group(1)
        # Skip LaTeX macro arguments like #1
        if key.startswith("#"):
            continue
        refs[key].append((f, line_no))
    return dict(refs)


def extract_labels(tex_files):
    """Extract all \\label{...} from tex files.

    Returns dict: label_key -> [(file, line_no), ...]
    """
    labels = defaultdict(list)
    pattern = re.compile(r"\\label\{([^}]+)\}")
    for f, line_no, m in _iter_tex_matches(tex_files, pattern):
        labels[m.group(1)].append((f, line_no))
    return dict(labels)


def infer_auto_labels(doc_dir):
    """Infer labels auto-generated by scitex-writer preprocessing.

    The compile pipeline creates \\label{fig:STEM} and \\label{tab:STEM}
    from filenames in caption_and_media/ directories.

    Returns dict: label_key -> [(file, 0), ...]  (line 0 = auto-generated)
    """
    labels = defaultdict(list)
    content_dir = doc_dir / "contents"
    if not content_dir.exists():
        return dict(labels)
    for float_type, subdir in [("fig", "figures"), ("tab", "tables")]:
        media_dir = content_dir / subdir / "caption_and_media"
        if not media_dir.exists():
            continue
        for f in media_dir.glob("[0-9]*.tex"):
            stem = f.stem
            # Skip panel patterns (e.g., 01a_name)
            if re.match(r"^\d+[a-zA-Z]_", stem):
                continue
            key = f"{float_type}:{stem}"
            labels[key].append((f, 0))
    return dict(labels)


def extract_citations(tex_files):
    """Extract all \\cite{...}, \\citep{...}, \\citet{...} etc.

    Handles multi-key citations like \\citep{Key1, Key2}.

    Returns dict: cite_key -> [(file, line_no), ...]
    """
    cites = defaultdict(list)
    pattern = re.compile(
        r"\\(?:cite|citep|citet|citealt|citeauthor|citeyear)\{([^}]+)\}"
    )
    for f, line_no, m in _iter_tex_matches(tex_files, pattern):
        for key in m.group(1).split(","):
            key = key.strip()
            if key:
                cites[key].append((f, line_no))
    return dict(cites)


def extract_bib_keys(bib_dir):
    """Extract all entry keys from .bib files.

    @string / @comment / @preamble directives are skipped — they are not
    citable entries and would otherwise inflate the unused-entry count.

    Returns dict: bib_key -> bib_file
    """
    keys = {}
    if not bib_dir.exists():
        return keys
    pattern = re.compile(r"@(\w+)\{([^,\s]+)")
    for f in bib_dir.glob("*.bib"):
        text = f.read_text(encoding="utf-8", errors="replace")
        for m in pattern.finditer(text):
            if m.group(1).lower() in _BIB_DIRECTIVES:
                continue
            keys[m.group(2).strip()] = f
    return keys


def parse_log_warnings(log_file):
    """Parse LaTeX .log file for reference/citation warnings.

    Returns list of warning strings.
    """
    warnings = []
    if not log_file.exists():
        return warnings
    text = log_file.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        if "Reference" in line and "undefined" in line:
            warnings.append(line.strip())
        elif "Citation" in line and "undefined" in line:
            warnings.append(line.strip())
        elif "multiply defined" in line:
            warnings.append(line.strip())
    return warnings


def check_undefined_refs(refs, labels, doc_label):
    """Check for \\ref{X} where no \\label{X} exists."""
    missing = {k: v for k, v in refs.items() if k not in labels}
    if not missing:
        log_pass(f"All references resolved ({doc_label}): {len(refs)} refs")
    else:
        log_fail(f"Undefined references ({doc_label}): {len(missing)} broken")
        for key, locations in sorted(missing.items()):
            for f, line in locations:
                log_detail(f"{f.name}:{line}: \\ref{{{key}}} -> ?? (no \\label)")


def check_undefined_cites(cites, bib_keys, doc_label):
    """Check for \\cite{X} where X not in any .bib file."""
    missing = {k: v for k, v in cites.items() if k not in bib_keys}
    if not missing:
        log_pass(f"All citations resolved ({doc_label}): {len(cites)} citations")
    else:
        log_fail(f"Undefined citations ({doc_label}): {len(missing)} missing from .bib")
        for key, locations in sorted(missing.items()):
            for f, line in locations:
                log_detail(f"{f.name}:{line}: \\cite{{{key}}} -> not in bibliography")


def check_multiply_defined(labels, doc_label):
    """Check for \\label{X} defined more than once."""
    dupes = {k: v for k, v in labels.items() if len(v) > 1}
    if not dupes:
        log_pass(f"No multiply-defined labels ({doc_label})")
    else:
        log_warn(f"Multiply-defined labels ({doc_label}): {len(dupes)} duplicates")
        for key, locations in sorted(dupes.items()):
            for f, line in locations:
                log_detail(f"{f.name}:{line}: \\label{{{key}}}")


def check_orphan_labels(refs, labels, doc_label):
    """Check for \\label{X} never referenced (warning only)."""
    # Exclude common structural labels that are referenced by LaTeX internals
    structural_prefixes = (
        "star ",
        "acknowledgment",
        "author ",
        "declaration",
        "data and code",
        "figures",
        "tables",
    )
    orphans = {}
    for k, v in labels.items():
        if k not in refs:
            if not any(k.lower().startswith(p) for p in structural_prefixes):
                orphans[k] = v
    if not orphans:
        log_pass(f"No orphan labels ({doc_label})")
    else:
        log_warn(
            f"Orphan labels ({doc_label}): {len(orphans)} defined but never referenced"
        )
        for key, locations in sorted(orphans.items()):
            for f, line in locations:
                log_detail(f"{f.name}:{line}: \\label{{{key}}} never referenced")


def check_orphan_bib(cites, bib_keys):
    """Report bib entries that are never cited (info only)."""
    all_cited = set(cites.keys())
    unused = {k: v for k, v in bib_keys.items() if k not in all_cited}
    total = len(bib_keys)
    used = total - len(unused)
    if not unused:
        log_pass(f"All {total} bib entries cited")
    else:
        # This is info, not a warning - unused bib entries are fine
        print(
            f"  {DIM}[INFO]{NC} Bibliography: {used}/{total} entries cited, {len(unused)} unused"
        )


def check_log_warnings(log_file, doc_label):
    """Parse LaTeX log for reference warnings (shows at most 20 details)."""
    warnings = parse_log_warnings(log_file)
    if not warnings:
        log_pass(f"No LaTeX warnings ({doc_label})")
    else:
        log_warn(f"LaTeX warnings ({doc_label}): {len(warnings)}")
        for w in warnings[:20]:
            log_detail(w)
        if len(warnings) > 20:
            log_detail(f"... and {len(warnings) - 20} more")


def main():
    """CLI entry point.  Returns a process exit code (0 ok, 1 broken refs)."""
    global PASS_COUNT, WARN_COUNT, FAIL_COUNT
    parser = argparse.ArgumentParser(
        description="Check cross-references, citations, and labels in LaTeX manuscripts"
    )
    parser.add_argument(
        "project_dir",
        nargs="?",
        default=".",
        help="Project root directory (default: current directory)",
    )
    parser.add_argument(
        "--doc-type",
        choices=["manuscript", "supplementary", "all"],
        default="all",
        help="Which document type to check (default: all)",
    )
    parser.add_argument(
        "--log",
        action="store_true",
        help="Also parse LaTeX .log files for warnings",
    )
    args = parser.parse_args()

    project_dir = Path(args.project_dir).resolve()
    bib_dir = project_dir / "00_shared" / "bib_files"

    # Collect document directories
    doc_dirs = []
    if args.doc_type in ("manuscript", "all"):
        d = project_dir / "01_manuscript"
        if d.exists():
            doc_dirs.append(("manuscript", d))
    if args.doc_type in ("supplementary", "all"):
        d = project_dir / "02_supplementary"
        if d.exists():
            doc_dirs.append(("supplementary", d))
    if not doc_dirs:
        print(f"{RED}No document directories found in {project_dir}{NC}")
        return 1

    print(f"\n{BOLD}=== Reference Check ==={NC}\n")

    # Global bib keys
    bib_keys = extract_bib_keys(bib_dir)

    # Aggregate all refs/labels/cites across all doc types for cross-document references
    all_refs = {}
    all_labels = {}
    all_cites = {}
    for doc_label, doc_dir in doc_dirs:
        tex_files = collect_tex_files(doc_dir)
        refs = extract_refs(tex_files)
        labels = extract_labels(tex_files)
        auto_labels = infer_auto_labels(doc_dir)
        cites = extract_citations(tex_files)
        # Merge into global
        for k, v in refs.items():
            all_refs.setdefault(k, []).extend(v)
        for k, v in labels.items():
            all_labels.setdefault(k, []).extend(v)
        # Only add auto-labels if not already explicitly defined
        for k, v in auto_labels.items():
            if k not in all_labels:
                all_labels.setdefault(k, []).extend(v)
        for k, v in cites.items():
            all_cites.setdefault(k, []).extend(v)

    # Run checks on aggregated data
    check_undefined_refs(all_refs, all_labels, "all documents")
    check_undefined_cites(all_cites, bib_keys, "all documents")
    check_multiply_defined(all_labels, "all documents")
    check_orphan_labels(all_refs, all_labels, "all documents")
    check_orphan_bib(all_cites, bib_keys)

    # Optionally check LaTeX logs
    if args.log:
        print()
        for doc_label, doc_dir in doc_dirs:
            log_dir = doc_dir / "logs"
            if log_dir.exists():
                for log_file in log_dir.glob("*.log"):
                    if log_file.name.startswith(("manuscript", "supplementary")):
                        check_log_warnings(log_file, f"{doc_label}/{log_file.name}")

    # Summary
    print()
    print(
        f"{BOLD}Summary:{NC} "
        f"{GREEN}{PASS_COUNT} passed{NC}, "
        f"{YELLOW}{WARN_COUNT} warnings{NC}, "
        f"{RED}{FAIL_COUNT} errors{NC}"
    )
    if FAIL_COUNT > 0:
        print(f"\n{RED}Broken references will show as ?? in the compiled PDF.{NC}")
        return 1
    elif WARN_COUNT > 0:
        print(f"\n{YELLOW}Warnings may indicate issues worth reviewing.{NC}")
        return 0
    else:
        print(f"\n{GREEN}All references and citations are valid.{NC}")
        return 0


if __name__ == "__main__":
    sys.exit(main())