#!/usr/bin/env python3 # -*- coding: utf-8 -*- # Timestamp: "2025-11-09 21:30:00 (ywatanabe)" # File: ./scripts/python/merge_bibliographies.py """ Merge multiple BibTeX files with smart deduplication. Deduplication strategy: 1. By DOI (if available) 2. By normalized title + year 3. Merges metadata from duplicates """ import argparse import hashlib import json import re from pathlib import Path from typing import List, Optional try: import bibtexparser from bibtexparser.bibdatabase import BibDatabase from bibtexparser.bwriter import BibTexWriter except ImportError: print("ERROR: bibtexparser not installed") print("Install with: pip install bibtexparser") exit(1) def normalize_title(title: str) -> str: """Normalize title for comparison (lowercase, no punctuation).""" if not title: return "" # Remove LaTeX commands title = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", title) # Remove special characters and extra whitespace title = re.sub(r"[^\w\s]", "", title.lower()) title = re.sub(r"\s+", " ", title).strip() return title def get_doi(entry: dict) -> str: """Extract DOI from entry.""" doi = entry.get("doi", "").strip() if doi: # Normalize DOI (remove URL prefix if present) doi = re.sub(r"https?://doi.org/", "", doi, flags=re.IGNORECASE) doi = re.sub(r"https?://dx.doi.org/", "", doi, flags=re.IGNORECASE) return doi def merge_entries(existing: dict, duplicate: dict) -> dict: """Merge metadata from duplicate entries, preferring more complete info.""" merged = existing.copy() # Prefer entries with more fields for key, value in duplicate.items(): if key not in merged or not merged[key]: merged[key] = value elif value and len(str(value)) > len(str(merged[key])): # Prefer longer/more detailed field merged[key] = value return merged def deduplicate_entries(entries: List[dict]) -> tuple[List[dict], dict]: """ Deduplicate BibTeX entries by DOI and title. Returns: (unique_entries, stats) """ unique = [] doi_index = {} # DOI -> index in unique list title_index = {} # (normalized_title, year) -> index in unique list duplicates_found = 0 duplicates_merged = 0 for entry in entries: doi = get_doi(entry) title = entry.get("title", "") year = entry.get("year", "") title_norm = normalize_title(title) is_duplicate = False merge_with_idx = None # Check by DOI first (most reliable) if doi and doi in doi_index: is_duplicate = True merge_with_idx = doi_index[doi] duplicates_found += 1 # Check by title + year elif title_norm and year: key = (title_norm, year) if key in title_index: is_duplicate = True merge_with_idx = title_index[key] duplicates_found += 1 if is_duplicate and merge_with_idx is not None: # Merge metadata with existing entry merge_with = unique[merge_with_idx] merged = merge_entries(merge_with, entry) # Update in unique list unique[merge_with_idx] = merged # Indices remain the same (still pointing to same position) # No need to update doi_index or title_index duplicates_merged += 1 else: # New unique entry new_idx = len(unique) unique.append(entry) # Index it by position if doi: doi_index[doi] = new_idx if title_norm and year: title_index[(title_norm, year)] = new_idx stats = { "total_input": len(entries), "unique_output": len(unique), "duplicates_found": duplicates_found, "duplicates_merged": duplicates_merged, } return unique, stats def calculate_files_hash(bib_files: List[Path]) -> str: """ Calculate MD5 hash of all input .bib files. Args: bib_files: List of .bib file paths Returns: Hex digest of combined file hashes """ hasher = hashlib.md5() # Sort files for consistent hashing for bib_file in sorted(bib_files, key=lambda x: x.name): # Include filename in hash hasher.update(bib_file.name.encode("utf-8")) # Include file content hash with open(bib_file, "rb") as f: file_hash = hashlib.md5(f.read()).hexdigest() hasher.update(file_hash.encode("utf-8")) return hasher.hexdigest() def load_cache(cache_file: Path) -> Optional[dict]: """Load cache from file.""" if not cache_file.exists(): return None try: with open(cache_file, "r", encoding="utf-8") as f: return json.load(f) except (json.JSONDecodeError, IOError): return None def save_cache(cache_file: Path, data: dict) -> None: """Save cache to file.""" try: with open(cache_file, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) except IOError as e: print(f"WARNING: Could not save cache: {e}") def is_cache_valid(cache_file: Path, bib_files: List[Path], output_file: Path) -> bool: """ Check if cached merge is still valid. Args: cache_file: Path to cache file bib_files: List of input .bib files output_file: Path to output file Returns: True if cache is valid and merge can be skipped """ # No cache file if not cache_file.exists(): return False # No output file if not output_file.exists(): return False # Load cache cache = load_cache(cache_file) if not cache: return False # Calculate current hash current_hash = calculate_files_hash(bib_files) # Compare with cached hash return cache.get("input_hash") == current_hash def merge_bibtex_files( bib_dir: Path, output_file: str = "bibliography.bib", verbose: bool = True, force: bool = False, ) -> bool: """ Merge all .bib files in directory with smart deduplication. Args: bib_dir: Directory containing .bib files output_file: Output filename (saved in bib_dir) verbose: Print progress messages force: Force merge even if cache is valid Returns: True if successful """ bib_dir = Path(bib_dir) output_path = bib_dir / output_file cache_file = bib_dir / ".bibliography_cache.json" # Find all .bib files except the output file bib_files = [f for f in bib_dir.glob("*.bib") if f.name != output_file] if not bib_files: if verbose: print(f"No .bib files found in {bib_dir}") return False # Check cache (skip if force=True) if not force and is_cache_valid(cache_file, bib_files, output_path): if verbose: print("āœ“ Bibliography cache valid (no changes detected)") print(" Use --force to rebuild") return True if verbose: print(f"Merging {len(bib_files)} bibliography files...") for f in sorted(bib_files): print(f" - {f.name}") # Load and parse all files all_entries = [] for bib_file in bib_files: try: with open(bib_file, "r", encoding="utf-8") as f: parser = bibtexparser.bparser.BibTexParser( common_strings=True, ignore_nonstandard_types=False ) bib_db = bibtexparser.load(f, parser=parser) all_entries.extend(bib_db.entries) if verbose: print( f" Loaded: {len(bib_db.entries)} entries from {bib_file.name}" ) except Exception as e: print(f"ERROR: Failed to parse {bib_file}: {e}") continue # Deduplicate unique_entries, stats = deduplicate_entries(all_entries) # Create output database output_db = BibDatabase() output_db.entries = unique_entries # Write output writer = BibTexWriter() writer.indent = " " # 2-space indentation writer.order_entries_by = "ID" # Sort by citation key with open(output_path, "w", encoding="utf-8") as f: f.write(writer.write(output_db)) # Save cache input_hash = calculate_files_hash(bib_files) cache_data = { "input_hash": input_hash, "input_files": [f.name for f in sorted(bib_files, key=lambda x: x.name)], "output_file": output_file, "stats": stats, } save_cache(cache_file, cache_data) if verbose: print(f"\nāœ“ Merged bibliography saved: {output_path}") print(f" Input entries: {stats['total_input']}") print(f" Unique entries: {stats['unique_output']}") print(f" Duplicates removed: {stats['duplicates_merged']}") return True def main(): """Command-line interface.""" parser = argparse.ArgumentParser( description="Merge BibTeX files with smart deduplication" ) parser.add_argument( "bib_dir", nargs="?", default="00_shared/bib_files", help="Directory containing .bib files (default: 00_shared/bib_files)", ) parser.add_argument( "-o", "--output", default="bibliography.bib", help="Output filename (default: bibliography.bib)", ) parser.add_argument( "-q", "--quiet", action="store_true", help="Quiet mode (no output)" ) parser.add_argument( "-f", "--force", action="store_true", help="Force merge (ignore cache)" ) args = parser.parse_args() success = merge_bibtex_files( bib_dir=Path(args.bib_dir), output_file=args.output, verbose=not args.quiet, force=args.force, ) exit(0 if success else 1) if __name__ == "__main__": main() # EOF