Writer

ywatanabe / test-project / / / / /
No commits yet
Blame
merge_bibliographies.py • 9.7 KB
Raw
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-11-09 21:30:00 (ywatanabe)"
# File: ./scripts/python/merge_bibliographies.py
"""
Merge multiple BibTeX files with smart deduplication.

Deduplication strategy:
1. By DOI (if available)
2. By normalized title + year
3. Merges metadata from duplicates
"""

import argparse
import hashlib
import json
import re
from pathlib import Path
from typing import List, Optional

try:
    import bibtexparser
    from bibtexparser.bibdatabase import BibDatabase
    from bibtexparser.bwriter import BibTexWriter
except ImportError:
    print("ERROR: bibtexparser not installed")
    print("Install with: pip install bibtexparser")
    exit(1)


def normalize_title(title: str) -> str:
    r"""Normalize a title so that cosmetic differences do not defeat matching.

    LaTeX commands of the form ``\cmd{text}`` are unwrapped to ``text``, the
    result is lowercased, every character that is neither a word character
    nor whitespace is removed, and runs of whitespace collapse to one space.

    Args:
        title: Raw title string of an entry; may be empty or None.

    Returns:
        The normalized title, or "" when *title* is falsy.
    """
    if not title:
        return ""

    # Unwrap LaTeX commands until nothing changes. One pass is not enough for
    # nested commands: "\textbf{\emph{x}}" becomes "\emph{x}" after the first
    # pass, and the punctuation strip below would then leave "emphx".
    # Each effective substitution shortens the string, so the loop terminates.
    latex_command = r"\\[a-zA-Z]+\{([^}]*)\}"
    previous = None
    while previous != title:
        previous = title
        title = re.sub(latex_command, r"\1", title)

    # Remove special characters and extra whitespace
    title = re.sub(r"[^\w\s]", "", title.lower())
    title = re.sub(r"\s+", " ", title).strip()
    return title


def get_doi(entry: dict) -> str:
    """Return the entry's DOI in a canonical form suitable for comparison.

    The value of the "doi" field is stripped of surrounding whitespace, a
    leading resolver URL (http/https, doi.org or dx.doi.org) or "doi:" label
    is removed, and the remainder is lowercased because DOI names are
    case-insensitive.

    Args:
        entry: Entry dictionary; the "doi" key may be absent, empty or None.

    Returns:
        The canonical DOI, or "" when the entry has no DOI.
    """
    doi = (entry.get("doi") or "").strip()
    if not doi:
        return ""

    # Anchored and with escaped dots, so only a genuine leading prefix is
    # removed and "doi.org" cannot match arbitrary characters.
    doi = re.sub(
        r"^(?:https?://(?:dx\.)?doi\.org/|doi:)\s*",
        "",
        doi,
        flags=re.IGNORECASE,
    )
    # Lowercase so "10.1000/ABC" and "10.1000/abc" are recognized as the same DOI.
    return doi.strip().lower()


def merge_entries(existing: dict, duplicate: dict) -> dict:
    """Combine two entries describing the same work into one richer entry.

    Starts from a shallow copy of *existing* (neither argument is mutated).
    For each field of *duplicate*:

    * a field that is missing or empty in the result is filled in;
    * otherwise the value with the longer string representation wins, on the
      assumption that it is the more detailed one;
    * the identity fields "ID" (citation key) and "ENTRYTYPE" are never
      replaced once set, so a merge cannot silently change the key documents
      cite or pair one entry's type with the other's fields.

    Args:
        existing: The entry already kept; its identity fields take precedence.
        duplicate: The later entry whose metadata is folded in.

    Returns:
        A new dictionary with the merged fields.
    """
    identity_keys = ("ID", "ENTRYTYPE")
    merged = existing.copy()

    for key, value in duplicate.items():
        current = merged.get(key)
        if not current:
            # Missing or empty so far: take whatever the duplicate offers.
            merged[key] = value
        elif key in identity_keys:
            # Length says nothing about which key/type is "better"; keep ours.
            continue
        elif value and len(str(value)) > len(str(current)):
            # Prefer longer/more detailed field
            merged[key] = value

    return merged


def deduplicate_entries(entries: List[dict]) -> tuple[List[dict], dict]:
    """
    Deduplicate BibTeX entries by DOI and by normalized title + year.

    Entries are processed in order. An entry is a duplicate when its DOI has
    been seen before, or otherwise when its (normalized title, year) pair has
    been seen before. Duplicates are merged into the earlier entry with
    merge_entries(); everything else is appended as a new unique entry.

    Args:
        entries: Entry dictionaries in the order they should take precedence.

    Returns:
        (unique_entries, stats) where stats has the keys "total_input",
        "unique_output", "duplicates_found" and "duplicates_merged".
    """
    unique = []
    doi_index = {}  # DOI -> index in unique list
    title_index = {}  # (normalized_title, year) -> index in unique list
    duplicates_found = 0
    duplicates_merged = 0

    for entry in entries:
        doi = get_doi(entry)
        title_norm = normalize_title(entry.get("title", ""))
        year = entry.get("year", "")
        # Title matching needs both parts; a bare title is too ambiguous.
        title_key = (title_norm, year) if title_norm and year else None

        # Check by DOI first (most reliable), then by title + year
        target_idx = None
        if doi and doi in doi_index:
            target_idx = doi_index[doi]
        elif title_key is not None and title_key in title_index:
            target_idx = title_index[title_key]

        if target_idx is None:
            # New unique entry
            target_idx = len(unique)
            unique.append(entry)
        else:
            duplicates_found += 1
            unique[target_idx] = merge_entries(unique[target_idx], entry)
            duplicates_merged += 1

        # Register this entry's keys for the slot it ended up in. This matters
        # for duplicates too: an entry matched by title may bring a DOI the
        # kept entry lacked (and vice versa), and later entries must be able
        # to find the merged record through it. setdefault never steals a key
        # that already belongs to another unique entry.
        if doi:
            doi_index.setdefault(doi, target_idx)
        if title_key is not None:
            title_index.setdefault(title_key, target_idx)

    stats = {
        "total_input": len(entries),
        "unique_output": len(unique),
        "duplicates_found": duplicates_found,
        "duplicates_merged": duplicates_merged,
    }

    return unique, stats


def calculate_files_hash(bib_files: List[Path]) -> str:
    """
    Fingerprint a set of .bib files by name and content.

    Files are visited in filename order, so the result does not depend on the
    order of *bib_files*. For each file the filename and the hex MD5 of its
    bytes are fed into one combined MD5.

    Args:
        bib_files: List of .bib file paths

    Returns:
        Hex digest of the combined hash
    """
    combined = hashlib.md5()
    ordered = sorted(bib_files, key=lambda path: path.name)

    for path in ordered:
        content_digest = hashlib.md5(path.read_bytes()).hexdigest()
        # Name first, then content digest: a rename changes the fingerprint
        # even when the bytes stay the same.
        combined.update(path.name.encode("utf-8"))
        combined.update(content_digest.encode("utf-8"))

    return combined.hexdigest()


def load_cache(cache_file: Path) -> Optional[dict]:
    """Load the cache record from *cache_file*.

    The cache is strictly best-effort: any problem reading it is treated as
    "no cache" rather than an error.

    Args:
        cache_file: Path to the JSON cache file.

    Returns:
        The decoded JSON object, or None when the file is missing, cannot be
        read, is not valid UTF-8/JSON, or does not contain a JSON object.
    """
    if not cache_file.exists():
        return None

    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a corrupt or binary cache file cannot crash the caller.
        return None

    # Callers use dict methods on the result; a stray list/str/number is as
    # useless as no cache at all.
    return data if isinstance(data, dict) else None


def save_cache(cache_file: Path, data: dict) -> None:
    """Write *data* to *cache_file* as indented JSON.

    Failing to open or write the file is not fatal: a warning is printed and
    the function returns normally.
    """
    try:
        with open(cache_file, mode="w", encoding="utf-8") as out:
            json.dump(data, out, indent=2)
    except OSError as e:  # IOError is an alias of OSError
        print(f"WARNING: Could not save cache: {e}")


def is_cache_valid(cache_file: Path, bib_files: List[Path], output_file: Path) -> bool:
    """
    Decide whether a previous merge result can be reused.

    The cache counts as valid only when the cache file and the output file
    both exist, the cache loads to a non-empty record, and the "input_hash"
    stored in it equals the hash of the current input files.

    Args:
        cache_file: Path to cache file
        bib_files: List of input .bib files
        output_file: Path to output file

    Returns:
        True if cache is valid and merge can be skipped
    """
    # Without both files on disk there is nothing to reuse.
    if not (cache_file.exists() and output_file.exists()):
        return False

    cached = load_cache(cache_file)
    if cached:
        fresh_hash = calculate_files_hash(bib_files)
        return cached.get("input_hash") == fresh_hash

    # Unreadable or empty cache record
    return False


def merge_bibtex_files(
    bib_dir: Path,
    output_file: str = "bibliography.bib",
    verbose: bool = True,
    force: bool = False,
) -> bool:
    """
    Merge all .bib files in directory with smart deduplication.

    Every ``*.bib`` file in *bib_dir* except the output file itself is parsed,
    the entries are deduplicated, and the result is written to
    ``bib_dir / output_file``. A small JSON cache (``.bibliography_cache.json``
    in *bib_dir*) records a hash of the inputs so an unchanged directory is
    not merged again.

    Input files are processed in filename order, so which of two duplicate
    entries takes precedence does not depend on the file system's listing
    order.

    Args:
        bib_dir: Directory containing .bib files
        output_file: Output filename (saved in bib_dir)
        verbose: Print progress messages
        force: Force merge even if cache is valid

    Returns:
        True if the output is up to date (freshly merged, or cache hit).
        False if there are no input files or none of them could be parsed;
        in the latter case an existing output file is left untouched.
    """
    bib_dir = Path(bib_dir)
    output_path = bib_dir / output_file
    cache_file = bib_dir / ".bibliography_cache.json"

    # Find all .bib files except the output file; sort once so that loading,
    # reporting and hashing all see the same deterministic order.
    bib_files = sorted(
        (f for f in bib_dir.glob("*.bib") if f.name != output_file),
        key=lambda f: f.name,
    )

    if not bib_files:
        if verbose:
            print(f"No .bib files found in {bib_dir}")
        return False

    # Check cache (skip if force=True)
    if not force and is_cache_valid(cache_file, bib_files, output_path):
        if verbose:
            print("✓ Bibliography cache valid (no changes detected)")
            print("  Use --force to rebuild")
        return True

    if verbose:
        print(f"Merging {len(bib_files)} bibliography files...")
        for f in bib_files:
            print(f"  - {f.name}")

    # Load and parse all files
    all_entries = []
    failed_files = []
    for bib_file in bib_files:
        try:
            with open(bib_file, "r", encoding="utf-8") as f:
                parser = bibtexparser.bparser.BibTexParser(
                    common_strings=True, ignore_nonstandard_types=False
                )
                bib_db = bibtexparser.load(f, parser=parser)
        except Exception as e:
            # One unreadable file should not abort the whole merge.
            print(f"ERROR: Failed to parse {bib_file}: {e}")
            failed_files.append(bib_file)
            continue

        all_entries.extend(bib_db.entries)
        if verbose:
            print(
                f"  Loaded: {len(bib_db.entries)} entries from {bib_file.name}"
            )

    if len(failed_files) == len(bib_files):
        # Nothing was parsed: writing now would replace a possibly good
        # bibliography with an empty one.
        print("ERROR: No bibliography file could be parsed; output not written")
        return False

    # Deduplicate
    unique_entries, stats = deduplicate_entries(all_entries)

    # Create output database
    output_db = BibDatabase()
    output_db.entries = unique_entries

    # Write output
    writer = BibTexWriter()
    writer.indent = "  "  # 2-space indentation
    # Sort by citation key. This must be a tuple of field names: a bare
    # string is iterated character by character ("I", "D"), which matches no
    # field and leaves the entries unsorted.
    writer.order_entries_by = ("ID",)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(writer.write(output_db))

    if failed_files:
        # The output is incomplete, so do not record it as a valid result;
        # the next run will try again instead of reporting a cache hit.
        if verbose:
            print(
                f"WARNING: {len(failed_files)} file(s) failed to parse; "
                "cache not updated"
            )
    else:
        # Save cache
        input_hash = calculate_files_hash(bib_files)
        cache_data = {
            "input_hash": input_hash,
            "input_files": [f.name for f in bib_files],
            "output_file": output_file,
            "stats": stats,
        }
        save_cache(cache_file, cache_data)

    if verbose:
        print(f"\n✓ Merged bibliography saved: {output_path}")
        print(f"  Input entries: {stats['total_input']}")
        print(f"  Unique entries: {stats['unique_output']}")
        print(f"  Duplicates removed: {stats['duplicates_merged']}")

    return True


def main():
    """Command-line interface.

    Parses the command line, runs the merge, and terminates the process with
    exit status 0 on success and 1 on failure.
    """
    parser = argparse.ArgumentParser(
        description="Merge BibTeX files with smart deduplication"
    )
    parser.add_argument(
        "bib_dir",
        nargs="?",
        default="00_shared/bib_files",
        help="Directory containing .bib files (default: 00_shared/bib_files)",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="bibliography.bib",
        help="Output filename (default: bibliography.bib)",
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Quiet mode (no output)"
    )
    parser.add_argument(
        "-f", "--force", action="store_true", help="Force merge (ignore cache)"
    )

    args = parser.parse_args()

    success = merge_bibtex_files(
        bib_dir=Path(args.bib_dir),
        output_file=args.output,
        verbose=not args.quiet,
        force=args.force,
    )

    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is missing under "python -S" or in
    # frozen/embedded interpreters.
    raise SystemExit(0 if success else 1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# EOF