#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Test file for: merge_bibliographies.py
import os
import sys
from pathlib import Path
import pytest
# Make <root>/scripts/python importable, where <root> is three directory
# levels above this test file.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT_DIR.joinpath("scripts", "python")))

# Guarded import of bibtexparser and the functions under test. HAS_DEPS
# records whether every import below succeeded.
try:
    import bibtexparser  # noqa: F401
    from merge_bibliographies import (
        deduplicate_entries,
        get_doi,
        merge_entries,
        normalize_title,
    )
except ImportError:
    HAS_DEPS = False
else:
    HAS_DEPS = True

# Module-level marker: skip every test in this file when the imports failed.
pytestmark = pytest.mark.skipif(not HAS_DEPS, reason="bibtexparser not installed")
class TestNormalizeTitle:
    """Test title normalization.

    Each test passes one raw title to ``normalize_title`` and pins the
    string expected back (the unicode case only checks a substring).
    """

    def test_normalize_title_lowercase(self):
        """Should convert title to lowercase."""
        result = normalize_title("The Quick Brown Fox")
        assert result == "the quick brown fox"

    def test_normalize_title_removes_latex(self):
        """Should remove LaTeX commands."""
        result = normalize_title(r"\textbf{Brain} Activity")
        assert result == "brain activity"

    def test_normalize_title_removes_punctuation(self):
        """Should remove punctuation marks."""
        result = normalize_title("Title: A Study, Part 1!")
        assert result == "title a study part 1"

    def test_normalize_title_removes_special_chars(self):
        """Should remove special characters."""
        result = normalize_title("Title & Research @ 2024")
        assert result == "title research 2024"

    def test_normalize_title_normalizes_whitespace(self):
        """Should normalize multiple spaces to single space."""
        # The input must contain real runs of spaces: with single spaces
        # only, this test would pass even if no collapsing took place.
        result = normalize_title("Title   with    spaces")
        assert result == "title with spaces"

    def test_normalize_title_strips_whitespace(self):
        """Should strip leading/trailing whitespace."""
        result = normalize_title(" Title ")
        assert result == "title"

    def test_normalize_title_empty_string(self):
        """Should handle empty string."""
        result = normalize_title("")
        assert result == ""

    def test_normalize_title_none(self):
        """Should handle None value."""
        result = normalize_title(None)
        assert result == ""

    def test_normalize_title_complex_latex(self):
        """Should handle complex LaTeX commands."""
        result = normalize_title(r"\emph{Important} \textit{Words}")
        assert result == "important words"

    def test_normalize_title_unicode(self):
        """Should handle unicode characters."""
        result = normalize_title("Café résumé naïve")
        # Only the ASCII prefix is checked, so the test does not depend on
        # whether accented letters are kept or dropped.
        assert "caf" in result.lower()
class TestGetDoi:
    """Tests for ``get_doi``, which reads the DOI out of an entry dict."""

    def test_get_doi_plain(self):
        """A DOI without any URL prefix comes back unchanged."""
        assert get_doi({"doi": "10.1234/test"}) == "10.1234/test"

    def test_get_doi_with_https_prefix(self):
        """A leading https://doi.org/ is removed."""
        assert get_doi({"doi": "https://doi.org/10.1234/test"}) == "10.1234/test"

    def test_get_doi_with_http_prefix(self):
        """A leading http://doi.org/ is removed."""
        assert get_doi({"doi": "http://doi.org/10.1234/test"}) == "10.1234/test"

    def test_get_doi_with_dx_prefix(self):
        """A leading https://dx.doi.org/ is removed."""
        assert get_doi({"doi": "https://dx.doi.org/10.1234/test"}) == "10.1234/test"

    def test_get_doi_empty(self):
        """An entry with no ``doi`` key yields the empty string."""
        assert get_doi({}) == ""

    def test_get_doi_whitespace(self):
        """Whitespace around the DOI value is trimmed."""
        assert get_doi({"doi": " 10.1234/test "}) == "10.1234/test"

    def test_get_doi_case_insensitive_url(self):
        """An upper-case URL prefix is removed just like a lower-case one."""
        assert get_doi({"doi": "HTTPS://DOI.ORG/10.1234/test"}) == "10.1234/test"
class TestMergeEntries:
    """Tests for ``merge_entries(existing, duplicate)``."""

    def test_merge_entries_prefers_longer_field(self):
        """The longer of two competing field values ends up in the result."""
        base = {"title": "Short", "author": "Alice"}
        other = {"title": "Much Longer Title", "year": "2024"}
        merged = merge_entries(base, other)
        assert merged["title"] == "Much Longer Title"
        assert merged["author"] == "Alice"
        assert merged["year"] == "2024"

    def test_merge_entries_fills_missing_fields(self):
        """Fields present only in the duplicate are copied into the result."""
        base = {"title": "Title", "author": "Alice"}
        other = {"title": "Title", "year": "2024", "journal": "Nature"}
        merged = merge_entries(base, other)
        assert merged["year"] == "2024"
        assert merged["journal"] == "Nature"

    def test_merge_entries_preserves_existing(self):
        """An empty value in the duplicate does not replace an existing value."""
        base = {"title": "Title", "author": "Alice", "year": "2024"}
        other = {"title": "Title", "author": "", "abstract": "Abstract"}
        merged = merge_entries(base, other)
        # The duplicate's empty author must not win over "Alice".
        assert merged["author"] == "Alice"
        assert merged["abstract"] == "Abstract"

    def test_merge_entries_returns_copy(self):
        """The merge result carries the new field; the input dict does not."""
        base = {"title": "Title"}
        merged = merge_entries(base, {"author": "Bob"})
        # The dict passed as ``existing`` is left without the merged field.
        assert "author" not in base
        assert merged["author"] == "Bob"

    def test_merge_entries_empty_duplicate(self):
        """Merging an empty duplicate keeps the existing fields."""
        base = {"title": "Title", "author": "Alice"}
        merged = merge_entries(base, {})
        assert merged["title"] == "Title"
        assert merged["author"] == "Alice"

    def test_merge_entries_prefers_content_over_empty(self):
        """A non-empty duplicate value replaces an empty existing value."""
        base = {"title": "Title", "abstract": ""}
        other = {"title": "Title", "abstract": "Real abstract content"}
        merged = merge_entries(base, other)
        assert merged["abstract"] == "Real abstract content"
class TestDeduplicateEntries:
    """Test entry deduplication.

    ``deduplicate_entries`` is called with a list of entry dicts and is
    unpacked here as ``(unique, stats)``: the surviving entries and a dict
    of counters. Tests that do not inspect the counters bind them to ``_``.
    """

    def test_deduplicate_by_doi(self):
        """Should deduplicate by DOI."""
        entries = [
            {"ID": "entry1", "doi": "10.1234/test", "title": "Title", "year": "2024"},
            {"ID": "entry2", "doi": "10.1234/test", "title": "Title", "year": "2024"},
        ]
        unique, stats = deduplicate_entries(entries)
        assert len(unique) == 1
        assert stats["total_input"] == 2
        assert stats["unique_output"] == 1
        assert stats["duplicates_found"] == 1

    def test_deduplicate_by_title_year(self):
        """Should deduplicate by normalized title + year."""
        entries = [
            {"ID": "entry1", "title": "The Brain Study", "year": "2024"},
            {"ID": "entry2", "title": "The Brain Study", "year": "2024"},
        ]
        unique, stats = deduplicate_entries(entries)
        assert len(unique) == 1
        assert stats["duplicates_found"] == 1

    def test_deduplicate_different_years_not_duplicates(self):
        """Should not deduplicate same title with different years."""
        entries = [
            {"ID": "entry1", "title": "Annual Report", "year": "2023"},
            {"ID": "entry2", "title": "Annual Report", "year": "2024"},
        ]
        unique, stats = deduplicate_entries(entries)
        assert len(unique) == 2
        assert stats["duplicates_found"] == 0

    def test_deduplicate_stats(self):
        """Should return accurate statistics."""
        entries = [
            {"ID": "entry1", "doi": "10.1234/a", "title": "Title A", "year": "2024"},
            {"ID": "entry2", "doi": "10.1234/a", "title": "Title A", "year": "2024"},
            {"ID": "entry3", "doi": "10.1234/b", "title": "Title B", "year": "2024"},
        ]
        _, stats = deduplicate_entries(entries)
        assert stats["total_input"] == 3
        assert stats["unique_output"] == 2
        assert stats["duplicates_found"] == 1
        assert stats["duplicates_merged"] == 1

    def test_deduplicate_empty_list(self):
        """Should handle empty entry list."""
        unique, stats = deduplicate_entries([])
        assert len(unique) == 0
        assert stats["total_input"] == 0
        assert stats["unique_output"] == 0

    def test_deduplicate_merges_metadata(self):
        """Should merge metadata from duplicates."""
        entries = [
            {
                "ID": "entry1",
                "doi": "10.1234/test",
                "title": "Title",
                "author": "Alice",
            },
            {
                "ID": "entry2",
                "doi": "10.1234/test",
                "title": "Title",
                "abstract": "Abstract",
            },
        ]
        unique, _ = deduplicate_entries(entries)
        assert len(unique) == 1
        # Should have both author and abstract
        assert unique[0]["author"] == "Alice"
        assert unique[0]["abstract"] == "Abstract"

    def test_deduplicate_doi_takes_precedence(self):
        """DOI matching should take precedence over title matching."""
        entries = [
            {"ID": "entry1", "doi": "10.1234/a", "title": "Title X", "year": "2024"},
            {"ID": "entry2", "doi": "10.1234/a", "title": "Title Y", "year": "2024"},
        ]
        unique, _ = deduplicate_entries(entries)
        # Should be deduplicated by DOI even though titles differ
        assert len(unique) == 1

    def test_deduplicate_no_doi_or_title(self):
        """Should handle entries without DOI or title."""
        entries = [
            {"ID": "entry1", "author": "Alice"},
            {"ID": "entry2", "author": "Bob"},
        ]
        unique, _ = deduplicate_entries(entries)
        # Should keep both (can't deduplicate without DOI or title+year)
        assert len(unique) == 2

    def test_deduplicate_latex_in_titles(self):
        """Should normalize LaTeX commands in titles for comparison."""
        entries = [
            {"ID": "entry1", "title": r"\textbf{Brain} Activity", "year": "2024"},
            {"ID": "entry2", "title": "Brain Activity", "year": "2024"},
        ]
        unique, _ = deduplicate_entries(entries)
        # Should be considered duplicates after normalization
        assert len(unique) == 1

    def test_deduplicate_preserves_order(self):
        """Should preserve entry order (first occurrence wins)."""
        entries = [
            {"ID": "first", "doi": "10.1234/test", "title": "Title"},
            {"ID": "second", "title": "Other", "year": "2024"},
            {"ID": "third", "doi": "10.1234/test", "title": "Title"},
        ]
        unique, _ = deduplicate_entries(entries)
        # "third" shares its DOI with "first", so only two entries may
        # remain; without this check the test would also pass if nothing
        # had been deduplicated at all.
        assert len(unique) == 2
        # First entry with DOI should be kept
        assert unique[0]["ID"] == "first"
        assert unique[1]["ID"] == "second"

    def test_deduplicate_case_insensitive_title(self):
        """Title comparison should be case-insensitive."""
        entries = [
            {"ID": "entry1", "title": "THE BRAIN STUDY", "year": "2024"},
            {"ID": "entry2", "title": "the brain study", "year": "2024"},
        ]
        unique, _ = deduplicate_entries(entries)
        assert len(unique) == 1
if __name__ == "__main__":
    # Run only this file, verbosely, and hand pytest's exit code to the
    # shell so that a failing run does not exit with status 0.
    sys.exit(pytest.main([os.path.abspath(__file__), "-v"]))