#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Test file for: generate_ai2_prompt.py
import os
import re
import sys
from pathlib import Path
# Add scripts/python to path for imports
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT_DIR / "scripts" / "python"))
import pytest # noqa: E402
# Re-implement key functions locally for testing
def read_tex_content(tex_path):
    """Return the text of a .tex file without its full-line comments.

    Local copy used for testing.

    Args:
        tex_path: Path-like object offering ``exists()`` and ``read_text()``.

    Returns:
        The UTF-8 decoded content with every line whose first non-blank
        character is ``%`` removed and surrounding whitespace trimmed, or
        an empty string when the file does not exist.
    """
    if tex_path.exists():
        raw_text = tex_path.read_text(encoding="utf-8")
        kept_rows = []
        for row in raw_text.split("\n"):
            # Leading blanks are ignored when deciding whether a row is a comment.
            if row.strip().startswith("%"):
                continue
            kept_rows.append(row)
        return "\n".join(kept_rows).strip()
    return ""
def clean_latex_content(content):
    """Reduce LaTeX markup to plain text (local copy used for testing).

    Args:
        content: String containing LaTeX source.

    Returns:
        The text with PDF bookmarks, environment markers, commands and
        braces removed, whitespace runs collapsed, and both ends stripped.
    """
    # Deleted outright, in this order: PDF bookmarks, \begin{...}, \end{...},
    # then any command immediately followed by an [optional] argument.
    dropped_first = (
        r"\\pdfbookmark\[[^\]]*\]\{[^}]*\}\{[^}]*\}",
        r"\\begin\{[^}]+\}",
        r"\\end\{[^}]+\}",
        r"\\[a-zA-Z]+\[[^\]]*\]",
    )
    text = content
    for pattern in dropped_first:
        text = re.sub(pattern, "", text)

    # Unwrap \cmd{body} -> body. Each pass only matches brace-free bodies,
    # so it peels the innermost level; three passes are performed.
    unwrap = re.compile(r"\\[a-zA-Z]+\{([^{}]*)\}")
    passes_done = 0
    while passes_done < 3:
        text = unwrap.sub(r"\1", text)
        passes_done += 1

    # Leftover markup: bare commands first, then stray braces.
    for pattern in (r"\\[a-zA-Z]+", r"\{", r"\}"):
        text = re.sub(pattern, "", text)

    # Squeeze three or more line breaks (optionally separated by whitespace)
    # down to a single blank line, then collapse runs of spaces.
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)
    text = re.sub(r" +", " ", text)
    return text.strip()
HEADER = """# Literature Search Request
We are preparing a manuscript with the information provided below.
1. Please identify related papers that may be relevant to our work.
2. Comprehensive results are welcome, as we will evaluate all suggestions for relevance.
3. Your contribution to advancing scientific research is greatly appreciated.
4. If possible, please output as a BibTeX file (.bib)."""
def generate_ai2_prompt(title, keywords, authors, abstract, sections=None):
    """Assemble the literature-search prompt (simplified version for testing).

    Args:
        title: Raw title text, possibly containing LaTeX markup.
        keywords: Raw keywords text.
        authors: Raw authors text.
        abstract: Raw abstract text.
        sections: Collection of section names to include, drawn from
            "title", "keywords", "authors" and "abstract". ``None`` selects
            all four.

    Returns:
        ``HEADER`` followed by one ``## <Heading>`` section for each
        requested field that is non-empty both before and after LaTeX
        cleaning, in the fixed order title, keywords, authors, abstract.
        The result is stripped of surrounding whitespace.
    """
    wanted = ["title", "keywords", "authors", "abstract"] if sections is None else sections
    candidates = (
        ("title", "Title", title),
        ("keywords", "Keywords", keywords),
        ("authors", "Authors", authors),
        ("abstract", "Abstract", abstract),
    )

    chunks = [HEADER, ""]
    for key, heading, raw in candidates:
        if key not in wanted or not raw:
            continue
        cleaned = clean_latex_content(raw)
        if not cleaned:
            continue
        chunks.append(f"## {heading}\n{cleaned}")
        # A blank separator follows every section except the final one.
        if key != "abstract":
            chunks.append("")
    return "\n".join(chunks).strip()
# Tests for read_tex_content
def test_read_tex_content_normal(tmp_path):
    """A file without comment lines comes back unchanged."""
    source = tmp_path / "test.tex"
    source.write_text("This is normal content\nWith multiple lines")
    assert read_tex_content(source) == "This is normal content\nWith multiple lines"
def test_read_tex_content_removes_comments(tmp_path):
    """Comment lines are dropped while the remaining lines are kept."""
    source = tmp_path / "test.tex"
    source.write_text("""
This is valid content
% This is a comment line
More valid content
% Another comment
""")
    result = read_tex_content(source)
    for expected in ("This is valid content", "More valid content"):
        assert expected in result
    # Both comment lines contain the word "comment" (in differing case).
    assert "comment" not in result.lower()
def test_read_tex_content_missing_file(tmp_path):
    """A path that does not exist yields an empty string."""
    absent = tmp_path / "nonexistent.tex"
    assert read_tex_content(absent) == ""
def test_read_tex_content_empty_file(tmp_path):
    """An existing but empty file yields an empty string."""
    blank = tmp_path / "empty.tex"
    blank.write_text("")
    assert read_tex_content(blank) == ""
def test_read_tex_content_only_comments(tmp_path):
    """A file holding nothing but comment lines yields an empty string."""
    source = tmp_path / "comments.tex"
    source.write_text("""
% Comment 1
% Comment 2
% Comment 3
""")
    result = read_tex_content(source)
    assert result == ""
# Tests for clean_latex_content
def test_clean_latex_removes_begin_end(tmp_path):
    """Environment begin/end markers vanish while the enclosed text stays."""
    result = clean_latex_content("\\begin{abstract}This is text\\end{abstract}")
    for marker in ("begin", "end"):
        assert marker not in result
    assert "This is text" in result
def test_clean_latex_removes_commands(tmp_path):
    """Formatting commands are stripped but their arguments are kept."""
    result = clean_latex_content("This is \\textbf{bold} and \\emph{italic} text")
    for command in ("\\textbf", "\\emph"):
        assert command not in result
    for word in ("bold", "italic"):
        assert word in result
def test_clean_latex_removes_nested(tmp_path):
    """A command nested inside another command is fully unwrapped."""
    result = clean_latex_content("This is \\textbf{\\emph{nested}} text")
    for name in ("textbf", "emph"):
        assert name not in result
    assert "nested" in result
def test_clean_latex_removes_pdfbookmark(tmp_path):
    """A pdfbookmark command is removed; the following text is kept."""
    result = clean_latex_content("\\pdfbookmark[1]{Title}{label}This is content")
    assert "pdfbookmark" not in result
    assert "This is content" in result
def test_clean_latex_removes_optional_args(tmp_path):
    """A command with an optional argument leaves only its mandatory argument."""
    result = clean_latex_content("\\section[short]{Long Title}")
    assert "section" not in result
    assert "Long Title" in result
def test_clean_latex_multiple_spaces(tmp_path):
    """Runs of consecutive spaces are collapsed to a single space.

    The input must contain real runs of spaces, and the negative check
    must look for a double space: a single-space check would contradict
    the expected output, which itself contains single spaces.
    """
    content = "This  has   many    spaces"
    cleaned = clean_latex_content(content)
    assert "  " not in cleaned  # No double spaces
    assert cleaned == "This has many spaces"
def test_clean_latex_multiple_newlines(tmp_path):
    """Runs of blank lines are reduced so no triple newline remains."""
    result = clean_latex_content("Para 1\n\n\n\nPara 2")
    # At most two consecutive newlines may survive cleaning.
    assert "\n\n\n" not in result
# Tests for generate_ai2_prompt
def test_generate_prompt_has_header(tmp_path):
    """The generated prompt opens with the fixed header text."""
    output = generate_ai2_prompt("Title", "", "", "")
    assert output.startswith("# Literature Search Request")
    assert "We are preparing a manuscript" in output
def test_generate_prompt_includes_title(tmp_path):
    """The title heading and the title text both appear in the prompt."""
    output = generate_ai2_prompt("Test Manuscript Title", "", "", "")
    for fragment in ("## Title", "Test Manuscript Title"):
        assert fragment in output
def test_generate_prompt_includes_abstract(tmp_path):
    """The abstract heading and the abstract text both appear in the prompt."""
    output = generate_ai2_prompt(
        "", "", "", "This is the abstract text with important findings."
    )
    assert "## Abstract" in output
    assert "important findings" in output
def test_generate_prompt_selective_sections(tmp_path):
    """Only the sections named in ``sections`` make it into the prompt."""
    output = generate_ai2_prompt(
        "Title Text",
        "Keywords Text",
        "Authors Text",
        "Abstract Text",
        sections=["title", "abstract"],
    )
    for requested in ("## Title", "## Abstract"):
        assert requested in output
    for excluded in ("## Keywords", "## Authors"):
        assert excluded not in output
def test_generate_prompt_all_sections(tmp_path):
    """Requesting all four sections explicitly yields all four headings."""
    output = generate_ai2_prompt(
        "Title",
        "keyword1, keyword2",
        "John Smith, Jane Doe",
        "This is abstract",
        sections=["title", "keywords", "authors", "abstract"],
    )
    for heading in ("## Title", "## Keywords", "## Authors", "## Abstract"):
        assert heading in output
def test_generate_prompt_cleans_latex(tmp_path):
    """LaTeX commands in the inputs are stripped from the generated prompt."""
    output = generate_ai2_prompt(
        "\\textbf{Bold Title}", "", "", "Abstract with \\emph{italic} words"
    )
    for command in ("\\textbf", "\\emph"):
        assert command not in output
    for text in ("Bold Title", "italic"):
        assert text in output
def test_generate_prompt_empty_sections_omitted(tmp_path):
    """Fields passed as empty strings produce no heading in the prompt."""
    output = generate_ai2_prompt("Title", "", "", "")
    assert "## Title" in output
    for absent in ("## Keywords", "## Authors", "## Abstract"):
        assert absent not in output
def test_generate_prompt_structure(tmp_path):
    """The header comes first and the section headings appear in fixed order."""
    rows = generate_ai2_prompt(
        "Test Title", "test, keywords", "Author Name", "Test abstract"
    ).split("\n")

    # The very first line is the header title.
    assert rows[0] == "# Literature Search Request"

    # Every heading must sit on a line of its own; index() raises otherwise.
    positions = [
        rows.index(heading)
        for heading in ("## Title", "## Keywords", "## Authors", "## Abstract")
    ]

    # Strictly increasing positions mean the sections are in order.
    assert all(
        earlier < later for earlier, later in zip(positions, positions[1:])
    )
# Allow running this test module directly as a script.
if __name__ == "__main__":
    import pytest

    this_file = os.path.abspath(__file__)
    pytest.main([this_file, "-v"])