#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-11-12 12:57:38 (ywatanabe)"
"""
Generate AI2 Asta prompt from manuscript files for finding related papers
Functionalities:
- Extracts title, keywords, authors, and abstract from manuscript .tex files
- Generates formatted prompt for AI2 Asta
- Can save to file or print to stdout
- Supports both co-author paper search and general related paper search
- Flexible section inclusion/exclusion
- Optional bibliography statistics
- Multiple output formats
Dependencies:
- packages:
- pathlib
- re
- argparse
- yaml
IO:
- input-files:
- 00_shared/title.tex
- 00_shared/keywords.tex
- 00_shared/authors.tex
- 01_manuscript/contents/abstract.tex
- 00_shared/**/*.bib (optional, for stats)
- output-files:
- Prompt text to stdout or file
"""
import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Set
import yaml
from _logging import getLogger
# Boilerplate instructions prepended to every generated prompt; the
# manuscript-specific markdown sections are appended after this header.
HEADER = """# Literature Search Request
We are preparing a manuscript with the information provided below.
1. Please identify related papers that may be relevant to our work.
2. Comprehensive results are welcome, as we will evaluate all suggestions for relevance.
3. Your contribution to advancing scientific research is greatly appreciated.
4. If possible, please output as a BibTeX file (.bib).
"""
def load_config(config_path: Path = None) -> Dict[str, Any]:
    """Load the manuscript configuration from a YAML file.

    Args:
        config_path: Explicit path to the config file. When omitted, the
            file is resolved as ``../../config/config_manuscript.yaml``
            relative to this script's location.

    Returns:
        The parsed configuration as a dictionary.

    Raises:
        FileNotFoundError: If the resolved config file does not exist.
    """
    if config_path is None:
        # Fall back to the repo-standard location relative to this script.
        here = Path(__file__).resolve().parent
        config_path = here.parent.parent / "config" / "config_manuscript.yaml"
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")
    with open(config_path, "r") as fh:
        return yaml.safe_load(fh)
def read_tex_content(tex_path: Path) -> str:
    """Return a .tex file's text with full-line LaTeX comments removed.

    Args:
        tex_path: Location of the .tex file.

    Returns:
        The file contents minus any line whose first non-blank character
        is ``%``, stripped of surrounding whitespace. An empty string is
        returned when the file does not exist.
    """
    if not tex_path.exists():
        return ""
    raw = tex_path.read_text(encoding="utf-8")
    # Keep only lines that are not pure comment lines.
    kept = [ln for ln in raw.split("\n") if not ln.strip().startswith("%")]
    return "\n".join(kept).strip()
def clean_latex_content(content: str) -> str:
    """Clean LaTeX commands from content, keeping only the text.

    Args:
        content: Raw LaTeX content

    Returns:
        Cleaned text content
    """
    # Convert keyword separators (\sep) to commas FIRST: the generic
    # backslash-command stripping below would otherwise delete "\sep"
    # before this substitution could ever fire (it was dead code).
    content = re.sub(r"\s*\\sep\b\s*", ", ", content)
    # Remove PDF bookmarks first
    content = re.sub(r"\\pdfbookmark\[[^\]]*\]\{[^}]*\}\{[^}]*\}", "", content)
    # Remove corresponding-author references (\corref)
    content = re.sub(r"\\corref\{[^}]*\}", "", content)
    # Remove author numbering like \author[1]{Name}
    content = re.sub(r"\\author\[[^\]]*\]\{([^}]*)\}", r"\1", content)
    # Remove addresses
    content = re.sub(r"\\address\[[^\]]*\]\{[^}]*\}", "", content)
    # Remove cortext
    content = re.sub(r"\\cortext\[[^\]]*\]\{[^}]*\}", "", content)
    # Remove environment markers
    content = re.sub(r"\\begin\{[^}]+\}", "", content)
    content = re.sub(r"\\end\{[^}]+\}", "", content)
    # Remove standalone commands with optional arguments
    content = re.sub(r"\\[a-zA-Z]+\[[^\]]*\]", "", content)
    # Remove common LaTeX commands but keep their content (iteratively)
    # \command{content} -> content
    for _ in range(3):  # Multiple passes for nested commands
        content = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", content)
    # Remove any remaining backslash commands
    content = re.sub(r"\\[a-zA-Z]+", "", content)
    # Unescape special characters (non-letter escapes survive the pass above)
    content = re.sub(r"\\&", "&", content)
    content = re.sub(r"\\_", "_", content)
    content = re.sub(r"\{", "", content)
    content = re.sub(r"\}", "", content)
    # Clean up multiple spaces and newlines
    content = re.sub(r"\n\s*\n\s*\n+", "\n\n", content)
    content = re.sub(r" +", " ", content)
    content = re.sub(r"^\s+", "", content, flags=re.MULTILINE)
    return content.strip()
def extract_bib_keys(bib_path: Path) -> Set[str]:
    """Collect every citation key defined in a BibTeX file.

    Args:
        bib_path: Location of the .bib file.

    Returns:
        The set of entry keys found; empty when the file does not exist.
    """
    if not bib_path.exists():
        return set()
    text = bib_path.read_text(encoding="utf-8")
    # An entry looks like "@article{key," -- capture the token between
    # the opening brace and the first comma or whitespace.
    return set(re.findall(r"@\w+\s*\{\s*([^,\s]+)", text))
def get_bibliography_stats(bib_dir: Path) -> Dict[str, int]:
    """Summarize the bibliography directory.

    Args:
        bib_dir: Directory searched recursively for ``.bib`` files.

    Returns:
        Mapping with ``total_references`` (unique citation keys across all
        files) and ``total_files`` (number of ``.bib`` files found).
    """
    bib_files = list(bib_dir.rglob("*.bib"))
    unique_keys: Set[str] = set()
    for path in bib_files:
        unique_keys |= extract_bib_keys(path)
    return {
        "total_references": len(unique_keys),
        "total_files": len(bib_files),
    }
def add_title(parts: List[str], title: str) -> None:
    """Append a '## Title' markdown section when a title is available.

    Args:
        parts: Prompt fragments accumulated so far (mutated in place).
        title: Raw LaTeX title; cleaned before use. No-op if empty.
    """
    cleaned = clean_latex_content(title) if title else ""
    if not cleaned:
        return
    parts.append(f"## Title\n{cleaned}")
    parts.append("")
def add_keywords(parts: List[str], keywords: str) -> None:
    """Append a '## Keywords' markdown section when keywords are available.

    Args:
        parts: Prompt fragments accumulated so far (mutated in place).
        keywords: Raw LaTeX keywords; cleaned before use. No-op if empty.
    """
    cleaned = clean_latex_content(keywords) if keywords else ""
    if not cleaned:
        return
    parts.append(f"## Keywords\n{cleaned}")
    parts.append("")
def add_authors(parts: List[str], authors: str) -> None:
    """Append an '## Authors' markdown section when authors are available.

    Args:
        parts: Prompt fragments accumulated so far (mutated in place).
        authors: Raw LaTeX author list; cleaned before use. No-op if empty.
    """
    cleaned = clean_latex_content(authors) if authors else ""
    if not cleaned:
        return
    parts.append(f"## Authors\n{cleaned}")
    parts.append("")
def add_abstract(parts: List[str], abstract: str) -> None:
    """Append an '## Abstract' markdown section when an abstract exists.

    Unlike the other section helpers, no trailing blank line is added
    (the abstract is the last metadata section in the prompt).

    Args:
        parts: Prompt fragments accumulated so far (mutated in place).
        abstract: Raw LaTeX abstract; cleaned before use. No-op if empty.
    """
    cleaned = clean_latex_content(abstract) if abstract else ""
    if not cleaned:
        return
    parts.append(f"## Abstract\n{cleaned}")
def add_citation_info(
    parts: List[str],
    citation_data: Dict[str, Any],
    bib_filters: Dict[str, bool],
) -> None:
    """Append citation-state sections (cited/uncited/missing) to the prompt.

    Args:
        parts: Prompt fragments accumulated so far (mutated in place).
        citation_data: Citation data from check_cited_states.py; no-op
            when falsy.
        bib_filters: Flags ("cited", "uncited", "missing") selecting which
            reference lists to emit; no-op when none are enabled.
    """
    if not citation_data or not any(bib_filters.values()):
        return
    # Separate the citation block from the metadata sections above it.
    parts.extend(["", "---", ""])
    details = citation_data.get("details", {})
    # (filter flag, details key, heading template, optional note)
    section_specs = (
        ("cited", "successfully_cited",
         "### Already Cited References ({n})", None),
        ("uncited", "uncited_references",
         "### Uncited References in Our Bibliography ({n})",
         "*These might be relevant to cite*"),
        ("missing", "missing_references",
         "### Missing References ({n})",
         "*Cited but not in our bibliography - need to find*"),
    )
    for flag, detail_key, heading, note in section_specs:
        if not bib_filters.get(flag, False):
            continue
        refs = details.get(detail_key, [])
        if not refs:
            continue
        parts.append(heading.format(n=len(refs)))
        if note is not None:
            parts.append(note)
        parts.extend(f"- `{ref}`" for ref in refs)
        parts.append("")
def generate_ai2_prompt(
    title: str,
    keywords: str,
    authors: str,
    abstract: str,
    sections: List[str] = None,
    citation_data: Dict[str, Any] = None,
    bib_filters: Dict[str, bool] = None,
) -> str:
    """Assemble the full AI2 Asta prompt in markdown format.

    Args:
        title: Paper title (raw LaTeX).
        keywords: Keywords (raw LaTeX).
        authors: Author names (raw LaTeX).
        abstract: Abstract text (raw LaTeX).
        sections: Section names to include; defaults to all four.
        citation_data: Optional citation data from check_cited_states.py.
        bib_filters: Flags selecting which citation lists to include.

    Returns:
        The formatted markdown prompt.
    """
    if sections is None:
        sections = ["title", "keywords", "authors", "abstract"]
    if bib_filters is None:
        bib_filters = {}
    parts: List[str] = [HEADER, ""]
    # Dispatch table keeps the section order fixed regardless of the
    # order names appear in `sections`.
    section_builders = (
        ("title", add_title, title),
        ("keywords", add_keywords, keywords),
        ("authors", add_authors, authors),
        ("abstract", add_abstract, abstract),
    )
    for name, builder, content in section_builders:
        if name in sections:
            builder(parts, content)
    add_citation_info(parts, citation_data, bib_filters)
    return "\n".join(parts).strip()
def main():
    """CLI entry point: build the AI2 Asta literature-search prompt.

    Reads title/keywords/authors/abstract from the manuscript tree,
    optionally merges citation-state JSON, then writes the prompt either
    to --output or to stdout plus default files under 00_shared/.
    """
    parser = argparse.ArgumentParser(
        description="Generate AI2 Asta prompt from manuscript files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate prompt for related papers (default: all sections, markdown format)
%(prog)s
# Only title and abstract
%(prog)s --title --abstract
# Everything except authors (for blind review)
%(prog)s --title --keywords --abstract
# Include uncited refs to get suggestions for citing them (recommended!)
%(prog)s --bib-uncited
# Include all citation info
%(prog)s --bib-cited --bib-uncited --bib-missing
# Save to custom file
%(prog)s --output ai2_prompt.txt
# Verbose output
%(prog)s --verbose
""",
    )
    # Section selection (if none specified, include all)
    parser.add_argument(
        "--title",
        action="store_true",
        help="Include title section",
    )
    parser.add_argument(
        "--keywords",
        action="store_true",
        help="Include keywords section",
    )
    parser.add_argument(
        "--authors",
        action="store_true",
        help="Include authors section",
    )
    parser.add_argument(
        "--abstract",
        action="store_true",
        help="Include abstract section",
    )
    # Bibliography options
    parser.add_argument(
        "--bib-cited",
        action="store_true",
        help="Include list of successfully cited references in the prompt",
    )
    parser.add_argument(
        "--bib-uncited",
        action="store_true",
        help="Include list of uncited references in the prompt",
    )
    parser.add_argument(
        "--bib-missing",
        action="store_true",
        help="Include list of missing references in the prompt",
    )
    # Other options
    parser.add_argument(
        "--config",
        type=Path,
        help="Path to config YAML file (default: auto-detect from script location)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Output file path (default: print to stdout)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()
    # Setup logger
    # NOTE(review): getLogger is the project-local _logging variant taking a
    # verbosity flag and exposing .success() -- confirm against _logging.
    logger = getLogger(__name__, args.verbose)
    # Determine which sections to include
    # If any section flag is specified, include only those
    # Otherwise, include all sections
    section_flags = {
        "title": args.title,
        "keywords": args.keywords,
        "authors": args.authors,
        "abstract": args.abstract,
    }
    if any(section_flags.values()):
        sections = [name for name, enabled in section_flags.items() if enabled]
    else:
        sections = ["title", "keywords", "authors", "abstract"]
    # Load configuration
    # NOTE(review): the parsed config is discarded -- this call effectively
    # only validates that the config file exists (raises FileNotFoundError
    # otherwise). Confirm whether the config values should be used.
    load_config(args.config)
    # Get project root (where the config file is located)
    if args.config:
        project_root = args.config.parent.parent
    else:
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent
    # Extract information from manuscript files
    # Use paths relative to project root
    title_path = project_root / "00_shared" / "title.tex"
    keywords_path = project_root / "00_shared" / "keywords.tex"
    authors_path = project_root / "00_shared" / "authors.tex"
    abstract_path = project_root / "01_manuscript" / "contents" / "abstract.tex"
    logger.info("Reading manuscript files...")
    title = read_tex_content(title_path)
    keywords = read_tex_content(keywords_path)
    authors = read_tex_content(authors_path)
    abstract = read_tex_content(abstract_path)
    # Validate required content (warn only; generation continues regardless)
    if not title and "title" in sections:
        logger.warning("No title found in 00_shared/title.tex")
    if not abstract and "abstract" in sections:
        logger.warning("No abstract found in 01_manuscript/contents/abstract.tex")
    # Load citation data if any bib options are specified
    citation_data = None
    if args.bib_cited or args.bib_uncited or args.bib_missing:
        citation_json_path = (
            project_root / "00_shared" / "bib_files" / "cited_states.json"
        )
        if citation_json_path.exists():
            try:
                citation_data = json.loads(
                    citation_json_path.read_text(encoding="utf-8")
                )
                logger.info(f"Loaded citation data from {citation_json_path.name}")
            except Exception as e:
                # Best-effort: a malformed JSON file degrades to "no citation
                # data" rather than aborting prompt generation.
                logger.warning(f"Could not load citation data: {e}")
        else:
            logger.warning("Citation data not found. Run check_cited_states.py first.")
            logger.warning(f"Expected: {citation_json_path}")
    # Determine which citation filters are active
    bib_filters = {
        "cited": args.bib_cited,
        "uncited": args.bib_uncited,
        "missing": args.bib_missing,
    }
    # Generate prompt in markdown format
    prompt = generate_ai2_prompt(
        title,
        keywords,
        authors,
        abstract,
        sections,
        citation_data,
        bib_filters,
    )
    # Create structured data for JSON output
    data = {
        "metadata": {
            "title": title,
            "keywords": keywords,
            "authors": authors,
            "abstract": abstract,
        },
        "prompt": prompt,
        "sections_included": sections,
        "citation_filters": bib_filters if any(bib_filters.values()) else None,
        "citation_data": (citation_data.get("summary") if citation_data else None),
    }
    # Always save to default locations unless custom output specified
    default_json_path = project_root / "00_shared" / "ai2_prompt_data.json"
    default_md_path = project_root / "00_shared" / "ai2_prompt.md"
    default_json_path.parent.mkdir(parents=True, exist_ok=True)
    # Save JSON and markdown to default locations (always)
    if not args.output:
        # Save JSON data
        json_str = json.dumps(data, indent=2, ensure_ascii=False)
        default_json_path.write_text(json_str, encoding="utf-8")
        # Save markdown prompt
        default_md_path.write_text(prompt, encoding="utf-8")
    # Output
    if args.output:
        args.output.write_text(prompt, encoding="utf-8")
        logger.success(f"Saved to: {args.output}")
        logger.info("\nNext steps:")
        logger.info("1. Visit https://asta.allen.ai/chat/")
        logger.info(f"2. Paste the prompt from {args.output}")
        logger.info("3. Click 'Export All Citations' to download BibTeX file")
    else:
        print("\n" + "=" * 80)
        print("AI2 ASTA PROMPT")
        print("=" * 80)
        print()
        print(prompt)
        print()
        print("=" * 80)
        logger.info("\nNext steps:")
        logger.info("1. Visit https://asta.allen.ai/chat/")
        logger.info("2. Copy and paste the prompt above")
        logger.info("3. Click 'Export All Citations' to download BibTeX file")
        logger.success(f"Saved to: {default_md_path}")
        logger.success(f"Saved to: {default_json_path}")
# Script entry point (module import has no side effects beyond definitions).
if __name__ == "__main__":
    main()

# EOF