#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-11-12 12:57:38 (ywatanabe)"

"""
Generate AI2 Asta prompt from manuscript files for finding related papers

Functionalities:
  - Extracts title, keywords, authors, and abstract from manuscript .tex files
  - Generates formatted prompt for AI2 Asta
  - Can save to file or print to stdout
  - Supports both co-author paper search and general related paper search
  - Flexible section inclusion/exclusion
  - Optional bibliography statistics
  - Multiple output formats

Dependencies:
  - packages:
    - pathlib
    - re
    - argparse
    - yaml

IO:
  - input-files:
    - 00_shared/title.tex
    - 00_shared/keywords.tex
    - 00_shared/authors.tex
    - 01_manuscript/contents/abstract.tex
    - 00_shared/**/*.bib (optional, for stats)
    - 00_shared/bib_files/cited_states.json (optional, for --bib-* flags)

  - output-files:
    - Prompt text to stdout, plus 00_shared/ai2_prompt.md and
      00_shared/ai2_prompt_data.json (default), or the file given by --output
"""

import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

# NOTE: PyYAML (used by load_config) and the project-local ``_logging`` helper
# (used by main) are imported inside those functions, so the pure
# text-processing helpers below can be imported without either being installed.

HEADER = """# Literature Search Request

We are preparing a manuscript with the information provided below.

1. Please identify related papers that may be relevant to our work.
2. Comprehensive results are welcome, as we will evaluate all suggestions for relevance.
3. Your contribution to advancing scientific research is greatly appreciated.
4. If possible, please output as a BibTeX file (.bib).
"""

# Sections emitted when the caller does not restrict the selection.
DEFAULT_SECTIONS = ("title", "keywords", "authors", "abstract")

# "@type{key" at the start of a BibTeX block; group 1 = type, group 2 = key.
_BIB_ENTRY_RE = re.compile(r"@(\w+)\s*\{\s*([^,\s]+)")

# BibTeX block types that do not define a citable reference.
_NON_REFERENCE_BIB_TYPES = frozenset({"string", "comment", "preamble"})

# (filter name, key under citation_data["details"], heading, optional note)
_CITATION_SECTIONS = (
    ("cited", "successfully_cited", "Already Cited References", None),
    (
        "uncited",
        "uncited_references",
        "Uncited References in Our Bibliography",
        "*These might be relevant to cite*",
    ),
    (
        "missing",
        "missing_references",
        "Missing References",
        "*Cited but not in our bibliography - need to find*",
    ),
)


def load_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Load configuration from YAML file.

    Args:
        config_path: Path to config file. If None, uses
            ``<script_dir>/../../config/config_manuscript.yaml``.

    Returns:
        Configuration dictionary (empty dict if the YAML file is empty)

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    if config_path is None:
        # Default to ../../config/config_manuscript.yaml relative to script location
        script_dir = Path(__file__).resolve().parent
        config_path = script_dir.parent.parent / "config" / "config_manuscript.yaml"

    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path}")

    import yaml  # imported lazily; see NOTE next to the module imports

    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty document; honour the Dict contract.
    return {} if config is None else config


def read_tex_content(tex_path: Path) -> str:
    """Read raw content from .tex file, removing comments.

    Only whole-line comments (first non-blank character is ``%``) are dropped;
    trailing ``% ...`` comments on a content line are kept as-is.

    Args:
        tex_path: Path to .tex file

    Returns:
        Raw tex content without comment lines (empty string if file doesn't exist)
    """
    if not tex_path.exists():
        return ""

    content = tex_path.read_text(encoding="utf-8")
    kept = [
        line for line in content.split("\n") if not line.strip().startswith("%")
    ]
    return "\n".join(kept).strip()


def clean_latex_content(content: str) -> str:
    """Clean LaTeX commands from content, keeping only the text.

    The substitutions are order-sensitive: specific commands are handled
    first, then ``\\command{content}`` is unwrapped, and only then are the
    remaining bare ``\\command`` tokens dropped.

    Args:
        content: Raw LaTeX content

    Returns:
        Cleaned text content
    """
    # Remove PDF bookmarks first
    content = re.sub(r"\\pdfbookmark\[[^\]]*\]\{[^}]*\}\{[^}]*\}", "", content)

    # Remove corresponding-author references
    content = re.sub(r"\\corref\{[^}]*\}", "", content)

    # Remove author numbering like \author[1]{Name}
    content = re.sub(r"\\author\[[^\]]*\]\{([^}]*)\}", r"\1", content)

    # Remove addresses
    content = re.sub(r"\\address\[[^\]]*\]\{[^}]*\}", "", content)

    # Remove cortext
    content = re.sub(r"\\cortext\[[^\]]*\]\{[^}]*\}", "", content)

    # Remove environment markers
    content = re.sub(r"\\begin\{[^}]+\}", "", content)
    content = re.sub(r"\\end\{[^}]+\}", "", content)

    # Turn keyword separators into commas. This must run BEFORE the generic
    # "\command" removal below, otherwise \sep is stripped and the keywords
    # run together with no separator.
    content = re.sub(r"\s*\\sep(?![a-zA-Z])\s*", ", ", content)

    # Remove standalone commands with optional arguments
    content = re.sub(r"\\[a-zA-Z]+\[[^\]]*\]", "", content)

    # Remove common LaTeX commands but keep their content (iteratively)
    # \command{content} -> content
    for _ in range(3):  # Multiple passes for nested commands
        content = re.sub(r"\\[a-zA-Z]+\{([^{}]*)\}", r"\1", content)

    # Remove any remaining backslash commands
    content = re.sub(r"\\[a-zA-Z]+", "", content)

    # Unescape special characters (\& -> &, \_ -> _, \% -> %, ...) and drop braces
    content = re.sub(r"\\([&_%$#])", r"\1", content)
    content = re.sub(r"[{}]", "", content)

    # Clean up multiple spaces and newlines
    content = re.sub(r"\n\s*\n\s*\n+", "\n\n", content)
    content = re.sub(r" +", " ", content)
    # NOTE: \s also matches newlines, so this strips blank lines as well as
    # leading indentation.
    content = re.sub(r"^\s+", "", content, flags=re.MULTILINE)

    return content.strip()


def extract_bib_keys(bib_path: Path) -> Set[str]:
    """Extract all citation keys from a .bib file.

    ``@string``, ``@comment`` and ``@preamble`` blocks are skipped because
    they do not define references.

    Args:
        bib_path: Path to .bib file

    Returns:
        Set of citation keys found in the file (empty if file doesn't exist)
    """
    if not bib_path.exists():
        return set()

    content = bib_path.read_text(encoding="utf-8")

    # Match @article{key, @book{key, etc.
    return {
        key
        for entry_type, key in _BIB_ENTRY_RE.findall(content)
        if entry_type.lower() not in _NON_REFERENCE_BIB_TYPES
    }


def get_bibliography_stats(bib_dir: Path) -> Dict[str, int]:
    """Get statistics about bibliography files.

    Args:
        bib_dir: Path to bibliography directory (searched recursively)

    Returns:
        Dictionary with ``total_references`` (unique keys across all files)
        and ``total_files`` (number of .bib files found)
    """
    bib_files = list(bib_dir.rglob("*.bib"))

    all_keys: Set[str] = set()
    for bib_file in bib_files:
        all_keys |= extract_bib_keys(bib_file)

    return {
        "total_references": len(all_keys),
        "total_files": len(bib_files),
    }


def _add_section(
    parts: List[str], heading: str, raw: str, trailing_blank: bool = True
) -> None:
    """Append ``## heading`` plus the cleaned text to parts if non-empty."""
    cleaned = clean_latex_content(raw) if raw else ""
    if not cleaned:
        return
    parts.append(f"## {heading}\n{cleaned}")
    if trailing_blank:
        parts.append("")


def add_title(parts: List[str], title: str) -> None:
    """Add title section to prompt parts.

    Args:
        parts: List to append prompt parts to
        title: Title content (LaTeX will be cleaned)
    """
    _add_section(parts, "Title", title)


def add_keywords(parts: List[str], keywords: str) -> None:
    """Add keywords section to prompt parts.

    Args:
        parts: List to append prompt parts to
        keywords: Keywords content (LaTeX will be cleaned)
    """
    _add_section(parts, "Keywords", keywords)


def add_authors(parts: List[str], authors: str) -> None:
    """Add authors section to prompt parts.

    Args:
        parts: List to append prompt parts to
        authors: Authors content (LaTeX will be cleaned)
    """
    _add_section(parts, "Authors", authors)


def add_abstract(parts: List[str], abstract: str) -> None:
    """Add abstract section to prompt parts.

    Unlike the other sections, no trailing blank line is appended.

    Args:
        parts: List to append prompt parts to
        abstract: Abstract content (LaTeX will be cleaned)
    """
    _add_section(parts, "Abstract", abstract, trailing_blank=False)


def add_citation_info(
    parts: List[str],
    citation_data: Optional[Dict[str, Any]],
    bib_filters: Dict[str, bool],
) -> None:
    """Add citation information sections to prompt parts.

    Nothing is appended when there is no citation data or no filter is on.

    Args:
        parts: List to append prompt parts to
        citation_data: Citation data from check_cited_states.py; the
            reference lists are read from its ``"details"`` mapping
        bib_filters: Dictionary indicating which citation lists to include
            (keys ``"cited"``, ``"uncited"``, ``"missing"``)
    """
    if not citation_data or not any(bib_filters.values()):
        return

    parts.extend(["", "---", ""])

    # Tolerate an explicit null for "details".
    details = citation_data.get("details") or {}

    for filter_name, details_key, heading, note in _CITATION_SECTIONS:
        if not bib_filters.get(filter_name, False):
            continue
        refs = details.get(details_key, [])
        if not refs:
            continue
        parts.append(f"### {heading} ({len(refs)})")
        if note is not None:
            parts.append(note)
        parts.extend(f"- `{ref}`" for ref in refs)
        parts.append("")


def generate_ai2_prompt(
    title: str,
    keywords: str,
    authors: str,
    abstract: str,
    sections: Optional[List[str]] = None,
    citation_data: Optional[Dict[str, Any]] = None,
    bib_filters: Optional[Dict[str, bool]] = None,
) -> str:
    """Generate AI2 Asta prompt in markdown format.

    Args:
        title: Paper title
        keywords: Keywords
        authors: Author names
        abstract: Abstract text
        sections: List of sections to include (default: all)
        citation_data: Citation data from check_cited_states.py (optional)
        bib_filters: Dictionary indicating which citation lists to include

    Returns:
        Formatted prompt for AI2 Asta in markdown
    """
    if sections is None:
        sections = list(DEFAULT_SECTIONS)
    if bib_filters is None:
        bib_filters = {}

    # Build the prompt header
    parts = [HEADER, ""]

    # Add requested sections, always in this fixed order
    if "title" in sections:
        add_title(parts, title)
    if "keywords" in sections:
        add_keywords(parts, keywords)
    if "authors" in sections:
        add_authors(parts, authors)
    if "abstract" in sections:
        add_abstract(parts, abstract)

    # Add citation information
    add_citation_info(parts, citation_data, bib_filters)

    return "\n".join(parts).strip()


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Generate AI2 Asta prompt from manuscript files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate prompt for related papers (default: all sections, markdown format)
  %(prog)s

  # Only title and abstract
  %(prog)s --title --abstract

  # Everything except authors (for blind review)
  %(prog)s --title --keywords --abstract

  # Include uncited refs to get suggestions for citing them (recommended!)
  %(prog)s --bib-uncited

  # Include all citation info
  %(prog)s --bib-cited --bib-uncited --bib-missing

  # Save to custom file
  %(prog)s --output ai2_prompt.txt

  # Verbose output
  %(prog)s --verbose
        """,
    )

    # Section selection (if none specified, include all)
    parser.add_argument(
        "--title",
        action="store_true",
        help="Include title section",
    )
    parser.add_argument(
        "--keywords",
        action="store_true",
        help="Include keywords section",
    )
    parser.add_argument(
        "--authors",
        action="store_true",
        help="Include authors section",
    )
    parser.add_argument(
        "--abstract",
        action="store_true",
        help="Include abstract section",
    )

    # Bibliography options
    parser.add_argument(
        "--bib-cited",
        action="store_true",
        help="Include list of successfully cited references in the prompt",
    )
    parser.add_argument(
        "--bib-uncited",
        action="store_true",
        help="Include list of uncited references in the prompt",
    )
    parser.add_argument(
        "--bib-missing",
        action="store_true",
        help="Include list of missing references in the prompt",
    )

    # Other options
    parser.add_argument(
        "--config",
        type=Path,
        help="Path to config YAML file (default: auto-detect from script location)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Output file path (default: print to stdout)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def _load_citation_data(citation_json_path: Path, logger) -> Optional[Dict[str, Any]]:
    """Load the citation-state JSON, warning and returning None on failure."""
    if not citation_json_path.exists():
        logger.warning("Citation data not found. Run check_cited_states.py first.")
        logger.warning(f"Expected: {citation_json_path}")
        return None

    try:
        loaded = json.loads(citation_json_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Could not load citation data: {e}")
        return None

    if not isinstance(loaded, dict):
        # Downstream code calls .get() on this value.
        logger.warning(
            "Could not load citation data: top-level JSON value is not an object"
        )
        return None

    logger.info(f"Loaded citation data from {citation_json_path.name}")
    return loaded


def main():
    """CLI entry point: read manuscript files, build the prompt, save/print it."""
    from _logging import getLogger  # project-local; see NOTE next to the imports

    args = _build_parser().parse_args()

    # Setup logger
    logger = getLogger(__name__, args.verbose)

    # Determine which sections to include
    # If any section flag is specified, include only those
    # Otherwise, include all sections
    section_flags = {
        "title": args.title,
        "keywords": args.keywords,
        "authors": args.authors,
        "abstract": args.abstract,
    }
    sections = [name for name, enabled in section_flags.items() if enabled]
    if not sections:
        sections = list(DEFAULT_SECTIONS)

    # Load configuration. The result is not used here; the call fails fast
    # (FileNotFoundError) when the config file is missing.
    load_config(args.config)

    # Get project root (two levels above the config file / script directory)
    if args.config:
        project_root = args.config.parent.parent
    else:
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent

    # Extract information from manuscript files
    # Use paths relative to project root
    shared_dir = project_root / "00_shared"
    title_path = shared_dir / "title.tex"
    keywords_path = shared_dir / "keywords.tex"
    authors_path = shared_dir / "authors.tex"
    abstract_path = project_root / "01_manuscript" / "contents" / "abstract.tex"

    logger.info("Reading manuscript files...")

    title = read_tex_content(title_path)
    keywords = read_tex_content(keywords_path)
    authors = read_tex_content(authors_path)
    abstract = read_tex_content(abstract_path)

    # Validate required content
    if not title and "title" in sections:
        logger.warning("No title found in 00_shared/title.tex")
    if not abstract and "abstract" in sections:
        logger.warning("No abstract found in 01_manuscript/contents/abstract.tex")

    # Determine which citation filters are active
    bib_filters = {
        "cited": args.bib_cited,
        "uncited": args.bib_uncited,
        "missing": args.bib_missing,
    }

    # Load citation data if any bib options are specified
    citation_data = None
    if any(bib_filters.values()):
        citation_data = _load_citation_data(
            shared_dir / "bib_files" / "cited_states.json", logger
        )

    # Generate prompt in markdown format
    prompt = generate_ai2_prompt(
        title,
        keywords,
        authors,
        abstract,
        sections,
        citation_data,
        bib_filters,
    )

    if args.output:
        # Custom destination: write only the prompt there.
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(prompt, encoding="utf-8")
        logger.success(f"Saved to: {args.output}")
        logger.info("\nNext steps:")
        logger.info("1. Visit https://asta.allen.ai/chat/")
        logger.info(f"2. Paste the prompt from {args.output}")
        logger.info("3. Click 'Export All Citations' to download BibTeX file")
        return

    # Create structured data for JSON output
    data = {
        "metadata": {
            "title": title,
            "keywords": keywords,
            "authors": authors,
            "abstract": abstract,
        },
        "prompt": prompt,
        "sections_included": sections,
        "citation_filters": bib_filters if any(bib_filters.values()) else None,
        "citation_data": (citation_data.get("summary") if citation_data else None),
    }

    # No custom output: save JSON and markdown to the default locations
    default_json_path = shared_dir / "ai2_prompt_data.json"
    default_md_path = shared_dir / "ai2_prompt.md"
    shared_dir.mkdir(parents=True, exist_ok=True)

    json_str = json.dumps(data, indent=2, ensure_ascii=False)
    default_json_path.write_text(json_str, encoding="utf-8")
    default_md_path.write_text(prompt, encoding="utf-8")

    print("\n" + "=" * 80)
    print("AI2 ASTA PROMPT")
    print("=" * 80)
    print()
    print(prompt)
    print()
    print("=" * 80)
    logger.info("\nNext steps:")
    logger.info("1. Visit https://asta.allen.ai/chat/")
    logger.info("2. Copy and paste the prompt above")
    logger.info("3. Click 'Export All Citations' to download BibTeX file")
    logger.success(f"Saved to: {default_md_path}")
    logger.success(f"Saved to: {default_json_path}")


if __name__ == "__main__":
    main()

# EOF