#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: 2026-01-27
# File: src/scitex_writer/prompts/_ai2.py
"""AI2 Asta (Semantic Scholar) prompt generation from manuscript files.

Uses tag-based templates that can be customized via environment variables:
    SCITEX_WRITER_PROMPT_ASTA_RELATED=/path/to/custom_related.md
    SCITEX_WRITER_PROMPT_ASTA_COAUTHORS=/path/to/custom_coauthors.md
    SCITEX_WRITER_PROMPT_DIR=/path/to/dir  (looked up as asta_<search_type>.md)

Available tags in templates:
    {title}    - Manuscript title
    {abstract} - Manuscript abstract
    {keywords} - Keywords (comma-separated)
    {authors}  - Author names (comma-separated)
"""
import os
import re
from pathlib import Path
from typing import TypedDict
# Default data directory: the "data" folder next to this module, used as the
# last-resort location for asta_<search_type>.md templates.
_DEFAULT_DATA_DIR = Path(__file__).parent / "data"
# Template tags: names of the {tag} placeholders listed in the module docstring.
TEMPLATE_TAGS = ["title", "abstract", "keywords", "authors"]
class AI2PromptResult(TypedDict):
    """Result from AI2 prompt generation.

    The same shape is used for successful and failed generations; on failure
    ``prompt`` is empty and ``error`` carries the reason.
    """

    # True when a prompt was generated, False on any failure.
    success: bool
    # The generated prompt text ("" on failure).
    prompt: str
    # The search type that was requested (echoed back to the caller).
    search_type: str
    # Human-readable follow-up actions; may be empty on failure.
    next_steps: list[str]
    # Error message on failure, None on success.
    error: str | None
def _get_template_path(search_type: str) -> Path:
    """Resolve which Asta template file applies to *search_type*.

    Lookup order:
      1. ``SCITEX_WRITER_PROMPT_ASTA_<SEARCH_TYPE>``: returned as-is when set
         to a non-empty value (existence is not checked here).
      2. ``SCITEX_WRITER_PROMPT_DIR``: ``asta_<search_type>.md`` inside that
         directory, but only when that file exists.
      3. ``asta_<search_type>.md`` inside the bundled default data directory.

    Args:
        search_type: Search type used to build the env key and file name.

    Returns:
        Path of the template to load (not guaranteed to exist).
    """
    file_name = f"asta_{search_type}.md"

    # A per-search-type override wins over everything else.
    explicit_file = os.environ.get(f"SCITEX_WRITER_PROMPT_ASTA_{search_type.upper()}")
    if explicit_file:
        return Path(explicit_file)

    # A shared prompt directory is honoured only if it contains the file.
    override_dir = os.environ.get("SCITEX_WRITER_PROMPT_DIR")
    if override_dir:
        candidate = Path(override_dir) / file_name
        if candidate.exists():
            return candidate

    return _DEFAULT_DATA_DIR / file_name
def _read_tex_content(tex_path: Path) -> str:
    r"""Read raw content from a .tex file, removing LaTeX comments.

    A ``%`` starts a comment unless it is escaped, i.e. preceded by an odd
    number of backslashes (``\%``). An even number of backslashes (for
    example the line break ``\\`` directly followed by ``%``) does not escape
    the percent sign, so the comment is removed and the backslashes are kept.

    Args:
        tex_path: Path to the .tex file.

    Returns:
        File content with comments removed and surrounding whitespace
        stripped, or "" when the file does not exist.
    """
    if not tex_path.exists():
        return ""
    content = tex_path.read_text(encoding="utf-8")

    # Group 1 captures any run of escaped backslashes (pairs) that directly
    # precedes the comment marker; the lookbehind guarantees the run is not
    # itself preceded by a backslash, so the total count before % is even.
    comment = re.compile(r"(?<!\\)((?:\\\\)*)%.*$")

    # Comments never span lines, so strip them line by line and keep the
    # (possibly now empty) lines to preserve the original line structure.
    stripped = [comment.sub(r"\1", line) for line in content.split("\n")]
    return "\n".join(stripped).strip()
def _clean_latex(text: str) -> str:
    """Strip a fixed set of common LaTeX commands to get plain text.

    Args:
        text: Text possibly containing LaTeX commands.

    Returns:
        The text with the handled commands removed or unwrapped, runs of
        whitespace collapsed to single spaces, and both ends trimmed.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # Drop \pdfbookmark[..]{..}{..} entirely.
        (r"\\pdfbookmark\[[^\]]*\]\{[^}]*\}\{[^}]*\}", ""),
        # Drop environment delimiters but keep what is between them.
        (r"\\begin\{[^}]*\}", ""),
        (r"\\end\{[^}]*\}", ""),
        # \sep is a list separator; render it as a comma.
        (r"\\sep", ","),
        # Unwrap simple formatting commands, keeping their argument.
        (r"\\textbf\{([^}]*)\}", r"\1"),
        (r"\\textit\{([^}]*)\}", r"\1"),
        (r"\\emph\{([^}]*)\}", r"\1"),
        # Drop \label{..} entirely.
        (r"\\label\{[^}]*\}", ""),
        # Collapse whitespace last, after removals have left gaps.
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def _extract_title(shared_path: Path) -> str:
    r"""Extract the manuscript title from ``title.tex`` in *shared_path*.

    If the file contains a ``\title{...}`` command, its argument is returned
    (whitespace-trimmed); nested braces up to two levels deep, such as
    ``\title{A \textit{b} c}``, are kept intact. Otherwise the comment-free
    file content is returned unchanged.

    Args:
        shared_path: Directory expected to contain ``title.tex``.

    Returns:
        The title text, or "" when ``title.tex`` does not exist.
    """
    title_file = shared_path / "title.tex"
    if not title_file.exists():
        return ""
    content = _read_tex_content(title_file)

    # A brace group that may itself contain one more level of brace groups.
    nested = r"\{(?:[^{}]|\{[^{}]*\})*\}"
    # Prefer a brace-balanced argument so the title is not cut at the first
    # closing brace of an inner command.
    balanced = re.search(r"\\title\{((?:[^{}]|" + nested + r")+)\}", content)
    if balanced:
        return balanced.group(1).strip()

    # Fallback for unbalanced or deeper nesting: shortest span up to the
    # first closing brace.
    match = re.search(r"\\title\{(.+?)\}", content, re.DOTALL)
    if match:
        return match.group(1).strip()
    return content
def _extract_abstract(contents_path: Path) -> str:
    r"""Extract the abstract from ``abstract.tex`` in *contents_path*.

    The text between ``\begin{abstract}`` and ``\end{abstract}`` is used when
    that wrapper is present; otherwise the whole comment-free file content is
    used. The result is passed through ``_clean_latex``.

    Args:
        contents_path: Directory expected to contain ``abstract.tex``.

    Returns:
        Plain-text abstract, or "" when ``abstract.tex`` does not exist.
    """
    abstract_file = contents_path / "abstract.tex"
    if not abstract_file.exists():
        return ""

    raw = _read_tex_content(abstract_file)
    wrapped = re.search(
        r"\\begin\{abstract\}(.+?)\\end\{abstract\}",
        raw,
        re.DOTALL,
    )
    # Unwrap the environment when found; otherwise clean the raw content.
    body = wrapped.group(1).strip() if wrapped else raw
    return _clean_latex(body)
def _extract_keywords(shared_path: Path) -> list[str]:
    r"""Extract keywords from ``keywords.tex`` in *shared_path*.

    The body of a ``\begin{keyword}...\end{keyword}`` environment is used
    when present, otherwise the whole comment-free file content. ``\sep`` is
    treated as a comma and the text is then split on commas.

    Args:
        shared_path: Directory expected to contain ``keywords.tex``.

    Returns:
        Non-empty, whitespace-trimmed keywords in file order; [] when
        ``keywords.tex`` does not exist.
    """
    keywords_file = shared_path / "keywords.tex"
    if not keywords_file.exists():
        return []

    raw = _read_tex_content(keywords_file)
    wrapped = re.search(
        r"\\begin\{keyword\}(.+?)\\end\{keyword\}",
        raw,
        re.DOTALL,
    )
    body = wrapped.group(1) if wrapped else raw

    # Normalise the \sep separator to a comma, then split and trim.
    pieces = (piece.strip() for piece in re.sub(r"\\sep", ",", body).split(","))
    return [piece for piece in pieces if piece]
def _extract_authors(shared_path: Path) -> list[str]:
    r"""Extract author names from ``authors.tex`` in *shared_path*.

    Strategy, in order:
      1. Collect the arguments of ``\author{...}`` commands, also accepting
         an optional bracket argument (``\author[label]{Name}``) and nested
         braces up to two levels deep (``\author{Name\corref{c1}}``).
      2. If none match, retry with the simpler "shortest span up to the
         first closing brace" pattern.
      3. If there is still no ``\author`` command, split the whole content on
         ``,``, ``;``, ``&`` and ``\and``.

    Args:
        shared_path: Directory expected to contain ``authors.tex``.

    Returns:
        Whitespace-trimmed author entries in file order; [] when
        ``authors.tex`` does not exist.
    """
    authors_file = shared_path / "authors.tex"
    if not authors_file.exists():
        return []
    content = _read_tex_content(authors_file)

    # A brace group that may itself contain one more level of brace groups.
    nested = r"\{(?:[^{}]|\{[^{}]*\})*\}"
    names = re.findall(
        r"\\author(?:\[[^\]]*\])?\{((?:[^{}]|" + nested + r")+)\}",
        content,
    )
    if not names:
        # Unbalanced or deeper nesting: fall back to the simple pattern.
        names = re.findall(r"\\author\{(.+?)\}", content, re.DOTALL)
    if names:
        return [name.strip() for name in names]

    # No \author command at all: treat the file as a plain separated list.
    parts = re.split(r"[,;&]|\\and", content)
    return [part.strip() for part in parts if part.strip()]
def resolve_tags(
    template: str,
    tags: dict[str, str],
) -> str:
    """Resolve tags in a template string.

    All placeholders are substituted in a single pass over the template, so
    text inserted for one tag is never scanned again. A value that happens to
    contain another placeholder (for example a title containing the literal
    text ``{abstract}``) is therefore inserted verbatim instead of being
    expanded a second time.

    Args:
        template: Template string with {tag} placeholders.
        tags: Dictionary of tag -> value mappings.

    Returns:
        Template with every known {tag} replaced by its value; placeholders
        whose tag is not in *tags* are left untouched.
    """
    if not tags:
        return template

    # Map the literal placeholder text ("{title}") to its replacement.
    replacements = {f"{{{tag}}}": value for tag, value in tags.items()}
    placeholder = re.compile("|".join(re.escape(key) for key in replacements))

    # A callable replacement inserts the value literally (no backslash or
    # group-reference processing), matching str.replace semantics.
    return placeholder.sub(lambda found: replacements[found.group(0)], template)
def get_template(search_type: str) -> str:
    """Load the Asta prompt template text for a search type.

    Args:
        search_type: One of "related" or "coauthors".

    Returns:
        The template content with {tag} placeholders, read as UTF-8.

    Raises:
        ValueError: If search_type is invalid.
        FileNotFoundError: If the resolved template file does not exist.
    """
    allowed = ("related", "coauthors")
    if search_type in allowed:
        location = _get_template_path(search_type)
        if location.exists():
            return location.read_text(encoding="utf-8")
        raise FileNotFoundError(f"Template not found: {location}")

    raise ValueError(
        f"Invalid search_type: '{search_type}'. Use 'related' or 'coauthors'."
    )
def build_prompt(
    template: str,
    title: str = "",
    abstract: str = "",
    keywords: list[str] | None = None,
    authors: list[str] | None = None,
) -> str:
    """Fill a template with manuscript metadata.

    Empty or missing values are replaced by a parenthesised fallback text so
    that no placeholder resolves to an empty string.

    Args:
        template: Template string with {tag} placeholders.
        title: Manuscript title.
        abstract: Manuscript abstract.
        keywords: List of keywords (joined with ", ").
        authors: List of author names (joined with ", ").

    Returns:
        Final prompt with all tags resolved.
    """
    if keywords:
        keyword_text = ", ".join(keywords)
    else:
        keyword_text = "(No keywords)"

    if authors:
        author_text = ", ".join(authors)
    else:
        author_text = "(No authors listed)"

    return resolve_tags(
        template,
        {
            "title": title or "(No title provided)",
            "abstract": abstract or "(No abstract provided)",
            "keywords": keyword_text,
            "authors": author_text,
        },
    )
def generate_ai2_prompt(
    project_path: Path,
    search_type: str = "related",
) -> AI2PromptResult:
    """Generate an AI2 Asta prompt from the files of a manuscript project.

    The prompt is meant for Semantic Scholar's Asta AI, to find related
    papers or potential collaborators. Failures are reported through the
    returned dictionary rather than raised.

    Args:
        project_path: Path to scitex/writer project directory.
            Should contain 00_shared/ and 01_manuscript/contents/.
        search_type: Type of search. One of:
            - "related": Find related papers
            - "coauthors": Find potential collaborators

    Returns:
        Dictionary with:
            - success: Whether generation succeeded
            - prompt: The generated prompt for AI2 Asta
            - search_type: The search type used
            - next_steps: List of suggested next steps
            - error: Error message if failed, None otherwise

    Environment Variables:
        SCITEX_WRITER_PROMPT_ASTA_RELATED: Custom template for related papers
        SCITEX_WRITER_PROMPT_ASTA_COAUTHORS: Custom template for collaborators
    """

    def _failure(message: str, steps: list[str] | None = None) -> AI2PromptResult:
        # All failures share one shape: no prompt plus an error message.
        return AI2PromptResult(
            success=False,
            prompt="",
            search_type=search_type,
            next_steps=[] if steps is None else steps,
            error=message,
        )

    root = Path(project_path)

    # An invalid search type and a missing template are reported the same way.
    try:
        template = get_template(search_type)
    except (ValueError, FileNotFoundError) as exc:
        return _failure(str(exc))

    shared_path = root / "00_shared"
    contents_path = root / "01_manuscript" / "contents"
    if not shared_path.exists():
        return _failure(f"Shared directory not found: {shared_path}")

    # Gather the manuscript metadata used to fill the template.
    title = _extract_title(shared_path)
    abstract = _extract_abstract(contents_path)
    keywords = _extract_keywords(shared_path)
    authors = _extract_authors(shared_path)

    # At least one of title / abstract is required for a meaningful prompt.
    if not (title or abstract):
        return _failure(
            "No title or abstract found. Cannot generate prompt.",
            [
                "Add title to 00_shared/title.tex",
                "Add abstract to 01_manuscript/contents/abstract.tex",
            ],
        )

    prompt = build_prompt(
        template=template,
        title=title,
        abstract=abstract,
        keywords=keywords,
        authors=authors,
    )

    # The first two steps are common; the rest depend on the search type.
    next_steps = [
        "Go to https://www.semanticscholar.org/product/semantic-reader",
        "Paste the generated prompt",
    ]
    if search_type == "related":
        next_steps.append("Review and save relevant papers to your library")
    else:  # coauthors
        next_steps.extend(
            [
                "Review suggested researchers and their work",
                "Consider reaching out for collaboration",
            ]
        )

    return AI2PromptResult(
        success=True,
        prompt=prompt,
        search_type=search_type,
        next_steps=next_steps,
        error=None,
    )
# EOF