#!/usr/bin/env python3 # -*- coding: utf-8 -*- # File: src/scitex_writer/migration/_parsing.py """LaTeX parsing helpers for Overleaf migration.""" import re from pathlib import Path from typing import Optional # Image extensions recognised as figures IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".pdf", ".eps", ".svg", ".tif", ".tiff"} # Table data extensions TABLE_EXTS = {".csv", ".tsv"} # Priority names for detecting the main .tex file MAIN_TEX_PRIORITY = ["main.tex", "paper.tex", "manuscript.tex", "article.tex"] # IMRAD section patterns (filename or \section{} heading match) SECTION_PATTERNS = { "abstract": [r"abstract", r"summary"], "introduction": [r"intro", r"background"], "methods": [r"method", r"material", r"experimental", r"procedure"], "results": [r"result", r"finding", r"observation"], "discussion": [r"discuss", r"conclu", r"implication"], } # Canonical scitex-writer section filenames IMRAD_SECTIONS = ["abstract", "introduction", "methods", "results", "discussion"] def read_tex(path: Path) -> str: """Read a .tex file with encoding fallback.""" for enc in ("utf-8", "latin-1"): try: return path.read_text(encoding=enc) except UnicodeDecodeError: continue return "" def detect_main_tex(extracted_dir: Path) -> Optional[Path]: """Find the root .tex file containing \\documentclass. Strategy: 1. Scan all .tex files for \\documentclass (not in comments) 2. If exactly one found, return it 3. If multiple: prefer main.tex > paper.tex > manuscript.tex > article.tex 4. If still ambiguous, prefer shallowest directory depth """ candidates = [] for tex_file in extracted_dir.rglob("*.tex"): content = read_tex(tex_file) if re.search(r"^[^%]*\\documentclass", content, re.MULTILINE): candidates.append(tex_file) if not candidates: return None if len(candidates) == 1: return candidates[0] for name in MAIN_TEX_PRIORITY: matches = [c for c in candidates if c.name == name] if matches: return matches[0] return min(candidates, key=lambda p: len(p.relative_to(extracted_dir).parts)) def parse_inputs(main_tex_path: Path, base_dir: Path, _depth: int = 0) -> list[dict]: """Parse \\input{} and \\include{} directives from a .tex file. Recursively follows nested \\input up to depth 10. """ if _depth > 10: return [] content = read_tex(main_tex_path) results = [] pattern = re.compile(r"^[^%\n]*\\(input|include)\{([^}]+)\}", re.MULTILINE) for match in pattern.finditer(content): cmd = match.group(1) arg = match.group(2).strip() rel_path = Path(arg) if not rel_path.suffix: rel_path = rel_path.with_suffix(".tex") resolved = (main_tex_path.parent / rel_path).resolve() if not resolved.exists(): resolved = (base_dir / rel_path).resolve() entry = { "command": cmd, "arg": arg, "resolved_path": resolved, "exists": resolved.exists(), } results.append(entry) if resolved.exists() and resolved.suffix == ".tex": nested = parse_inputs(resolved, base_dir, _depth + 1) results.extend(nested) return results def extract_metadata(content: str) -> dict: """Extract \\title{}, \\author{}, keywords from main .tex content.""" metadata = {"title": None, "authors_block": None, "keywords": None} title_match = re.search(r"\\title\{([^}]+)\}", content) if title_match: metadata["title"] = title_match.group(1).strip() author_match = re.search(r"\\author\{([^}]+)\}", content) if author_match: metadata["authors_block"] = author_match.group(1).strip() kw_match = re.search( r"\\begin\{keyword[s]?\}(.*?)\\end\{keyword[s]?\}", content, re.DOTALL ) if not kw_match: kw_match = re.search(r"\\keywords\{([^}]+)\}", content) if kw_match: metadata["keywords"] = kw_match.group(1).strip() return metadata def classify_section(tex_path: Path, content: str) -> Optional[str]: """Map a .tex file to an IMRAD section name.""" name = tex_path.stem.lower() for section, patterns in SECTION_PATTERNS.items(): if any(re.search(p, name) for p in patterns): return section section_cmd = re.search(r"\\section\*?\{([^}]+)\}", content) if section_cmd: heading = section_cmd.group(1).lower() for section, patterns in SECTION_PATTERNS.items(): if any(re.search(p, heading) for p in patterns): return section return None def split_inline_sections(content: str) -> dict[str, str]: """Split monolithic main.tex body into IMRAD sections by \\section{} boundaries.""" pattern = re.compile(r"\\section\*?\{([^}]+)\}") matches = list(pattern.finditer(content)) if not matches: return {} sections = {} for i, match in enumerate(matches): heading = match.group(1).strip() start = match.start() end = matches[i + 1].start() if i + 1 < len(matches) else len(content) section_content = content[start:end].strip() heading_lower = heading.lower() section_name = None for name, pats in SECTION_PATTERNS.items(): if any(re.search(p, heading_lower) for p in pats): section_name = name break if section_name: if section_name in sections: sections[section_name] += "\n\n" + section_content else: sections[section_name] = section_content return sections def find_bib_files(d: Path) -> list[Path]: """Find all .bib files.""" return sorted(d.rglob("*.bib")) def find_image_files(d: Path) -> list[Path]: """Find all image files, excluding compiled-looking PDFs.""" images = [] skip_stems = {"main", "output", "manuscript", "paper"} for f in d.rglob("*"): if f.suffix.lower() in IMAGE_EXTS and f.is_file(): if f.suffix.lower() == ".pdf" and f.stem in skip_stems: continue images.append(f) return sorted(images) def find_table_files(d: Path) -> list[Path]: """Find CSV and TSV files.""" return sorted(f for f in d.rglob("*") if f.suffix.lower() in TABLE_EXTS) def find_style_files(d: Path) -> list[Path]: """Find custom .cls, .sty, and .bst files.""" styles = [] for ext in ("*.cls", "*.sty", "*.bst"): styles.extend(d.rglob(ext)) return sorted(styles) def unique_dest(dest: Path) -> Path: """Return dest if it doesn't exist; otherwise append _N suffix.""" if not dest.exists(): return dest stem, suffix = dest.stem, dest.suffix n = 1 while True: candidate = dest.parent / f"{stem}_{n}{suffix}" if not candidate.exists(): return candidate n += 1 # EOF