#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: scripts/python/explore_bibtex.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = "./scripts/python/explore_bibtex.py"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

"""
BibTeX Explorer - Analyze and filter a bibliography

Functionalities:
  - Loads BibTeX file with Papers.from_bibtex()
  - Filters by citation count, impact factor, year, keywords
  - Sorts by multiple criteria
  - Compares against currently cited papers in manuscript
  - Identifies high-impact uncited papers
  - Exports filtered results

Dependencies:
  - packages:
    - bibtexparser (only needed to read/write .bib files)
    - argparse
    - pathlib

IO:
  - input-files:
    - BibTeX file (enriched with citation_count, journal_impact_factor)
    - Manuscript .tex files (for cited papers comparison)
  - output-files:
    - Filtered results to stdout or file
"""

import argparse
import re
import statistics
import sys
from pathlib import Path
from typing import Callable, Iterable, List, Optional, Set, Union

# bibtexparser is only required for reading/writing .bib files.  Importing this
# module must not terminate the interpreter, so a missing package is recorded
# here and reported when it is actually needed (Papers.from_bibtex,
# Papers.save, and at the start of main()).
try:
    import bibtexparser
except ImportError:
    bibtexparser = None

_BIBTEXPARSER_MISSING_MSG = (
    "Error: bibtexparser is required. Install with: pip install bibtexparser"
)

# Manuscript sections scanned for citations when the caller names none.
_DEFAULT_TEX_FILES = (
    "abstract.tex",
    "introduction.tex",
    "methods.tex",
    "results.tex",
    "discussion.tex",
)

# Any "*cite*" command (\cite, \citep, \citet*, \parencite, \textcite, ...),
# optionally starred, followed by zero or more [optional] arguments and the
# {key list}.
_CITE_RE = re.compile(r"\\[A-Za-z]*cite[A-Za-z]*\*?\s*(?:\[[^\]]*\]\s*)*\{([^}]+)\}")

# \author[X]{Name}; the name may contain one level of nested {...} groups,
# e.g. \author[1]{Jane Doe\corref{cor1}}.
_AUTHOR_CMD_RE = re.compile(r"\\author\[[^\]]+\]\{((?:[^{}]|\{[^{}]*\})+)\}")

# A LaTeX command together with its (optional) single braced argument.
_LATEX_CMD_RE = re.compile(r"\\[a-zA-Z]+(?:\{[^}]*\})?")

# BibTeX separates authors with "and" surrounded by any whitespace
# (including the newlines found in multi-line author fields).
_AUTHOR_SEP_RE = re.compile(r"\s+and\s+")


def _parse_number(raw, cast):
    """Convert a raw BibTeX field with ``cast``; return None when it is absent or malformed."""
    if raw is None:
        return None
    try:
        return cast(raw)
    except (TypeError, ValueError):
        # Best effort by design: a malformed metric is treated as "unknown".
        return None


def _within(value, lower, upper) -> bool:
    """Tell whether ``value`` lies in the closed range [lower, upper].

    A bound of None is open.  When at least one bound is given, an unknown
    (None) value never matches; when no bound is given everything matches.
    Zero is a legitimate value and is compared like any other number.
    """
    if lower is None and upper is None:
        return True
    if value is None:
        return False
    if lower is not None and value < lower:
        return False
    if upper is not None and value > upper:
        return False
    return True


def _truncate(text, width: int) -> str:
    """Shorten ``text`` to at most ``width`` characters, marking the cut with '..'."""
    text = text or ""
    if len(text) > width:
        return text[: width - 2] + ".."
    return text


# Simple Paper and Papers classes (lightweight implementation)
class Paper:
    """Lightweight Paper class for BibTeX entries.

    All fields are passed as keyword arguments; unknown keywords are ignored
    and missing ones fall back to an empty/None default.
    """

    def __init__(self, **kwargs):
        self.key = kwargs.get("key", "")
        self.title = kwargs.get("title", "")
        self.authors = kwargs.get("authors", [])
        self.year = kwargs.get("year")
        self.journal = kwargs.get("journal")
        self.citation_count = kwargs.get("citation_count")
        self.journal_impact_factor = kwargs.get("journal_impact_factor")
        self.abstract = kwargs.get("abstract", "")
        self.doi = kwargs.get("doi")
        self.keywords = kwargs.get("keywords", [])
        # Raw bibtexparser entry, kept so that save() can round-trip all fields.
        self._original_entry = kwargs.get("_original_entry", {})

    def __repr__(self):
        # ``or ""`` keeps repr() usable when a paper was built with title=None.
        return f"Paper(key='{self.key}', title='{(self.title or '')[:50]}...')"


class Papers:
    """Lightweight, list-backed collection of Paper objects."""

    def __init__(self, papers: List[Paper]):
        # Copy so that later mutation of the caller's list cannot affect us.
        self._papers = list(papers)

    def __len__(self):
        return len(self._papers)

    def __iter__(self):
        return iter(self._papers)

    def __getitem__(self, index):
        return self._papers[index]

    @staticmethod
    def _paper_from_entry(entry: dict) -> Paper:
        """Build a Paper from one bibtexparser entry dict."""
        authors = [
            name.strip()
            for name in _AUTHOR_SEP_RE.split(entry.get("author", ""))
            if name.strip()
        ]
        keywords = [
            word.strip()
            for word in entry.get("keywords", "").split(",")
            if word.strip()
        ]
        return Paper(
            key=entry.get("ID", ""),
            title=entry.get("title", "").strip("{}"),
            authors=authors,
            year=_parse_number(entry.get("year"), int),
            journal=entry.get("journal", ""),
            citation_count=_parse_number(entry.get("citation_count"), int),
            journal_impact_factor=_parse_number(
                entry.get("journal_impact_factor"), float
            ),
            abstract=entry.get("abstract", ""),
            doi=entry.get("doi", ""),
            keywords=keywords,
            _original_entry=entry,
        )

    @classmethod
    def from_bibtex(cls, filepath: Path) -> "Papers":
        """Load papers from BibTeX file.

        Args:
            filepath: Path of the .bib file (read as UTF-8)

        Returns:
            Papers collection, one Paper per BibTeX entry

        Raises:
            ImportError: If bibtexparser is not installed
        """
        if bibtexparser is None:
            raise ImportError(_BIBTEXPARSER_MISSING_MSG)
        with open(filepath, "r", encoding="utf-8") as f:
            bib_db = bibtexparser.load(f)
        return cls([cls._paper_from_entry(entry) for entry in bib_db.entries])

    def filter(self, condition=None, **kwargs) -> "Papers":
        """Filter papers by condition or criteria.

        Args:
            condition: Optional predicate ``Paper -> bool``.  When given, it is
                the only filter applied and the keyword criteria are ignored.
            **kwargs: Optional criteria; a value of None disables a criterion.
                Range bounds (inclusive): min_citations, max_citations,
                min_impact_factor, max_impact_factor, year_min, year_max.
                Case-insensitive substring matches: keyword (title, abstract
                or keywords), journal, author.

        Returns:
            New Papers collection with the matching papers, in original order
        """
        if condition and callable(condition):
            return Papers([p for p in self._papers if condition(p)])

        # Numeric range filters.  Papers whose metric is unknown (None) are
        # dropped as soon as a bound on that metric is set; a metric of 0 is a
        # real value and is compared normally.
        selected = [
            p
            for p in self._papers
            if _within(
                p.citation_count,
                kwargs.get("min_citations"),
                kwargs.get("max_citations"),
            )
            and _within(
                p.journal_impact_factor,
                kwargs.get("min_impact_factor"),
                kwargs.get("max_impact_factor"),
            )
            and _within(p.year, kwargs.get("year_min"), kwargs.get("year_max"))
        ]

        # Text filters
        keyword = kwargs.get("keyword")
        if keyword:
            kw = keyword.lower()
            selected = [
                p
                for p in selected
                if (p.title and kw in p.title.lower())
                or (p.abstract and kw in p.abstract.lower())
                or any(kw in k.lower() for k in (p.keywords or []))
            ]

        journal = kwargs.get("journal")
        if journal:
            j = journal.lower()
            selected = [p for p in selected if p.journal and j in p.journal.lower()]

        author = kwargs.get("author")
        if author:
            a = author.lower()
            selected = [
                p
                for p in selected
                if any(a in name.lower() for name in (p.authors or []))
            ]

        return Papers(selected)

    def sort_by(self, key_func: Union[str, Callable], reverse=False) -> "Papers":
        """Sort papers by key function or field name.

        Args:
            key_func: Either a Paper attribute name or a callable sort key
            reverse: Sort in descending order when True

        Returns:
            New, sorted Papers collection.  When sorting by attribute name,
            papers whose attribute is None are always placed last.
        """
        if not isinstance(key_func, str):
            return Papers(sorted(self._papers, key=key_func, reverse=reverse))

        field_name = key_func

        def none_last_key(paper):
            value = getattr(paper, field_name, None)
            missing = value is None
            # The leading flag decides the position of None values, so the
            # value itself is never compared against a placeholder of a
            # different type (which would fail for string fields).
            return (not missing, value) if reverse else (missing, value)

        return Papers(sorted(self._papers, key=none_last_key, reverse=reverse))

    def save(self, filepath: Path, format="bibtex"):
        """Save papers to file.

        Args:
            filepath: Destination path (written as UTF-8)
            format: Output format; only "bibtex" is supported

        Raises:
            ValueError: If ``format`` is not "bibtex"
            ImportError: If bibtexparser is not installed
        """
        if format != "bibtex":
            # Previously an unknown format was silently ignored and nothing
            # was written, while the caller still reported a successful export.
            raise ValueError(
                f"Unsupported export format: {format!r} (only 'bibtex' is supported)"
            )
        if bibtexparser is None:
            raise ImportError(_BIBTEXPARSER_MISSING_MSG)

        # Reconstruct BibTeX from the untouched original entries
        bib_db = bibtexparser.bibdatabase.BibDatabase()
        bib_db.entries = [p._original_entry for p in self._papers]
        with open(filepath, "w", encoding="utf-8") as f:
            bibtexparser.dump(bib_db, f)


def get_cited_papers(
    manuscript_dir: Path, tex_files: Optional[Iterable[str]] = None
) -> Set[str]:
    """Extract all cited paper keys from manuscript .tex files.

    Recognizes \\cite and its common variants (\\citep, \\citet, \\parencite,
    starred forms, optional [..] arguments).  Missing files are skipped.

    Args:
        manuscript_dir: Directory containing manuscript .tex files
        tex_files: File names to scan inside ``manuscript_dir``
            (default: abstract, introduction, methods, results, discussion)

    Returns:
        Set of cited paper keys
    """
    names = _DEFAULT_TEX_FILES if tex_files is None else tuple(tex_files)

    cited: Set[str] = set()
    for fname in names:
        fpath = manuscript_dir / fname
        if not fpath.exists():
            continue
        content = fpath.read_text(encoding="utf-8")
        for key_list in _CITE_RE.findall(content):
            # Drop empty keys produced by stray commas such as \cite{a,}
            cited.update(key.strip() for key in key_list.split(",") if key.strip())
    return cited


def extract_coauthors_from_tex(authors_tex_path: Path) -> List[str]:
    """Extract co-author names from authors.tex file.

    Args:
        authors_tex_path: Path to authors.tex file

    Returns:
        List of author names (last names), in file order; empty when the
        file does not exist
    """
    if not authors_tex_path.exists():
        return []

    content = authors_tex_path.read_text(encoding="utf-8")

    last_names = []
    for raw_name in _AUTHOR_CMD_RE.findall(content):
        # Remove LaTeX commands with their argument, e.g. \corref{cor1}
        name = _LATEX_CMD_RE.sub("", raw_name)
        # "~" is a non-breaking space in LaTeX; leftover braces are grouping only
        name = name.replace("~", " ").replace("{", "").replace("}", "")
        # Last name = final token (assumes "First Last" or "First Middle Last")
        parts = name.split()
        if parts:
            last_names.append(parts[-1])
    return last_names


def calculate_score(paper: Paper, weights: Optional[dict] = None) -> float:
    """Calculate composite score for ranking papers.

    Args:
        paper: Paper object
        weights: Dictionary with 'citations' and 'impact_factor' weights
            (defaults: 1.0 and 10.0; a missing key uses its default)

    Returns:
        Composite score; unknown metrics contribute 0
    """
    if weights is None:
        weights = {}
    citation_weight = weights.get("citations", 1.0)
    impact_weight = weights.get("impact_factor", 10.0)

    citations = paper.citation_count or 0
    impact = paper.journal_impact_factor or 0
    return (citations * citation_weight) + (impact * impact_weight)


def print_papers_table(
    papers: Papers,
    cited_keys: Optional[Set[str]] = None,
    show_score: bool = True,
    max_papers: Optional[int] = None,
):
    """Print papers in formatted table.

    Args:
        papers: Papers collection to display
        cited_keys: Set of already cited paper keys (to mark them)
        show_score: Whether to show composite score
        max_papers: Maximum number of papers to display (None or 0: no limit)
    """
    if len(papers) == 0:
        print("No papers match the criteria.")
        return

    # Prepare header
    columns = [("Key", 40), ("Cites", 7), ("IF", 6)]
    if show_score:
        columns.append(("Score", 8))
    columns.extend([("Year", 6), ("Journal", 25), ("Title", 50)])

    # Print header
    print("=" * 145)
    print("".join(f"{name:<{width}} " for name, width in columns).rstrip())
    print("-" * 145)

    visible = list(papers)
    if max_papers:
        visible = visible[: max(max_papers, 0)]

    # Print papers
    for paper in visible:
        is_cited = bool(cited_keys) and paper.key in cited_keys
        # Two-character marker + 38-character key == the 40-wide "Key" column
        prefix = "✓ " if is_cited else "  "

        # Format fields (0 is a real value and is printed, None becomes N/A)
        key = _truncate(paper.key, 38)
        cites = "N/A" if paper.citation_count is None else str(paper.citation_count)
        impact = (
            "N/A"
            if paper.journal_impact_factor is None
            else f"{paper.journal_impact_factor:.1f}"
        )
        year = str(paper.year) if paper.year else "N/A"
        journal = _truncate(paper.journal, 25) or "N/A"
        title = _truncate(paper.title, 50) or "No title"

        # Build row
        row = f"{prefix}{key:<38} {cites:<7} {impact:<6} "
        if show_score:
            score = f"{calculate_score(paper):.0f}"
            row += f"{score:<8} "
        row += f"{year:<6} {journal:<25} {title}"
        print(row)

    print("=" * 145)
    print(f"Showing {len(visible)} of {len(papers)} papers")


def print_summary_stats(papers: Papers, cited_keys: Optional[Set[str]] = None):
    """Print summary statistics for the paper collection.

    Metrics equal to 0 are treated like missing metrics in the coverage
    counts and in the min/max/mean/median figures.

    Args:
        papers: Papers collection
        cited_keys: Set of cited paper keys
    """
    print("\n" + "=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)

    total = len(papers)

    def pct(count: int) -> float:
        # An empty collection must not raise ZeroDivisionError.
        return count / total * 100 if total else 0.0

    def has_citations(p: Paper) -> bool:
        return p.citation_count is not None and p.citation_count > 0

    def has_impact(p: Paper) -> bool:
        return p.journal_impact_factor is not None and p.journal_impact_factor > 0

    with_citations = sum(1 for p in papers if has_citations(p))
    with_impact = sum(1 for p in papers if has_impact(p))
    with_both = sum(1 for p in papers if has_citations(p) and has_impact(p))

    print(f"Total papers: {total}")
    print(f"Papers with citation count: {with_citations} ({pct(with_citations):.1f}%)")
    print(f"Papers with impact factor: {with_impact} ({pct(with_impact):.1f}%)")
    print(f"Papers with both metrics: {with_both} ({pct(with_both):.1f}%)")

    if cited_keys:
        cited_count = sum(1 for p in papers if p.key in cited_keys)
        uncited_count = total - cited_count
        print(f"\nCited in manuscript: {cited_count} ({pct(cited_count):.1f}%)")
        print(f"Not yet cited: {uncited_count} ({pct(uncited_count):.1f}%)")

    # Citation statistics
    citations = [p.citation_count for p in papers if p.citation_count]
    if citations:
        print("\nCitation count statistics:")
        print(f" Min: {min(citations)}")
        print(f" Max: {max(citations)}")
        print(f" Mean: {sum(citations) / len(citations):.1f}")
        # True median (mean of the two middle values for an even count)
        print(f" Median: {statistics.median(citations):.1f}")

    # Impact factor statistics
    impacts = [p.journal_impact_factor for p in papers if p.journal_impact_factor]
    if impacts:
        print("\nImpact factor statistics:")
        print(f" Min: {min(impacts):.1f}")
        print(f" Max: {max(impacts):.1f}")
        print(f" Mean: {sum(impacts) / len(impacts):.1f}")
        print(f" Median: {statistics.median(impacts):.1f}")

    print()


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the explorer."""
    parser = argparse.ArgumentParser(
        description="Explore and analyze BibTeX files using scitex.scholar",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show all papers with metrics
  %(prog)s bibliography.bib

  # Filter high-impact papers (>100 citations, IF>5)
  %(prog)s bibliography.bib --min-citations 100 --min-if 5.0

  # Find uncited high-impact papers
  %(prog)s bibliography.bib --uncited --min-score 100

  # Filter by year and keyword
  %(prog)s bibliography.bib --year-min 2020 --keyword "seizure"

  # Sort by different criteria
  %(prog)s bibliography.bib --sort citation_count --reverse
  %(prog)s bibliography.bib --sort journal_impact_factor --reverse

  # Export filtered results
  %(prog)s bibliography.bib --min-citations 100 --output filtered.bib
        """,
    )

    # Required arguments
    parser.add_argument("bibtex_file", type=Path, help="Path to BibTeX file")

    # Filter arguments
    parser.add_argument("--min-citations", type=int, help="Minimum citation count")
    parser.add_argument("--max-citations", type=int, help="Maximum citation count")
    parser.add_argument("--min-if", type=float, help="Minimum impact factor")
    parser.add_argument("--max-if", type=float, help="Maximum impact factor")
    parser.add_argument("--min-score", type=float, help="Minimum composite score")
    parser.add_argument("--year-min", type=int, help="Minimum publication year")
    parser.add_argument("--year-max", type=int, help="Maximum publication year")
    parser.add_argument(
        "--keyword", type=str, help="Filter by keyword in title/abstract"
    )
    parser.add_argument(
        "--journal", type=str, help="Filter by journal name (partial match)"
    )
    parser.add_argument(
        "--author", type=str, help="Filter by author name (partial match)"
    )
    parser.add_argument(
        "--co-authors",
        action="store_true",
        help="Filter papers by manuscript co-authors from 00_shared/authors.tex",
    )
    parser.add_argument(
        "--authors-tex",
        type=Path,
        default=Path("./00_shared/authors.tex"),
        help="Path to authors.tex file (default: ./00_shared/authors.tex)",
    )

    # Comparison arguments
    parser.add_argument(
        "--manuscript-dir",
        type=Path,
        default=Path("./01_manuscript/contents"),
        help="Directory with manuscript .tex files (default: ./01_manuscript/contents)",
    )
    parser.add_argument(
        "--cited",
        action="store_true",
        help="Show only papers already cited in manuscript",
    )
    parser.add_argument(
        "--uncited",
        action="store_true",
        help="Show only papers NOT cited in manuscript",
    )

    # Sort arguments
    parser.add_argument(
        "--sort",
        type=str,
        choices=[
            "citation_count",
            "journal_impact_factor",
            "year",
            "title",
            "score",
        ],
        default="score",
        help="Sort papers by field (default: score)",
    )
    parser.add_argument(
        "--reverse", action="store_true", help="Sort in descending order"
    )

    # Display arguments
    parser.add_argument("--limit", type=int, help="Maximum number of papers to display")
    parser.add_argument(
        "--no-score", action="store_true", help="Hide composite score column"
    )
    parser.add_argument("--stats", action="store_true", help="Show summary statistics")

    # Output arguments
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Export filtered papers to BibTeX file",
    )
    return parser


def _apply_filters(
    papers: Papers, args: argparse.Namespace, cited_keys: Optional[Set[str]]
) -> Papers:
    """Apply every filter requested on the command line and return the result."""
    # Metric, year and text filters.  Unset options are None and are ignored
    # by Papers.filter, so an explicit 0 (e.g. --max-citations 0) is honored.
    filtered = papers.filter(
        min_citations=args.min_citations,
        max_citations=args.max_citations,
        min_impact_factor=args.min_if,
        max_impact_factor=args.max_if,
        year_min=args.year_min,
        year_max=args.year_max,
        keyword=args.keyword,
        journal=args.journal,
        author=args.author,
    )

    # Score filter (custom)
    if args.min_score is not None:
        filtered = filtered.filter(lambda p: calculate_score(p) >= args.min_score)

    # Co-authors filter
    if args.co_authors:
        if args.authors_tex.exists():
            coauthors = extract_coauthors_from_tex(args.authors_tex)
            print(f"✓ Found co-authors: {', '.join(coauthors)}\n")
            lowered = [name.lower() for name in coauthors]
            # Keep papers where any co-author last name appears in any author
            filtered = filtered.filter(
                lambda p: any(
                    name in author.lower() for name in lowered for author in p.authors
                )
            )
        else:
            print(f"Warning: authors.tex not found at {args.authors_tex}\n")

    # Cited/uncited filter.  ``is not None`` matters: an empty set means the
    # manuscript cites nothing, so --cited must yield no papers, not all.
    if cited_keys is not None:
        if args.cited:
            filtered = filtered.filter(lambda p: p.key in cited_keys)
        elif args.uncited:
            filtered = filtered.filter(lambda p: p.key not in cited_keys)

    return filtered


def main(argv: Optional[List[str]] = None):
    """Run the BibTeX explorer command-line interface.

    Args:
        argv: Argument list to parse (default: sys.argv[1:])
    """
    if bibtexparser is None:
        print(_BIBTEXPARSER_MISSING_MSG)
        sys.exit(1)

    args = _build_parser().parse_args(argv)

    # Validate inputs
    if not args.bibtex_file.exists():
        print(f"Error: BibTeX file not found: {args.bibtex_file}")
        sys.exit(1)

    if args.cited and args.uncited:
        print("Error: Cannot use --cited and --uncited together")
        sys.exit(1)

    # Load papers
    print(f"Loading papers from {args.bibtex_file}...")
    try:
        papers = Papers.from_bibtex(args.bibtex_file)
    except Exception as e:
        # Top-level boundary: report any parsing/IO problem and stop.
        print(f"Error loading BibTeX file: {e}")
        sys.exit(1)
    print(f"✓ Loaded {len(papers)} papers\n")

    # Get cited papers if the manuscript is available
    cited_keys = None
    if args.manuscript_dir.exists():
        cited_keys = get_cited_papers(args.manuscript_dir)
        print(f"✓ Found {len(cited_keys)} cited papers in manuscript\n")
    elif args.cited or args.uncited:
        print(f"Warning: Manuscript directory not found: {args.manuscript_dir}\n")

    # Apply filters
    filtered = _apply_filters(papers, args, cited_keys)
    print(f"Applied filters: {len(papers)} → {len(filtered)} papers\n")

    # Sort
    if args.sort == "score":
        # Composite score is always listed best-first, regardless of --reverse
        filtered = filtered.sort_by(calculate_score, reverse=True)
    else:
        filtered = filtered.sort_by(args.sort, reverse=args.reverse)

    # Show statistics
    if args.stats:
        print_summary_stats(filtered, cited_keys)

    # Display results
    print_papers_table(
        filtered,
        cited_keys=cited_keys,
        show_score=not args.no_score,
        max_papers=args.limit,
    )

    # Export if requested
    if args.output:
        filtered.save(args.output, format="bibtex")
        print(f"\n✓ Exported {len(filtered)} papers to {args.output}")


if __name__ == "__main__":
    main()

# EOF