ArXiv API Quick Reference

API Endpoint

The ArXiv API uses a single endpoint for all queries:

http://export.arxiv.org/api/query

No authentication required. The API returns results in Atom 1.0 XML format.

Rate Limits

ArXiv has no hard rate limits but requires responsible usage (Terms of Use):

  • Wait at least 3 seconds between requests when making multiple calls
  • Maximum 30,000 results per query
  • Results retrieved in slices up to 2,000 at a time
  • Queries returning >1,000 results should be refined for performance

See the User Manual for detailed guidelines.

Query Parameters

Essential parameters for API requests:

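| Parameter    | Description                                                | Example       |
|--------------|------------------------------------------------------------|---------------|
| search_query | Search terms with field prefixes                           | cat:cs.LG     |
| start        | Starting index (0-based)                                   | 0             |
| max_results  | Number of results to return                                | 10            |
| sortBy       | Sort order (relevance, lastUpdatedDate, or submittedDate)  | submittedDate |
| sortOrder    | Sort direction (ascending or descending)                   | descending    |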

Search Query Syntax

ArXiv queries combine field prefixes with the boolean operators AND, OR, and ANDNOT (see the query details in the User Manual):

Field Prefixes

ti:  Title
au:  Author  
abs: Abstract
cat: Subject category
id:  ArXiv ID

Examples

# Each example below is a standalone value for the `search_query` parameter;
# `query` is simply reassigned from one example to the next.

# Papers in machine learning category
query = "cat:cs.LG"

# Papers by specific author
query = "au:goodfellow"

# Title containing "attention"
query = "ti:attention"

# Combine with AND/OR.
# A multi-word phrase must be double-quoted; without the quotes only the
# first word ("deep") would be bound to the ti: field.
query = 'au:bengio AND ti:"deep learning"'

# Complex query with grouping
query = "(cat:cs.LG OR cat:cs.AI) AND ti:transformer"

Making Requests with Python

Basic request pattern:

import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET

def search_arxiv(query, max_results=10):
    """Run one ArXiv API query and return the parsed Atom feed.

    The request always starts at index 0 and asks for results ordered by
    submission date, newest first.

    Args:
        query: Value for the ``search_query`` parameter (e.g. ``"cat:cs.LG"``).
        max_results: How many entries to request.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the response.
    """
    # urlencode takes care of escaping spaces, colons and parentheses.
    query_string = urllib.parse.urlencode({
        'search_query': query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
    })
    request_url = "http://export.arxiv.org/api/query" + "?" + query_string

    # Give up after 30 seconds rather than hanging on a stalled connection.
    with urllib.request.urlopen(request_url, timeout=30) as resp:
        payload = resp.read()

    return ET.fromstring(payload)

# Example usage: request 5 papers from the cs.LG category.
# search_arxiv sorts by submission date (descending) and returns the root
# element of the parsed XML response, so `results` is an XML element, not a list.
results = search_arxiv("cat:cs.LG", max_results=5)

Parsing Response Data

The response uses Atom XML with namespaces:

def parse_arxiv_response(xml_root):
    """Extract paper data from an ArXiv Atom XML response.

    Args:
        xml_root: Root ``<feed>`` element of the API response, e.g. the
            value returned by ``ET.fromstring``.

    Returns:
        A list with one dict per ``<entry>``, each holding:
          * ``id``: the ArXiv identifier (including any version suffix),
          * ``title`` / ``summary``: text with surrounding whitespace removed,
          * ``authors``: list of author names,
          * ``categories``: list of category terms (e.g. ``"cs.LG"``).
        Missing or empty elements yield ``''`` or ``[]`` instead of raising.
    """
    ns = {'atom': 'http://www.w3.org/2005/Atom'}

    def text_of(parent, path):
        # findtext gives '' for an absent or empty element, so a sparse
        # entry can never trigger an AttributeError on `.text`.
        return (parent.findtext(path, default='', namespaces=ns) or '').strip()

    papers = []
    for entry in xml_root.findall('atom:entry', ns):
        id_url = text_of(entry, 'atom:id')
        if '/abs/' in id_url:
            # Keep everything after /abs/ so old-style identifiers such as
            # "hep-th/9901001v1" retain their archive prefix.
            arxiv_id = id_url.split('/abs/', 1)[1]
        else:
            # Not an abstract URL: fall back to the last path segment.
            arxiv_id = id_url.rsplit('/', 1)[-1]

        authors = []
        for author in entry.findall('atom:author', ns):
            name = text_of(author, 'atom:name')
            if name:
                authors.append(name)

        # <category> is a plain Atom element; only <arxiv:primary_category>
        # lives in the http://arxiv.org/schemas/atom extension namespace.
        categories = [
            cat.get('term')
            for cat in entry.findall('atom:category', ns)
            if cat.get('term')
        ]

        papers.append({
            'id': arxiv_id,
            'title': text_of(entry, 'atom:title'),
            'authors': authors,
            'summary': text_of(entry, 'atom:summary'),
            'categories': categories,
        })

    return papers

Pagination for Large Results

Fetch large result sets in pages (a single request can return at most 2,000 results; this example uses pages of 100):

import time

def fetch_all_results(query, total_results=100):
    """Fetch up to ``total_results`` papers using paged, rate-limited requests.

    Pages of at most 100 entries are requested one after another, with a
    3-second pause between requests. Paging stops early as soon as a page
    comes back empty, so a query with fewer matches than ``total_results``
    does not keep issuing (and sleeping between) pointless requests.

    Args:
        query: Value for the ``search_query`` parameter.
        total_results: Upper bound on the number of papers to collect.
            Zero or a negative value returns ``[]`` without any request.

    Returns:
        A list of paper dicts as produced by ``parse_arxiv_response``.
    """
    base_url = "http://export.arxiv.org/api/query"
    page_size = 100
    results = []
    start = 0

    while start < total_results:
        params = {
            'search_query': query,
            'start': start,
            # The last page may be shorter than a full page.
            'max_results': min(page_size, total_results - start),
        }
        url = f"{base_url}?{urllib.parse.urlencode(params)}"

        with urllib.request.urlopen(url, timeout=30) as response:
            root = ET.fromstring(response.read())

        papers = parse_arxiv_response(root)
        if not papers:
            # No entries on this page: nothing further to fetch.
            break
        results.extend(papers)

        start += page_size

        # Rate limit: pause only when another request will follow.
        if start < total_results:
            time.sleep(3)

    return results

Common Query Patterns

# Recent papers in a category (last 7 days)
from datetime import datetime, timedelta, timezone

# submittedDate bounds use the form YYYYMMDDHHMM and are interpreted in GMT,
# so the window is computed from the current UTC time, not local time.
end_date = datetime.now(timezone.utc)
start_date = end_date - timedelta(days=7)
# The range runs from 00:00 on the first day to 23:59 on the last day.
date_query = f"cat:cs.LG AND submittedDate:[{start_date:%Y%m%d}0000 TO {end_date:%Y%m%d}2359]"

# Papers by multiple authors
author_query = "(au:lecun OR au:hinton OR au:bengio)"

# Exclude certain categories
exclude_query = "cat:cs.LG ANDNOT cat:cs.CV"

# Papers with specific terms in abstract
abstract_query = "abs:transformer AND abs:attention"

Error Handling

Handle common API issues:

import urllib.error

def robust_arxiv_fetch(query, max_results=10, max_retries=3):
    """Call ``search_arxiv`` with retries and basic error reporting.

    Args:
        query: Value for the ``search_query`` parameter.
        max_results: Number of entries to request.
        max_retries: Maximum number of attempts; 0 or less makes no attempt.

    Returns:
        Whatever ``search_arxiv`` returns on success, or ``None`` when the
        query is rejected (HTTP 400), another non-retryable HTTP error
        occurs, or every attempt fails.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            return search_arxiv(query, max_results)

        # HTTPError subclasses URLError, so it has to be caught first.
        except urllib.error.HTTPError as e:
            if e.code == 400:
                # A malformed query will not succeed on retry.
                print(f"Bad query syntax: {query}")
                return None
            if e.code != 503:
                print(f"HTTP Error {e.code}: {e.reason}")
                return None
            if is_last_attempt:
                # Nothing left to retry: do not sleep before giving up.
                print("Service unavailable, giving up")
                return None
            # Exponential backoff: 3s, 9s, 27s, ...
            wait_time = 3 ** (attempt + 1)
            print(f"Service unavailable, waiting {wait_time}s...")
            time.sleep(wait_time)

        except urllib.error.URLError as e:
            print(f"Network error: {e.reason}")
            if is_last_attempt:
                return None
            time.sleep(3)

    return None