ArXiv API Quick Reference
API Endpoint
The ArXiv API uses a single endpoint for all queries:
http://export.arxiv.org/api/query
No authentication required. The API returns results in Atom 1.0 XML format.
Rate Limits
ArXiv has no hard rate limits but requires responsible usage (Terms of Use):
- 3 second delay between requests when making multiple calls
- Maximum 30,000 results per query
- Results retrieved in slices up to 2,000 at a time
- Queries returning >1,000 results should be refined for performance
See the User Manual for detailed guidelines.
Query Parameters
Essential parameters for API requests:
| Parameter | Description | Example |
|---|---|---|
| `search_query` | Search terms with field prefixes | `cat:cs.LG` |
| `start` | Starting index (0-based) | `0` |
| `max_results` | Number of results to return | `10` |
| `sortBy` | Sort order | `submittedDate` |
| `sortOrder` | Sort direction | `descending` |
Search Query Syntax
ArXiv uses field prefixes and boolean operators (query details):
Field Prefixes
ti: Title
au: Author
abs: Abstract
cat: Subject category
id: ArXiv ID
Examples
# Papers in machine learning category
query = "cat:cs.LG"

# Papers by specific author
query = "au:goodfellow"

# Title containing "attention"
query = "ti:attention"

# Combine with AND/OR. A multi-word phrase must be wrapped in double quotes
# so that the whole phrase, not just its first word, is tied to the field prefix.
query = 'au:bengio AND ti:"deep learning"'

# Complex query with grouping
query = "(cat:cs.LG OR cat:cs.AI) AND ti:transformer"


## Making Requests with Python
Basic request pattern:
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
def search_arxiv(query, max_results=10):
    """Run one ArXiv API query and return the parsed Atom feed.

    Args:
        query: Search expression passed as ``search_query``
            (for example ``"cat:cs.LG"``).
        max_results: Number of entries to request, starting at index 0.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the response.
        Results are requested newest-first by submission date.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success status codes).
        xml.etree.ElementTree.ParseError: If the body is not valid XML.
    """
    endpoint = "http://export.arxiv.org/api/query"

    # urlencode takes care of escaping spaces, colons and parentheses.
    query_string = urllib.parse.urlencode(
        {
            "search_query": query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
    )

    # The timeout keeps a stalled connection from hanging the caller.
    with urllib.request.urlopen(f"{endpoint}?{query_string}", timeout=30) as reply:
        payload = reply.read()

    return ET.fromstring(payload)
# Example usage
results = search_arxiv("cat:cs.LG", max_results=5)

Parsing Response Data
The response uses Atom XML with namespaces:
def parse_arxiv_response(xml_root):
    """Extract paper data from an ArXiv Atom XML response.

    Args:
        xml_root: Root ``Element`` of the parsed Atom feed.

    Returns:
        A list with one dict per ``<entry>``, with the keys ``id``,
        ``title``, ``authors`` (list of names), ``summary`` and
        ``categories`` (list of category terms). Missing or empty text
        elements yield ``""`` rather than raising.
    """
    # Define namespace
    ns = {'atom': 'http://www.w3.org/2005/Atom'}

    def _text(parent, path):
        # findtext returns the default for a missing element and "" for an
        # empty one, so this never raises on incomplete entries.
        return parent.findtext(path, default='', namespaces=ns).strip()

    papers = []
    for entry in xml_root.findall('atom:entry', ns):
        # Extract ID: keep everything after "/abs/" so old-style identifiers
        # such as "hep-ex/0307015v1" retain their archive prefix.
        id_text = _text(entry, 'atom:id')
        _, marker, tail = id_text.partition('/abs/')
        arxiv_id = tail if marker else id_text.split('/')[-1]

        # Extract basic fields
        title = _text(entry, 'atom:title')
        summary = _text(entry, 'atom:summary')

        # Extract all authors, skipping author elements without a name
        authors = []
        for author in entry.findall('atom:author', ns):
            name = _text(author, 'atom:name')
            if name:
                authors.append(name)

        # Extract categories. <category> elements are in the Atom namespace;
        # only <arxiv:primary_category> uses the arxiv namespace.
        categories = [
            cat.get('term')
            for cat in entry.findall('atom:category', ns)
            if cat.get('term')
        ]

        papers.append({
            'id': arxiv_id,
            'title': title,
            'authors': authors,
            'summary': summary,
            'categories': categories,
        })
    return papers


## Pagination for Large Results
Handle results beyond the 2000-item slice limit:
import time
def fetch_all_results(query, total_results=100):
    """Fetch results with pagination and rate limiting.

    Args:
        query: Search expression passed as ``search_query``.
        total_results: Upper bound on the number of papers to collect.

    Returns:
        A list of paper dicts as produced by ``parse_arxiv_response``.
        It may hold fewer than ``total_results`` items when the query has
        fewer matches.

    Raises:
        urllib.error.URLError: If a page request fails.
        xml.etree.ElementTree.ParseError: If a response is not valid XML.
    """
    base_url = "http://export.arxiv.org/api/query"
    results = []
    start = 0
    chunk_size = min(100, total_results)  # Fetch at most 100 at a time

    while start < total_results:
        params = {
            'search_query': query,
            'start': start,
            # The last page may be smaller than a full chunk.
            'max_results': min(chunk_size, total_results - start),
        }
        url = f"{base_url}?{urllib.parse.urlencode(params)}"

        # Fetch chunk
        with urllib.request.urlopen(url, timeout=30) as response:
            xml_data = response.read()
        root = ET.fromstring(xml_data)

        # Parse and add to results
        papers = parse_arxiv_response(root)
        if not papers:
            # An empty page means the result set is exhausted; requesting
            # further pages would only add delays and load on the API.
            break
        results.extend(papers)

        # Update position
        start += chunk_size

        # Rate limit: 3 second delay between consecutive requests
        if start < total_results:
            time.sleep(3)

    return results


## Common Query Patterns
# Recent papers in a category (last 7 days)
from datetime import datetime, timedelta, timezone

# ArXiv interprets submittedDate bounds as GMT, so build the window in UTC
# instead of the machine's local time.
end_date = datetime.now(timezone.utc)
start_date = end_date - timedelta(days=7)
date_query = (
    f"cat:cs.LG AND submittedDate:[{start_date.strftime('%Y%m%d')}0000 "
    f"TO {end_date.strftime('%Y%m%d')}2359]"
)

# Papers by multiple authors
author_query = "(au:lecun OR au:hinton OR au:bengio)"

# Exclude certain categories
exclude_query = "cat:cs.LG ANDNOT cat:cs.CV"

# Papers with specific terms in abstract
abstract_query = "abs:transformer AND abs:attention"


## Error Handling
Handle common API issues:
import urllib.error
def robust_arxiv_fetch(query, max_results=10, max_retries=3):
    """Fetch with retry logic and error handling.

    Args:
        query: Search expression forwarded to ``search_arxiv``.
        max_results: Number of entries to request.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        Whatever ``search_arxiv`` returns on success, or ``None`` when the
        query is rejected, a non-retryable HTTP error occurs, or every
        attempt fails.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            return search_arxiv(query, max_results)
        # HTTPError is a subclass of URLError, so it must be caught first.
        except urllib.error.HTTPError as e:
            if e.code == 400:
                # A malformed query will not succeed on retry.
                print(f"Bad query syntax: {query}")
                return None
            if e.code == 503 and not is_last_attempt:
                # Service unavailable: back off exponentially (3s, 9s, 27s, ...)
                wait_time = 3 ** (attempt + 1)
                print(f"Service unavailable, waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            # Other HTTP errors, or a 503 with no attempts left: report and
            # give up immediately instead of sleeping with nothing to retry.
            print(f"HTTP Error {e.code}: {e.reason}")
            return None
        except urllib.error.URLError as e:
            print(f"Network error: {e.reason}")
            if is_last_attempt:
                return None
            time.sleep(3)
    return None


## Important Links
- API Basics - Overview and getting started
- User Manual - Detailed documentation
- Terms of Use - Required reading before using API
- API Mailing List - Support and announcements