HTTP Requests with urllib
Making Basic Requests
Python’s urllib.request module handles HTTP requests without external dependencies. The simplest request fetches a URL and reads the response:
import urllib.request

# Fetch a webpage
with urllib.request.urlopen('http://httpbin.org/html') as response:
    html_content = response.read()
    print(f"Status: {response.status}")
    print(f"Content length: {len(html_content)} bytes")

The urlopen() function returns a response object that works like a file. Always use with to ensure the connection closes properly.
Response Objects
The response object provides both data and metadata:
with urllib.request.urlopen('http://httpbin.org/json') as response:
    # Read response body
    data = response.read()  # Returns bytes

    # Response metadata
    status_code = response.status  # 200, 404, etc.
    headers = response.headers  # Dictionary-like object
    content_type = response.headers.get('Content-Type')

    # Convert bytes to string
    text = data.decode('utf-8')

Key response attributes:
.read() - Get response body as bytes
.status - HTTP status code
.headers - Response headers
.url - Final URL (after redirects)
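The .url attribute is handy when a request may be redirected, since it reports the final destination. A quick sketch using the httpbin.org redirect endpoint:

import urllib.request

# The server redirects once; .url shows where we ended up
with urllib.request.urlopen('http://httpbin.org/redirect/1') as response:
    print(f"Final URL: {response.url}")
    print(f"Status: {response.status}")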
Handling Timeouts
Network requests can hang indefinitely. Always set a timeout:
import urllib.request
import urllib.error

url = 'http://httpbin.org/delay/5'

try:
    # Timeout after 3 seconds
    with urllib.request.urlopen(url, timeout=3) as response:
        data = response.read()
except urllib.error.URLError as e:
    print(f"Request failed: {e}")

The timeout applies to each blocking socket operation (the connection attempt and each individual read), not to the total download time.
Error Handling
Different failures call for different handling. Because HTTPError is a subclass of URLError, catch it first so HTTP status errors are not swallowed by the more general handler:
import urllib.request
import urllib.error

def fetch_url(url):
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            return response.read(), response.status
    except urllib.error.HTTPError as e:
        # HTTP errors (404, 500, etc.)
        print(f"HTTP Error {e.code}: {e.reason}")
        return None, e.code
    except urllib.error.URLError as e:
        # Network errors (DNS, connection refused)
        print(f"URL Error: {e.reason}")
        return None, None
    except Exception as e:
        # Other errors
        print(f"Unexpected error: {e}")
        return None, None

# Test with various URLs
data, status = fetch_url('http://httpbin.org/status/404')
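An HTTPError is itself a file-like response, so you can read the body the server sent along with the error status. A small sketch, assuming the error page is worth inspecting:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen('http://httpbin.org/status/404', timeout=10)
except urllib.error.HTTPError as e:
    # The exception carries the status, headers, and body of the error response
    error_body = e.read()
    print(f"HTTP {e.code}: {e.reason}, body was {len(error_body)} bytes")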
Measuring Response Time
Track how long requests take:
import time
import urllib.request

def timed_request(url):
    start_time = time.time()
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = response.read()
            elapsed_ms = (time.time() - start_time) * 1000
            return {
                'status': response.status,
                'size': len(data),
                'time_ms': elapsed_ms
            }
    except Exception as e:
        elapsed_ms = (time.time() - start_time) * 1000
        return {
            'error': str(e),
            'time_ms': elapsed_ms
        }

# Example usage
result = timed_request('http://httpbin.org/delay/1')
print(f"Response time: {result['time_ms']:.2f} ms")
Working with Headers
Check response headers to determine content type:
def fetch_with_type_check(url):
    with urllib.request.urlopen(url) as response:
        content_type = response.headers.get('Content-Type', '')
        data = response.read()

        # Check if response is text
        if 'text' in content_type or 'json' in content_type:
            text = data.decode('utf-8')
            return text, 'text'
        else:
            return data, 'binary'

# Fetch and identify content
content, content_format = fetch_with_type_check('http://httpbin.org/json')
print(f"Content format: {content_format}")
Processing Multiple URLs
Pattern for fetching multiple URLs with error recovery:
import urllib.request
import urllib.error

def fetch_multiple(urls, timeout=10):
    results = []
    for url in urls:
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                results.append({
                    'url': url,
                    'status': response.status,
                    'size': len(response.read()),
                    'success': True
                })
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            results.append({
                'url': url,
                'error': str(e),
                'success': False
            })
    return results

# Process a list of URLs
urls = [
    'http://httpbin.org/status/200',
    'http://httpbin.org/status/404',
    'http://invalid.url.example'
]
results = fetch_multiple(urls)
for r in results:
    if r['success']:
        print(f"✓ {r['url']}: {r['status']}")
    else:
        print(f"✗ {r['url']}: {r['error']}")
Testing with httpbin.org
httpbin.org provides endpoints for testing HTTP operations:
# Common test endpoints
test_urls = {
    'success': 'http://httpbin.org/status/200',
    'not_found': 'http://httpbin.org/status/404',
    'server_error': 'http://httpbin.org/status/500',
    'delay': 'http://httpbin.org/delay/2',
    'json': 'http://httpbin.org/json',
    'html': 'http://httpbin.org/html',
    'redirect': 'http://httpbin.org/redirect/1'
}

# Test different scenarios
for name, url in test_urls.items():
    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            print(f"{name}: Status {response.status}")
    except Exception as e:
        print(f"{name}: Error - {e}")
Complete Example: URL Fetcher
Putting it all together:
import urllib.request
import urllib.error
import json
import time
from datetime import datetime, timezone

def process_url_list(url_file_path, output_dir):
    """Fetch URLs and generate report."""
    # Read URLs from file
    with open(url_file_path, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    results = []
    for url in urls:
        start_time = time.time()
        try:
            with urllib.request.urlopen(url, timeout=10) as response:
                content = response.read()
                elapsed_ms = (time.time() - start_time) * 1000

                # Check if content is text
                content_type = response.headers.get('Content-Type', '')
                word_count = None
                if 'text' in content_type:
                    text = content.decode('utf-8', errors='ignore')
                    words = text.split()
                    word_count = len(words)

                results.append({
                    'url': url,
                    'status_code': response.status,
                    'response_time_ms': elapsed_ms,
                    'content_length': len(content),
                    'word_count': word_count,
                    'timestamp': datetime.now(timezone.utc).isoformat(),
                    'error': None
                })
        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            results.append({
                'url': url,
                'status_code': None,
                'response_time_ms': elapsed_ms,
                'content_length': 0,
                'word_count': None,
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'error': str(e)
            })

    # Save results
    output_path = f"{output_dir}/responses.json"
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)

    return results
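A quick usage sketch for the function above; the urls.txt file and reports directory are placeholder names, not part of the function itself:

import os

# Hypothetical input file: one URL per line
with open('urls.txt', 'w') as f:
    f.write('http://httpbin.org/json\nhttp://httpbin.org/status/404\n')

os.makedirs('reports', exist_ok=True)
results = process_url_list('urls.txt', 'reports')
print(f"Fetched {len(results)} URLs; report saved to reports/responses.json")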
Method Reference
# Basic operations
response = urllib.request.urlopen(url) # Open URL
data = response.read() # Read bytes
text = data.decode('utf-8') # Convert to string
response.status # Get status code
response.headers['Content-Type'] # Get header
# With timeout
response = urllib.request.urlopen(url, timeout=10)
# Error handling
try:
    response = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:  # HTTP errors
    print(f"HTTP {e.code}: {e.reason}")
except urllib.error.URLError as e:  # Network errors
    print(f"Network error: {e.reason}")