HTTP Requests with urllib
Making Basic Requests
Python’s urllib.request module handles HTTP requests without external dependencies. The simplest request fetches a URL and reads the response:
import urllib.request

# Fetch a webpage
with urllib.request.urlopen('http://httpbin.org/html') as response:
    html_content = response.read()
    print(f"Status: {response.status}")
    print(f"Content length: {len(html_content)} bytes")

The urlopen() function returns a response object that works like a file. Always use with to ensure the connection closes properly.
Response Objects
The response object provides both data and metadata:
with urllib.request.urlopen('http://httpbin.org/json') as response:
    # Read response body
    data = response.read()  # Returns bytes

    # Response metadata
    status_code = response.status  # 200, 404, etc.
    headers = response.headers  # Dictionary-like object
    content_type = response.headers.get('Content-Type')

    # Convert bytes to string
    text = data.decode('utf-8')

Key response attributes:
.read() - Get response body as bytes
.status - HTTP status code
.headers - Response headers
.url - Final URL (after redirects)
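The .url attribute is handy when a request may be redirected, since it reports the final destination. A quick sketch using the httpbin.org redirect endpoint:

import urllib.request

# The server redirects once; .url shows where we ended up
with urllib.request.urlopen('http://httpbin.org/redirect/1') as response:
    print(f"Final URL: {response.url}")
    print(f"Status: {response.status}")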
Handling Timeouts
Network requests can hang indefinitely. Always set a timeout:
import urllib.request
import urllib.error

url = 'http://httpbin.org/delay/5'

try:
    # Timeout after 3 seconds
    with urllib.request.urlopen(url, timeout=3) as response:
        data = response.read()
except urllib.error.URLError as e:
    print(f"Request failed: {e}")

The timeout applies to each blocking socket operation (the connection attempt and each individual read), not to the total download time.
Error Handling
Different failures call for different handling. Because HTTPError is a subclass of URLError, catch it first so HTTP status errors are not swallowed by the more general handler:
import urllib.request
import urllib.error

def fetch_url(url):
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            return response.read(), response.status
    except urllib.error.HTTPError as e:
        # HTTP errors (404, 500, etc.)
        print(f"HTTP Error {e.code}: {e.reason}")
        return None, e.code
    except urllib.error.URLError as e:
        # Network errors (DNS, connection refused)
        print(f"URL Error: {e.reason}")
        return None, None
    except Exception as e:
        # Other errors
        print(f"Unexpected error: {e}")
        return None, None

# Test with various URLs
data, status = fetch_url('http://httpbin.org/status/404')
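An HTTPError is itself a file-like response, so you can read the body the server sent along with the error status. A small sketch, assuming the error page is worth inspecting:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen('http://httpbin.org/status/404', timeout=10)
except urllib.error.HTTPError as e:
    # The exception carries the status, headers, and body of the error response
    error_body = e.read()
    print(f"HTTP {e.code}: {e.reason}, body was {len(error_body)} bytes")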
Measuring Response Time
Track how long requests take:
import time
import urllib.request

def timed_request(url):
    start_time = time.time()
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = response.read()
            elapsed_ms = (time.time() - start_time) * 1000
            return {
                'status': response.status,
                'size': len(data),
                'time_ms': elapsed_ms
            }
    except Exception as e:
        elapsed_ms = (time.time() - start_time) * 1000
        return {
            'error': str(e),
            'time_ms': elapsed_ms
        }

# Example usage
result = timed_request('http://httpbin.org/delay/1')
print(f"Response time: {result['time_ms']:.2f} ms")
Working with Headers
Check response headers to determine content type:
def fetch_with_type_check(url):
    with urllib.request.urlopen(url) as response:
        content_type = response.headers.get('Content-Type', '')
        data = response.read()

        # Check if response is text
        if 'text' in content_type or 'json' in content_type:
            text = data.decode('utf-8')
            return text, 'text'
        else:
            return data, 'binary'

# Fetch and identify content
content, content_format = fetch_with_type_check('http://httpbin.org/json')
print(f"Content format: {content_format}")
Processing Multiple URLs
Pattern for fetching multiple URLs with error recovery:
import urllib.request
import urllib.error

def fetch_multiple(urls, timeout=10):
    results = []
    for url in urls:
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                results.append({
                    'url': url,
                    'status': response.status,
                    'size': len(response.read()),
                    'success': True
                })
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            results.append({
                'url': url,
                'error': str(e),
                'success': False
            })
    return results

# Process a list of URLs
urls = [
    'http://httpbin.org/status/200',
    'http://httpbin.org/status/404',
    'http://invalid.url.example'
]
results = fetch_multiple(urls)
for r in results:
    if r['success']:
        print(f"✓ {r['url']}: {r['status']}")
    else:
        print(f"✗ {r['url']}: {r['error']}")
Testing with httpbin.org
httpbin.org provides endpoints for testing HTTP operations:
# Common test endpoints
test_urls = {
    'success': 'http://httpbin.org/status/200',
    'not_found': 'http://httpbin.org/status/404',
    'server_error': 'http://httpbin.org/status/500',
    'delay': 'http://httpbin.org/delay/2',
    'json': 'http://httpbin.org/json',
    'html': 'http://httpbin.org/html',
    'redirect': 'http://httpbin.org/redirect/1'
}

# Test different scenarios
for name, url in test_urls.items():
    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            print(f"{name}: Status {response.status}")
    except Exception as e:
        print(f"{name}: Error - {e}")
Complete Example: URL Fetcher
Putting it all together:
import urllib.request
import urllib.error
import json
import time
from datetime import datetime, timezone

def process_url_list(url_file_path, output_dir):
    """Fetch URLs and generate report."""
    # Read URLs from file
    with open(url_file_path, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    results = []
    for url in urls:
        start_time = time.time()
        try:
            with urllib.request.urlopen(url, timeout=10) as response:
                content = response.read()
                elapsed_ms = (time.time() - start_time) * 1000

                # Check if content is text
                content_type = response.headers.get('Content-Type', '')
                word_count = None
                if 'text' in content_type:
                    text = content.decode('utf-8', errors='ignore')
                    words = text.split()
                    word_count = len(words)

                results.append({
                    'url': url,
                    'status_code': response.status,
                    'response_time_ms': elapsed_ms,
                    'content_length': len(content),
                    'word_count': word_count,
                    'timestamp': datetime.now(timezone.utc).isoformat(),
                    'error': None
                })
        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            results.append({
                'url': url,
                'status_code': None,
                'response_time_ms': elapsed_ms,
                'content_length': 0,
                'word_count': None,
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'error': str(e)
            })

    # Save results
    output_path = f"{output_dir}/responses.json"
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)

    return results
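A quick usage sketch for the function above; the urls.txt file and reports directory are placeholder names, not part of the function itself:

import os

# Hypothetical input file: one URL per line
with open('urls.txt', 'w') as f:
    f.write('http://httpbin.org/json\nhttp://httpbin.org/status/404\n')

os.makedirs('reports', exist_ok=True)
results = process_url_list('urls.txt', 'reports')
print(f"Fetched {len(results)} URLs; report saved to reports/responses.json")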
Method Reference
# Basic operations
response = urllib.request.urlopen(url) # Open URL
data = response.read() # Read bytes
text = data.decode('utf-8') # Convert to string
response.status # Get status code
response.headers['Content-Type'] # Get header
# With timeout
response = urllib.request.urlopen(url, timeout=10)
# Error handling
try:
    response = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:  # HTTP errors
    print(f"HTTP {e.code}: {e.reason}")
except urllib.error.URLError as e:  # Network errors
    print(f"Network error: {e.reason}")