Working with JSON Data

What is JSON?

JSON (JavaScript Object Notation) represents structured data using basic types: objects (dictionaries), arrays (lists), strings, numbers, booleans, and null.

{
  "name": "ResNet-50",
  "parameters": 25600000,
  "layers": ["conv", "batch_norm", "relu", "pool"],
  "pretrained": true,
  "metadata": {
    "framework": "PyTorch",
    "version": "1.12.0"
  }
}

JSON maps directly to Python data structures:

  • JSON object → Python dict
  • JSON array → Python list
  • JSON string → Python str
  • JSON number → Python int/float
  • JSON boolean → Python bool
  • JSON null → Python None
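
To see the mapping in action, parse a string that uses every JSON type and inspect the resulting Python types:

import json

doc = '{"obj": {}, "arr": [], "text": "hi", "num": 1.5, "flag": true, "empty": null}'
for key, value in json.loads(doc).items():
    print(key, '->', type(value).__name__)
# obj -> dict, arr -> list, text -> str,
# num -> float, flag -> bool, empty -> NoneType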

Python’s json Module

The json module converts between JSON strings and Python objects:

import json

# Parse JSON string to Python
json_string = '{"model": "GPT-4", "tokens": 8192, "active": true}'
data = json.loads(json_string)
print(data['model'])  # 'GPT-4'
print(type(data))     # <class 'dict'>

# Convert Python to JSON string
python_data = {
    'experiment': 'classification',
    'accuracy': 0.943,
    'epochs': 50
}
json_output = json.dumps(python_data)
print(json_output)  # '{"experiment": "classification", ...}'

Key functions:

  • json.loads() - Parse JSON string
  • json.dumps() - Create JSON string
  • json.load() - Parse JSON from file
  • json.dump() - Write JSON to file
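
The s-suffixed functions work on strings; the others work on file objects. For JSON-serializable data, dumps() and loads() round-trip cleanly:

import json

config = {'lr': 0.001, 'layers': [64, 32], 'dropout': None}
assert json.loads(json.dumps(config)) == config

One caveat: non-string dictionary keys are converted to strings during serialization, so they do not round-trip unchanged.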

Reading and Writing JSON Files

Working with JSON files:

import json

# Read JSON from file
with open('data.json', 'r') as f:
    data = json.load(f)

# Write JSON to file
output_data = {
    'results': [1, 2, 3],
    'timestamp': '2024-01-15T10:30:00Z'
}

with open('output.json', 'w') as f:
    json.dump(output_data, f, indent=2)

The indent parameter produces human-readable, formatted output:

# Compact (no indent)
json.dumps(data)
# {"name": "test", "values": [1, 2, 3]}

# Formatted (indent=2)
json.dumps(data, indent=2)
# {
#   "name": "test",
#   "values": [1, 2, 3]
# }
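
Two other commonly used options: sort_keys=True produces deterministic key order (useful for diffs and hashing), and ensure_ascii=False emits non-ASCII characters directly instead of \uXXXX escapes:

print(json.dumps({'b': 1, 'a': 2}, sort_keys=True))
# {"a": 2, "b": 1}

print(json.dumps({'city': 'Zürich'}, ensure_ascii=False))
# {"city": "Zürich"}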

Handling Missing Keys

Use .get() to handle optional fields safely:

def extract_paper_info(paper_dict):
    """Extract info with defaults for missing fields."""
    
    return {
        'id': paper_dict.get('id', 'unknown'),
        'title': paper_dict.get('title', 'Untitled'),
        'year': paper_dict.get('year', None),
        'author_count': len(paper_dict.get('authors', [])),
        'abstract': paper_dict.get('abstract', '')
    }

# Works even with missing fields
minimal_paper = {'id': '123', 'title': 'Test'}
info = extract_paper_info(minimal_paper)
# {'id': '123', 'title': 'Test', 'year': None, 'author_count': 0, ...}

Building JSON Structures

Create complex JSON programmatically:

import json
from datetime import datetime, timezone

def create_response(urls, results):
    """Build structured JSON response."""
    
    response = {
        'metadata': {
            'total_urls': len(urls),
            'successful': sum(1 for r in results if r['success']),
            'failed': sum(1 for r in results if not r['success']),
            'timestamp': datetime.now(timezone.utc).isoformat()
        },
        'results': []
    }
    
    for url, result in zip(urls, results):
        response['results'].append({
            'url': url,
            'status': result.get('status'),
            'size_bytes': result.get('size', 0),
            'error': result.get('error')
        })
    
    return response

# Generate and save (urls and fetch_results are assumed to come
# from an earlier fetch step)
data = create_response(urls, fetch_results)
with open('report.json', 'w') as f:
    json.dump(data, f, indent=2)

Processing JSON Arrays

Common patterns for working with JSON arrays:

# Load array of objects
with open('papers.json') as f:
    papers = json.load(f)

# Filter and transform
ml_papers = [
    {
        'id': p['id'],
        'title': p['title'],
        'year': p.get('year', 'unknown')
    }
    for p in papers
    if 'machine learning' in p.get('abstract', '').lower()
]

# Aggregate statistics
total_citations = sum(p.get('citations', 0) for p in papers)
papers_with_code = [p for p in papers if p.get('has_code', False)]

# Group by category
from collections import defaultdict

by_category = defaultdict(list)
for paper in papers:
    for cat in paper.get('categories', []):
        by_category[cat].append(paper['id'])
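
To summarize the grouping, sort categories by how many papers they contain:

for cat, ids in sorted(by_category.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(f'{cat}: {len(ids)} papers')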

Validation and Error Handling

Handle malformed JSON gracefully:

import json

def parse_json_safely(json_string):
    """Parse JSON with error handling."""
    
    try:
        data = json.loads(json_string)
        return data, None
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON at line {e.lineno}, column {e.colno}: {e.msg}"
    except Exception as e:
        return None, f"Unexpected error: {e}"

# Test with various inputs
valid = '{"key": "value"}'
invalid = '{"key": value}'  # Missing quotes
malformed = '{"key": "value"'  # Unclosed brace

for test_input in [valid, invalid, malformed]:
    data, error = parse_json_safely(test_input)
    if error:
        print(f"Error: {error}")
    else:
        print(f"Parsed: {data}")

Working with Large JSON Files

json.load() reads the entire file into memory at once. For large JSON arrays, parse one item at a time with JSONDecoder.raw_decode(), which decodes a single value from the front of a string and reports where it ended:

import json

def process_json_stream(filepath, process_func, chunk_size=65536):
    """Process items of a top-level JSON array one at a time.

    Reliable when the items are objects, arrays, or strings; a bare
    number split exactly at a chunk boundary could be misparsed.
    """
    decoder = json.JSONDecoder()

    with open(filepath, 'r') as f:
        buffer = f.read(chunk_size).lstrip()
        if not buffer.startswith('['):
            raise ValueError("Expected JSON array")
        buffer = buffer[1:]  # Consume the opening bracket

        while True:
            # Skip whitespace and the comma between items
            buffer = buffer.lstrip().lstrip(',').lstrip()
            if buffer.startswith(']'):
                break  # End of array
            try:
                # Decode one value; 'end' is the index just past it
                obj, end = decoder.raw_decode(buffer)
            except json.JSONDecodeError:
                # The item is split across a chunk boundary; read more
                chunk = f.read(chunk_size)
                if not chunk:
                    raise  # Truncated or malformed input
                buffer += chunk
                continue
            process_func(obj)
            buffer = buffer[end:]
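
A usage sketch, assuming papers.json holds a large top-level array of paper objects:

process_json_stream('papers.json', lambda paper: print(paper.get('id')))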

Command-Line JSON with jq

The jq tool processes JSON from the command line:

# Pretty-print JSON
cat data.json | jq '.'

# Extract specific field
cat papers.json | jq '.results[0].title'

# Filter array
cat papers.json | jq '.results[] | select(.year == 2024)'

# Extract multiple fields
cat papers.json | jq '.results[] | {title: .title, id: .id}'

# Count array items
cat papers.json | jq '.results | length'

# Get all unique categories
cat papers.json | jq '[.results[].categories[]] | unique'

Python equivalent of common jq operations:

import json

# Load data
with open('papers.json') as f:
    data = json.load(f)

# jq: '.results[0].title'
title = data['results'][0]['title']

# jq: '.results[] | select(.year == 2024)'
papers_2024 = [p for p in data['results'] if p.get('year') == 2024]

# jq: '.results | length'
count = len(data['results'])

# jq: '[.results[].categories[]] | unique'
all_categories = set()
for paper in data['results']:
    all_categories.update(paper.get('categories', []))
unique_categories = sorted(all_categories)
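
If jq is installed, you can also drive it from Python with the standard subprocess module and parse its output back into Python objects — a sketch:

import json
import subprocess

# Run a jq filter over papers.json and capture its stdout
result = subprocess.run(
    ['jq', '[.results[] | select(.year == 2024)]', 'papers.json'],
    capture_output=True, text=True, check=True,
)
papers_2024 = json.loads(result.stdout)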

Complete Example: Processing API Response

import json
from datetime import datetime, timezone

def fetch_and_process_arxiv(query, max_results=10):
    """Fetch ArXiv data and convert to JSON."""
    
    # A real implementation would fetch via urllib.request and parse the
    # Atom XML feed, capping the request at max_results; simplified here
    papers = []  # ... fetch and parse ...
    
    # Build JSON structure
    output = {
        'query': query,
        'fetched_at': datetime.now(timezone.utc).isoformat(),
        'total_results': len(papers),
        'papers': []
    }
    
    for paper in papers:
        output['papers'].append({
            'id': paper.get('id'),
            'title': paper.get('title'),
            'authors': paper.get('authors', []),
            'abstract': paper.get('abstract'),
            'categories': paper.get('categories', []),
            'metrics': {
                'abstract_length': len(paper.get('abstract', '')),
                'author_count': len(paper.get('authors', [])),
                'category_count': len(paper.get('categories', []))
            }
        })
    
    return output

# Save results
results = fetch_and_process_arxiv('cat:cs.LG', max_results=5)
with open('arxiv_results.json', 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

Method Reference

# Parsing
data = json.loads(string)                # String to Python
data = json.load(file)                   # File to Python

# Serializing  
string = json.dumps(obj)                 # Python to string
json.dump(obj, file)                     # Python to file

# Formatting options
json.dumps(obj, indent=2)                # Pretty-print
json.dumps(obj, sort_keys=True)          # Sort keys
json.dumps(obj, ensure_ascii=False)      # Emit non-ASCII unescaped

# Safe access
value = data.get('key', default)         # With default
value = data.get('key', {}).get('nested') # Nested with default

# jq basics (command line)
jq '.'                                    # Pretty-print
jq '.field'                               # Extract field
jq '.array[]'                             # Array elements
jq 'length'                               # Count items