Working with JSON Data
What is JSON?
JSON (JavaScript Object Notation) represents structured data using basic types: objects (dictionaries), arrays (lists), strings, numbers, booleans, and null. For example:
{
  "name": "ResNet-50",
  "parameters": 25600000,
  "layers": ["conv", "batch_norm", "relu", "pool"],
  "pretrained": true,
  "metadata": {
    "framework": "PyTorch",
    "version": "1.12.0"
  }
}
JSON maps directly to Python data structures:
- JSON object → Python dict
- JSON array → Python list
- JSON string → Python str
- JSON number → Python int/float
- JSON boolean → Python bool
- JSON null → Python None
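A quick way to see this mapping in action is to parse a trimmed version of the example above and inspect the resulting types:
import json

doc = '{"name": "ResNet-50", "parameters": 25600000, "pretrained": true, "metadata": null}'
parsed = json.loads(doc)
print(type(parsed))                # <class 'dict'>
print(type(parsed['name']))        # <class 'str'>
print(type(parsed['parameters']))  # <class 'int'>
print(type(parsed['pretrained']))  # <class 'bool'>
print(parsed['metadata'])          # None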
Python’s json Module
The json module converts between JSON strings and Python objects:
import json
# Parse JSON string to Python
json_string = '{"model": "GPT-4", "tokens": 8192, "active": true}'
data = json.loads(json_string)
print(data['model']) # 'GPT-4'
print(type(data)) # <class 'dict'>
# Convert Python to JSON string
python_data = {
    'experiment': 'classification',
    'accuracy': 0.943,
    'epochs': 50
}
json_output = json.dumps(python_data)
print(json_output)  # '{"experiment": "classification", ...}'
Key functions:
- json.loads(): parse JSON from a string
- json.dumps(): serialize Python to a JSON string
- json.load(): parse JSON from a file
- json.dump(): write JSON to a file
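These functions pair up cleanly: dumps followed by loads returns an equal object for the supported types. One wrinkle worth knowing is that tuples are serialized as JSON arrays, so they come back as lists:
original = {'tags': ('cv', 'nlp'), 'score': 0.95}
restored = json.loads(json.dumps(original))
print(restored)  # {'tags': ['cv', 'nlp'], 'score': 0.95}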
Reading and Writing JSON Files
Working with JSON files:
import json
# Read JSON from file
with open('data.json', 'r') as f:
    data = json.load(f)

# Write JSON to file
output_data = {
    'results': [1, 2, 3],
    'timestamp': '2024-01-15T10:30:00Z'
}
with open('output.json', 'w') as f:
    json.dump(output_data, f, indent=2)
The indent parameter creates readable formatted output:
# Compact (no indent)
json.dumps(data)
# {"name": "test", "values": [1, 2, 3]}
# Formatted (indent=2)
json.dumps(data, indent=2)
# {
# "name": "test",
# "values": [1, 2, 3]
# }
Handling Missing Keys
Use .get() to handle optional fields safely:
def extract_paper_info(paper_dict):
    """Extract info with defaults for missing fields."""
    return {
        'id': paper_dict.get('id', 'unknown'),
        'title': paper_dict.get('title', 'Untitled'),
        'year': paper_dict.get('year', None),
        'author_count': len(paper_dict.get('authors', [])),
        'abstract': paper_dict.get('abstract', '')
    }

# Works even with missing fields
minimal_paper = {'id': '123', 'title': 'Test'}
info = extract_paper_info(minimal_paper)
# {'id': '123', 'title': 'Test', 'year': None, 'author_count': 0, ...}
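The same idea extends to nested objects: chain .get() calls with an empty dict as the fallback at each level, so a missing intermediate key returns a default instead of raising KeyError:
paper = {'id': '123', 'metadata': {'framework': 'PyTorch'}}

# Falls back to {} at the first level, then to 'unknown' at the second
framework = paper.get('metadata', {}).get('framework', 'unknown')
version = paper.get('metadata', {}).get('version', 'unknown')
print(framework)  # 'PyTorch'
print(version)    # 'unknown'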
Building JSON Structures
Create complex JSON programmatically:
import json
from datetime import datetime, timezone
def create_response(urls, results):
    """Build structured JSON response."""
    response = {
        'metadata': {
            'total_urls': len(urls),
            'successful': sum(1 for r in results if r['success']),
            'failed': sum(1 for r in results if not r['success']),
            'timestamp': datetime.now(timezone.utc).isoformat()
        },
        'results': []
    }
    for url, result in zip(urls, results):
        response['results'].append({
            'url': url,
            'status': result.get('status'),
            'size_bytes': result.get('size', 0),
            'error': result.get('error')
        })
    return response

# Generate and save (example inputs; real code would collect these from actual fetches)
urls = ['https://example.com/a', 'https://example.com/b']
fetch_results = [
    {'success': True, 'status': 200, 'size': 1024},
    {'success': False, 'status': 404, 'error': 'Not Found'},
]
data = create_response(urls, fetch_results)
with open('report.json', 'w') as f:
    json.dump(data, f, indent=2)
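One serialization detail worth noting: json.dumps raises TypeError on datetime objects, which is why create_response converts its timestamp with isoformat() first. As an alternative, the default parameter supplies a fallback converter for types json can't serialize natively:
import json
from datetime import datetime, timezone

record = {'run_at': datetime.now(timezone.utc)}
# default=str is applied to any value the encoder can't handle itself
print(json.dumps(record, default=str))
# e.g. {"run_at": "2024-01-15 10:30:00.123456+00:00"}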
Processing JSON Arrays
Common patterns for working with JSON arrays:
# Load array of objects
with open('papers.json') as f:
    papers = json.load(f)

# Filter and transform
ml_papers = [
    {
        'id': p['id'],
        'title': p['title'],
        'year': p.get('year', 'unknown')
    }
    for p in papers
    if 'machine learning' in p.get('abstract', '').lower()
]

# Aggregate statistics
total_citations = sum(p.get('citations', 0) for p in papers)
papers_with_code = [p for p in papers if p.get('has_code', False)]

# Group by category
from collections import defaultdict
by_category = defaultdict(list)
for paper in papers:
    for cat in paper.get('categories', []):
        by_category[cat].append(paper['id'])
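The grouped result can be written straight back out; since defaultdict is a dict subclass, json.dump accepts it without conversion:
# Save the grouping (sort_keys gives stable, diff-friendly output)
with open('by_category.json', 'w') as f:
    json.dump(by_category, f, indent=2, sort_keys=True)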
Validation and Error Handling
Handle malformed JSON gracefully:
import json
def parse_json_safely(json_string):
    """Parse JSON with error handling."""
    try:
        data = json.loads(json_string)
        return data, None
    except json.JSONDecodeError as e:
        return None, f"Invalid JSON at line {e.lineno}, column {e.colno}: {e.msg}"
    except Exception as e:
        return None, f"Unexpected error: {e}"

# Test with various inputs
valid = '{"key": "value"}'
invalid = '{"key": value}'     # Missing quotes
malformed = '{"key": "value"'  # Unclosed brace
for test_input in [valid, invalid, malformed]:
    data, error = parse_json_safely(test_input)
    if error:
        print(f"Error: {error}")
    else:
        print(f"Parsed: {data}")
Working with Large JSON Files
For memory-efficient processing of large JSON arrays:
import json
def process_json_stream(filepath, process_func):
    """Process top-level JSON array items one at a time.

    Tracks nesting depth and string state, so commas and brackets
    inside nested values or string literals are not mistaken for
    item boundaries.
    """
    decoder = json.JSONDecoder()
    with open(filepath, 'r') as f:
        # Skip leading whitespace and expect the array's opening bracket
        char = f.read(1)
        while char.isspace():
            char = f.read(1)
        if char != '[':
            raise ValueError("Expected JSON array")

        buffer = ''
        depth = 0          # nesting depth within the current item
        in_string = False  # inside a JSON string literal?
        escaped = False    # previous character was a backslash?
        while True:
            char = f.read(1)
            if not char:
                break  # unexpected end of file
            if in_string:
                buffer += char
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    in_string = False
            elif char == '"':
                in_string = True
                buffer += char
            elif char in '{[':
                depth += 1
                buffer += char
            elif char in '}]' and depth > 0:
                depth -= 1
                buffer += char
            elif depth == 0 and char in ',]':
                # A comma or closing bracket at depth 0 ends the current item
                item = buffer.strip()
                if item:
                    process_func(decoder.decode(item))
                buffer = ''
                if char == ']':
                    break  # end of the top-level array
            else:
                buffer += char
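A quick way to try it: write a small array to disk, then stream it back one item at a time. (For production workloads, a dedicated streaming parser such as the third-party ijson package is likely a better fit than hand-rolled code.)
# Write a small test array, then process it item by item
sample = [{'id': i, 'tags': ['a', 'b']} for i in range(3)]
with open('items.json', 'w') as f:
    json.dump(sample, f)

process_json_stream('items.json', lambda item: print(item['id']))
# Prints 0, 1, 2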
Command-Line JSON with jq
The jq tool processes JSON from the command line:
# Pretty-print JSON
cat data.json | jq '.'
# Extract specific field
cat papers.json | jq '.results[0].title'
# Filter array
cat papers.json | jq '.results[] | select(.year == 2024)'
# Extract multiple fields
cat papers.json | jq '.results[] | {title: .title, id: .id}'
# Count array items
cat papers.json | jq '.results | length'
# Get all unique categories
cat papers.json | jq '[.results[].categories[]] | unique'
Python equivalents of common jq operations:
import json
# Load data
with open('papers.json') as f:
    data = json.load(f)
# jq: '.results[0].title'
title = data['results'][0]['title']
# jq: '.results[] | select(.year == 2024)'
papers_2024 = [p for p in data['results'] if p.get('year') == 2024]
# jq: '.results | length'
count = len(data['results'])
# jq: '[.results[].categories[]] | unique'
all_categories = set()
for paper in data['results']:
    all_categories.update(paper.get('categories', []))
unique_categories = sorted(all_categories)
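The field-projection example from the jq list above maps to a list comprehension:
# jq: '.results[] | {title: .title, id: .id}'
projected = [{'title': p['title'], 'id': p['id']} for p in data['results']]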
Complete Example: Processing API Response
import json
import urllib.request
from datetime import datetime, timezone
def fetch_and_process_arxiv(query, max_results=10):
    """Fetch ArXiv data and convert to JSON."""
    # This would normally fetch with urllib and parse XML; simplified here
    papers = []  # ... fetch and parse ...

    # Build JSON structure
    output = {
        'query': query,
        'fetched_at': datetime.now(timezone.utc).isoformat(),
        'total_results': len(papers),
        'papers': []
    }
    for paper in papers:
        output['papers'].append({
            'id': paper.get('id'),
            'title': paper.get('title'),
            'authors': paper.get('authors', []),
            'abstract': paper.get('abstract'),
            'categories': paper.get('categories', []),
            'metrics': {
                'abstract_length': len(paper.get('abstract', '')),
                'author_count': len(paper.get('authors', [])),
                'category_count': len(paper.get('categories', []))
            }
        })
    return output

# Save results
results = fetch_and_process_arxiv('cat:cs.LG', max_results=5)
with open('arxiv_results.json', 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
Method Reference
# Parsing
data = json.loads(string) # String to Python
data = json.load(file) # File to Python
# Serializing
string = json.dumps(obj) # Python to string
json.dump(obj, file) # Python to file
# Formatting options
json.dumps(obj, indent=2) # Pretty-print
json.dumps(obj, sort_keys=True) # Sort keys
json.dumps(obj, ensure_ascii=False) # Allow Unicode
# Safe access
value = dict.get('key', default) # With default
value = dict.get('key', {}).get('nested') # Nested with default
# jq basics (command line)
jq '.' # Pretty-print
jq '.field' # Extract field
jq '.array[]' # Array elements
jq 'length' # Count items