Table Of Contents
- Big Data, Smart Memory
- Memory Mapping for Huge Arrays
- Chunked Processing for Large Data
- Data Type Optimization
- Lazy Loading and Views
- Streaming Data Processing
- Memory Monitoring
- Best Practices for Large Arrays
- Scale Your Data Processing
Big Data, Smart Memory
When arrays grow beyond available RAM, loading them outright fails with a MemoryError. NumPy's memory-efficient techniques, from memory mapping and chunked processing to careful dtype choices and views, let you work with datasets far larger than physical memory.
Memory Mapping for Huge Arrays
import numpy as np
# Create a memory-mapped array (doesn't load into RAM immediately)
# Shape: 10,000 x 1,000 = 10 million float64 values (~76MB)
mmap_array = np.memmap('huge_data.dat', dtype='float64', mode='w+',
                       shape=(10000, 1000))
# Fill with data (only loads/modifies pages as needed)
mmap_array[0, :100] = np.random.rand(100) # Only loads this page
mmap_array[5000:5010, :] = 42 # Only loads these pages
# Access specific parts without loading everything
first_row_mean = mmap_array[0, :].mean() # Loads only first row
print(f"First row mean: {first_row_mean}")
# Memory-mapped arrays act like regular arrays
subset = mmap_array[100:200, 200:300] # 100x100 slice
result = np.sum(subset) # Computation on subset
# Flush changes to disk explicitly
mmap_array.flush()
del mmap_array  # deleting the object also flushes any pending changes
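The same file can be reopened later without rewriting it. A raw memmap file stores no metadata, so dtype and shape must be supplied again; a minimal sketch, assuming huge_data.dat was created as above:
# Reopen the existing file read-only; dtype and shape must match what was written
existing = np.memmap('huge_data.dat', dtype='float64', mode='r',
                     shape=(10000, 1000))
print(existing[0, :5])            # pages in only the data backing these values
row_mean = existing[123].mean()   # reads only the pages backing this row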
Chunked Processing for Large Data
def process_large_file_in_chunks(filename, chunk_size=1000):
    """Process a large on-disk array in memory-efficient chunks"""
    # Map the file without loading its data
    mmap_data = np.memmap(filename, dtype='float64', mode='r')
    total_size = mmap_data.size
    results = []
    for start in range(0, total_size, chunk_size):
        end = min(start + chunk_size, total_size)
        # Slice only the current chunk (a view into the memmap)
        chunk = mmap_data[start:end]
        # Process the chunk (example: statistical analysis)
        chunk_stats = {
            'mean': chunk.mean(),
            'std': chunk.std(),
            'min': chunk.min(),
            'max': chunk.max()
        }
        results.append(chunk_stats)
        # Temporaries are freed before the next iteration; the OS can evict the pages
    return results
# Usage example
# stats = process_large_file_in_chunks('massive_dataset.dat')
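The same pattern works for out-of-core transformations: read a chunk, transform it, and write the result into a second memory-mapped file. A minimal sketch along those lines; the filenames, element count, and log1p transform are illustrative:
def transform_in_chunks(in_file, out_file, n_elements, chunk_size=100000):
    """Apply an element-wise transform without holding either array in RAM."""
    src = np.memmap(in_file, dtype='float64', mode='r', shape=(n_elements,))
    dst = np.memmap(out_file, dtype='float64', mode='w+', shape=(n_elements,))
    for start in range(0, n_elements, chunk_size):
        end = min(start + chunk_size, n_elements)
        dst[start:end] = np.log1p(np.abs(src[start:end]))  # example transform
    dst.flush()  # push results to disk
# transform_in_chunks('massive_dataset.dat', 'transformed.dat', n_elements=10_000_000)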
Data Type Optimization
# Memory usage comparison
size = 1000000
# Default int64 uses 8 bytes per element
int64_array = np.arange(size, dtype=np.int64)
print(f"int64 memory: {int64_array.nbytes / 1024 / 1024:.1f} MB")
# int32 uses 4 bytes per element (50% savings)
int32_array = np.arange(size, dtype=np.int32)
print(f"int32 memory: {int32_array.nbytes / 1024 / 1024:.1f} MB")
# int16 uses 2 bytes per element (75% savings)
int16_array = np.arange(size, dtype=np.int16)
print(f"int16 memory: {int16_array.nbytes / 1024 / 1024:.1f} MB")
# Choose the smallest type that fits your data range
temperature_data = np.random.randint(-50, 50, 1000000)
# int8 range: -128 to 127 (perfect for temperatures)
temp_optimized = temperature_data.astype(np.int8) # 87.5% memory savings!
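Before downcasting, it is worth confirming that the values actually fit the target type; np.iinfo (or np.finfo for floats) reports each dtype's range. A small sketch of that check:
# Sketch: verify the value range before downcasting
info = np.iinfo(np.int8)  # int8 holds -128..127
if temperature_data.min() >= info.min and temperature_data.max() <= info.max:
    temp_optimized = temperature_data.astype(np.int8)
else:
    temp_optimized = temperature_data.astype(np.int16)  # fall back to a wider type
print(f"Optimized memory: {temp_optimized.nbytes / 1024 / 1024:.1f} MB")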
Lazy Loading and Views
# Create large base array
large_array = np.random.rand(10000, 10000) # ~762 MB
# Views don't copy data (memory efficient)
subarray = large_array[1000:2000, 2000:3000] # View, not copy
transposed = large_array.T # View, not copy
reshaped = large_array.reshape(100000, 1000) # View, not copy
print(f"Original shares memory with view: {np.shares_memory(large_array, subarray)}")
# Operations that force copying (memory expensive)
copied = large_array.copy() # Full copy
modified = large_array + 1 # New array
# Force view when possible
def safe_slice(arr, start_row, end_row):
    """Return a memory-efficient slice"""
    return arr[start_row:end_row]  # Returns a view, not a copy
# Use strides for memory-efficient access patterns
strided_view = np.lib.stride_tricks.sliding_window_view(
    large_array[0], window_shape=100
)  # Memory-efficient sliding windows
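Not every indexing operation returns a view: fancy (integer-array) and boolean indexing always copy. np.shares_memory and the .base attribute make it easy to check what you got:
# Fancy indexing always copies, even when a plain slice would be a view
fancy = large_array[[0, 1, 2], :]            # copy, not a view
print(np.shares_memory(large_array, fancy))  # False
print(subarray.base is large_array)          # True: a sliced view keeps its parent in .base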
Streaming Data Processing
class ArrayStreamer:
    """Process arrays too large for memory in a streaming fashion"""

    def __init__(self, filename, dtype, shape, chunk_size=1000):
        self.mmap = np.memmap(filename, dtype=dtype, mode='r', shape=shape)
        self.chunk_size = chunk_size
        self.shape = shape

    def stream_chunks(self):
        """Generator that yields row chunks without loading the full array"""
        rows, cols = self.shape
        for start_row in range(0, rows, self.chunk_size):
            end_row = min(start_row + self.chunk_size, rows)
            yield self.mmap[start_row:end_row]

    def compute_statistics(self):
        """Compute global statistics without loading the full array"""
        n_samples = 0
        running_sum = 0
        running_sum_sq = 0
        min_val = float('inf')
        max_val = float('-inf')
        for chunk in self.stream_chunks():
            n_samples += chunk.size
            running_sum += chunk.sum()
            running_sum_sq += (chunk ** 2).sum()
            min_val = min(min_val, chunk.min())
            max_val = max(max_val, chunk.max())
        mean = running_sum / n_samples
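        # Note: E[x^2] - E[x]^2 can lose precision when the mean is large
        # relative to the spread; Welford's online algorithm is a more
        # numerically stable choice for streaming variance.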
        variance = (running_sum_sq / n_samples) - (mean ** 2)
        std = np.sqrt(variance)
        return {
            'mean': mean,
            'std': std,
            'min': min_val,
            'max': max_val,
            'count': n_samples
        }
# Usage
# streamer = ArrayStreamer('huge_file.dat', np.float64, (100000, 1000))
# stats = streamer.compute_statistics()
Memory Monitoring
import psutil
import os
def monitor_memory_usage():
    """Return current memory usage of this process in MB"""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    return {
        'rss': memory_info.rss / 1024 / 1024,  # resident set size, MB
        'vms': memory_info.vms / 1024 / 1024   # virtual memory size, MB
    }
# Before creating large array
before = monitor_memory_usage()
print(f"Memory before: {before['rss']:.1f} MB")
# Create memory-mapped array (minimal memory impact)
mmap_array = np.memmap('test.dat', dtype='float64', mode='w+',
                       shape=(10000, 1000))
after = monitor_memory_usage()
print(f"Memory after mmap: {after['rss']:.1f} MB")
print(f"Memory increase: {after['rss'] - before['rss']:.1f} MB")
Best Practices for Large Arrays
- Use memory mapping for arrays larger than available RAM
- Optimize data types - use smallest type that fits your data
- Process in chunks rather than loading entire arrays
- Prefer views over copies when possible
- Monitor memory usage to detect memory leaks (a sketch combining these practices follows below)
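A minimal sketch combining several of these practices; the filename, dtype, and shape are illustrative assumptions, and monitor_memory_usage is the helper defined above.
# Hypothetical sensor log: memmap + small dtype + chunked, out-of-core reduction
data = np.memmap('sensor_log.dat', dtype=np.float32, mode='r',
                 shape=(1_000_000, 64))
chunk_rows = 10_000
col_totals = np.zeros(64, dtype=np.float64)      # accumulate in float64 for accuracy
for start in range(0, data.shape[0], chunk_rows):
    chunk = data[start:start + chunk_rows]       # view into the memmap, no copy
    col_totals += chunk.sum(axis=0, dtype=np.float64)
col_means = col_totals / data.shape[0]
print(f"RSS during run: {monitor_memory_usage()['rss']:.1f} MB")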
Scale Your Data Processing
Master distributed computing with Dask, explore parallel processing techniques, and learn high-performance computing strategies.