How to Handle Large Arrays Memory Efficiently

Conquer memory limitations with NumPy's memory-efficient techniques: process massive datasets without exhausting your system's RAM

Big Data, Smart Memory

When arrays grow beyond your RAM, loading everything at once simply fails. NumPy's memory-efficient techniques let you work with massive datasets by loading only the pieces you actually need.
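
Before allocating anything, it helps to estimate whether an array will fit in memory at all. A minimal sketch; the shape here is purely illustrative:

import numpy as np

# Estimate the footprint of a hypothetical 50,000 x 20,000 float64 array
shape = (50_000, 20_000)
itemsize = np.dtype(np.float64).itemsize        # 8 bytes per element
estimated_bytes = np.prod(shape) * itemsize
print(f"Estimated size: {estimated_bytes / 1024**3:.1f} GiB")  # ~7.5 GiB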

Memory Mapping for Huge Arrays

import numpy as np

# Create a memory-mapped array (doesn't load into RAM immediately)
# Shape: 10,000 x 1,000 = 10 million float64 values (~76MB)
mmap_array = np.memmap('huge_data.dat', dtype='float64', mode='w+', 
                       shape=(10000, 1000))

# Fill with data (only loads/modifies pages as needed)
mmap_array[0, :100] = np.random.rand(100)  # Only loads this page
mmap_array[5000:5010, :] = 42              # Only loads these pages

# Access specific parts without loading everything
first_row_mean = mmap_array[0, :].mean()   # Loads only first row
print(f"First row mean: {first_row_mean}")

# Memory-mapped arrays act like regular arrays
subset = mmap_array[100:200, 200:300]      # 100x100 slice
result = np.sum(subset)                    # Computation on subset

# Explicitly flush changes to disk, then release the mapping
mmap_array.flush()
del mmap_array
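
To come back to the same data later, reopen the file read-only. A short follow-up sketch, assuming the huge_data.dat file created above; note that a raw .dat file stores no metadata, so the dtype and shape must be supplied again:

# Reopen the existing file read-only; shape and dtype are not stored in the file
readonly = np.memmap('huge_data.dat', dtype='float64', mode='r',
                     shape=(10000, 1000))
print(readonly[5000, 0])   # 42.0, written in the session above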

Chunked Processing for Large Data

def process_large_file_in_chunks(filename, chunk_size=1000):
    """Process large arrays in memory-efficient chunks"""
    
    # Map the whole file without reading it into RAM (flat 1-D view of the values)
    mmap_data = np.memmap(filename, dtype='float64', mode='r')
    total_size = mmap_data.size
    
    results = []
    for start in range(0, total_size, chunk_size):
        end = min(start + chunk_size, total_size)
        
        # Load only current chunk
        chunk = mmap_data[start:end]
        
        # Process chunk (example: statistical analysis)
        chunk_stats = {
            'mean': chunk.mean(),
            'std': chunk.std(),
            'min': chunk.min(),
            'max': chunk.max()
        }
        results.append(chunk_stats)
        
        # chunk is only a view into the memmap; the OS can evict its pages once we move on
        
    return results

# Usage example
# stats = process_large_file_in_chunks('massive_dataset.dat')
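
To try the function end to end, you can first write a small sample file with np.memmap. A minimal sketch; the file name and size are illustrative:

# Create a small sample file, then process it in chunks
sample = np.memmap('sample_data.dat', dtype='float64', mode='w+', shape=(10_000,))
sample[:] = np.random.rand(10_000)
sample.flush()
del sample

stats = process_large_file_in_chunks('sample_data.dat', chunk_size=1000)
print(len(stats), stats[0]['mean'])   # 10 chunks of per-chunk statistics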

Data Type Optimization

# Memory usage comparison
size = 1000000

# Default int64 uses 8 bytes per element
int64_array = np.arange(size, dtype=np.int64)
print(f"int64 memory: {int64_array.nbytes / 1024 / 1024:.1f} MB")

# int32 uses 4 bytes per element (50% savings)
int32_array = np.arange(size, dtype=np.int32)
print(f"int32 memory: {int32_array.nbytes / 1024 / 1024:.1f} MB")

# int16 uses 2 bytes per element (75% savings)
int16_array = np.arange(size, dtype=np.int16)
print(f"int16 memory: {int16_array.nbytes / 1024 / 1024:.1f} MB")

# Choose the smallest type that fits your data range
temperature_data = np.random.randint(-50, 50, 1000000)
# int8 range: -128 to 127 (perfect for temperatures)
temp_optimized = temperature_data.astype(np.int8)  # 87.5% memory savings!
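
Before downcasting, it is worth confirming that the values actually fit the target type. A small sketch using np.iinfo as a guard:

# Verify the data range fits int8 before converting
info = np.iinfo(np.int8)
if temperature_data.min() >= info.min and temperature_data.max() <= info.max:
    temp_optimized = temperature_data.astype(np.int8)
else:
    print("Values outside int8 range - keep a wider dtype")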

Lazy Loading and Views

# Create large base array
large_array = np.random.rand(10000, 10000)  # ~762 MB

# Views don't copy data (memory efficient)
subarray = large_array[1000:2000, 2000:3000]  # View, not copy
transposed = large_array.T                     # View, not copy
reshaped = large_array.reshape(100000, 1000)   # View (no copy when the layout allows)

print(f"Original shares memory with view: {np.shares_memory(large_array, subarray)}")

# Operations that force copying (memory expensive)
copied = large_array.copy()                    # Full copy
modified = large_array + 1                     # New array

# Force view when possible
def safe_slice(arr, start_row, end_row):
    """Return memory-efficient slice"""
    return arr[start_row:end_row]  # Returns view, not copy

# Use strides for memory-efficient access patterns
strided_view = np.lib.stride_tricks.sliding_window_view(
    large_array[0], window_shape=100
)  # Memory-efficient sliding windows
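
A quick way to confirm whether an operation returned a view or a copy is to inspect the result's .base attribute, or to use np.shares_memory as above. A short sketch using the arrays defined in this section:

# A view keeps a reference to its parent array in .base; a copy does not
print(subarray.base is large_array)                 # True  -> view
print(copied.base is None)                          # True  -> independent copy
print(np.shares_memory(strided_view, large_array))  # True  -> windows share memory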

Streaming Data Processing

class ArrayStreamer:
    """Process arrays too large for memory in streaming fashion"""
    
    def __init__(self, filename, dtype, shape, chunk_size=1000):
        self.mmap = np.memmap(filename, dtype=dtype, mode='r', shape=shape)
        self.chunk_size = chunk_size
        self.shape = shape
    
    def stream_chunks(self):
        """Generator that yields chunks without loading full array"""
        rows, cols = self.shape
        
        for start_row in range(0, rows, self.chunk_size):
            end_row = min(start_row + self.chunk_size, rows)
            yield self.mmap[start_row:end_row]
    
    def compute_statistics(self):
        """Compute statistics without loading full array"""
        n_samples = 0
        running_sum = 0
        running_sum_sq = 0
        min_val = float('inf')
        max_val = float('-inf')
        
        for chunk in self.stream_chunks():
            n_samples += chunk.size
            running_sum += chunk.sum()
            running_sum_sq += (chunk ** 2).sum()
            min_val = min(min_val, chunk.min())
            max_val = max(max_val, chunk.max())
        
        mean = running_sum / n_samples
        variance = (running_sum_sq / n_samples) - (mean ** 2)
        std = np.sqrt(variance)
        
        return {
            'mean': mean,
            'std': std,
            'min': min_val,
            'max': max_val,
            'count': n_samples
        }

# Usage
# streamer = ArrayStreamer('huge_file.dat', np.float64, (100000, 1000))
# stats = streamer.compute_statistics()
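
For a quick self-contained check, you can write a small memmap file first and then stream it. A sketch with an illustrative file name and shape:

# Write a small test file, then stream statistics without holding it all in RAM
test = np.memmap('stream_test.dat', dtype=np.float64, mode='w+', shape=(5000, 100))
test[:] = np.random.rand(5000, 100)
test.flush()
del test

streamer = ArrayStreamer('stream_test.dat', np.float64, (5000, 100), chunk_size=500)
print(streamer.compute_statistics())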

Memory Monitoring

import psutil
import os

def monitor_memory_usage():
    """Monitor current memory usage"""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    return {
        'rss': memory_info.rss / 1024 / 1024,  # MB
        'vms': memory_info.vms / 1024 / 1024   # MB
    }

# Before creating large array
before = monitor_memory_usage()
print(f"Memory before: {before['rss']:.1f} MB")

# Create memory-mapped array (minimal memory impact)
mmap_array = np.memmap('test.dat', dtype='float64', mode='w+', 
                       shape=(10000, 1000))

after = monitor_memory_usage()
print(f"Memory after mmap: {after['rss']:.1f} MB")
print(f"Memory increase: {after['rss'] - before['rss']:.1f} MB")

Best Practices for Large Arrays

  1. Use memory mapping for arrays larger than available RAM
  2. Optimize data types - use smallest type that fits your data
  3. Process in chunks rather than loading entire arrays
  4. Prefer views over copies when possible
  5. Monitor memory usage to detect memory leaks (see the combined sketch after this list)
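
Putting these practices together, here is a minimal sketch that memory-maps a file, uses a compact dtype, reduces the data chunk by chunk, and reuses the monitor_memory_usage helper from the Memory Monitoring section. The file name, shape, and chunk size are illustrative:

# Illustrative end-to-end example: memmap + small dtype + chunked reduction
data = np.memmap('combined_demo.dat', dtype=np.float32, mode='w+',
                 shape=(50_000, 100))
data[:] = np.random.rand(50_000, 100).astype(np.float32)  # demo fill only
data.flush()

chunk_rows = 5_000
total = 0.0
for start in range(0, data.shape[0], chunk_rows):
    view = data[start:start + chunk_rows]   # view into the mapping, no copy
    total += float(view.sum())              # reduce each chunk to a scalar

print(f"Grand total: {total:.2f}")
print(f"Resident memory: {monitor_memory_usage()['rss']:.1f} MB")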

Scale Your Data Processing

Master distributed computing with Dask, explore parallel processing techniques, and learn high-performance computing strategies.
