Table Of Contents
- Data Gaps, No Problem
- Creating and Working with Masked Arrays
- Advanced Masking Techniques
- Operations with Masked Arrays
- Working with 2D Masked Arrays
- Data Quality and Validation
- Real-World Applications
- Performance Benefits
- Master Data Quality
Data Gaps, No Problem
Real-world data has holes, outliers, and invalid values. NumPy's masked arrays let you work with imperfect data without corrupting your calculations.
Creating and Working with Masked Arrays
import numpy as np
import numpy.ma as ma
# Sentinel-based masking: hide every occurrence of the placeholder value -999
data = np.array([1, 2, -999, 4, 5, -999, 7])
masked_data = ma.masked_equal(data, value=-999)
print(masked_data)  # [1 2 -- 4 5 -- 7]

# Explicit masking: pass a boolean array of the same shape (True = masked)
normal_array = np.array([1.5, 2.3, 3.7, 4.1, 5.9])
mask = np.array([False, False, True, False, True])
masked_array = ma.masked_array(data=normal_array, mask=mask)
print(masked_array)  # [1.5 2.3 -- 4.1 --]

# Range-based masking: hide readings that fall outside the interval 0..100
sensor_data = np.array([22.1, -50.0, 23.5, 150.0, 21.8])
valid_data = ma.masked_outside(sensor_data, v1=0, v2=100)
print(valid_data)  # [22.1 -- 23.5 -- 21.8]
Advanced Masking Techniques
# Combine several conditions into a single mask with boolean operators
temperatures = np.array([15, 25, -10, 35, 28, 200, 22])
too_cold = temperatures < -5
too_hot = temperatures > 50
# Anything below -5 or above 50 is treated as an unreasonable temperature
reasonable_temps = ma.masked_where(too_cold | too_hot, temperatures)
print(reasonable_temps)  # [15 25 -- 35 28 -- 22]

# Hide non-finite entries (NaN, inf, -inf) in one call
messy_data = np.array([1.0, np.nan, 3.0, np.inf, 5.0, -np.inf])
clean_data = ma.masked_invalid(messy_data)
print(clean_data)  # [1.0 -- 3.0 -- 5.0 --]

# Drive the mask from a companion array of quality flags (0 = bad quality)
quality_flags = np.array([1, 0, 1, 0, 1])
measurements = np.array([10.5, 11.2, 12.1, 13.8, 14.2])
is_bad = quality_flags == 0
good_measurements = ma.masked_where(is_bad, measurements)
print(good_measurements)  # [10.5 -- 12.1 -- 14.2]
Operations with Masked Arrays
# Reductions on a masked array only take the unmasked entries into account
data = ma.array([1, 2, 3, 4, 5], mask=[0, 0, 1, 0, 0])
mean_value = data.mean()
total = data.sum()
spread = data.std()
print(f"Mean: {mean_value}")  # 3.0 (excludes masked value)
print(f"Sum: {total}")  # 12 (excludes masked value)
print(f"Std: {spread}")  # 1.58... (excludes masked value)

# Number of valid (non-masked) entries
valid_count = data.count()
print(f"Valid count: {valid_count}")  # 4

# Swap masked entries for a concrete value to get a plain ndarray back
filled_data = data.filled(fill_value=0)
print(filled_data)  # [1 2 0 4 5]

# Drop masked entries entirely, keeping only the valid values
compressed = data.compressed()
print(compressed)  # [1 2 4 5]
Working with 2D Masked Arrays
# Build a 2D masked array by hiding the -999 placeholder entries
matrix = np.array([
    [1, 2, 3],
    [4, -999, 6],
    [7, 8, -999],
])
masked_matrix = ma.masked_equal(matrix, -999)

# Reduce along an axis: axis=0 collapses rows (per column), axis=1 collapses columns (per row)
col_sums = masked_matrix.sum(axis=0)
row_means = masked_matrix.mean(axis=1)
print(f"Row means: {row_means}")  # [2.0 5.0 7.5]
print(f"Column sums: {col_sums}")  # [12 10 9]

# Mask a whole row: start from an all-False mask and switch on the middle row
mask_2d = np.zeros(matrix.shape, dtype=bool)
mask_2d[1, :] = True
# The mask is applied to the raw matrix, so the -999 in the last row stays visible
selective_mask = ma.masked_array(matrix, mask=mask_2d)
print(selective_mask)
Data Quality and Validation
# Inspect the mask through ma.getmaskarray(): unlike the .mask attribute, it
# always returns a full boolean array, even when no element is masked
data = ma.array([1, 2, 3, 4, 5], mask=[0, 1, 0, 1, 0])
masked_flags = ma.getmaskarray(data)
print(f"Is masked: {masked_flags}")  # [False  True False  True False]
print(f"Valid data: {~masked_flags}")  # [ True False  True False  True]
# ma.is_masked() answers "is anything masked?" and also copes with unmasked input
has_masked = ma.is_masked(data)
print(f"Has masked values: {has_masked}")  # True

# Split the array into its valid values and the positions of the masked ones
valid_values = data[~masked_flags]
masked_positions = np.where(masked_flags)[0]
print(f"Valid values: {valid_values}")  # [1 3 5]
print(f"Masked at indices: {masked_positions}")  # [1 3]
Real-World Applications
# Weather station data with equipment failures; -999 marks a failed reading
station_data = {
    'temperature': [22.1, -999, 23.5, 21.8, -999],
    'humidity': [65, 70, -999, 68, 72],
    'pressure': [1013, 1015, 1012, -999, 1014],
}

# Create a masked array for each measurement so the -999 entries are hidden
masked_weather = {
    measurement: ma.masked_equal(np.array(values), -999)
    for measurement, values in station_data.items()
}

# Average each measurement over its valid (unmasked) readings only
daily_averages = {
    measurement: series.mean()
    for measurement, series in masked_weather.items()
}

print("Daily averages (excluding sensor failures):")
for measurement, avg in daily_averages.items():
    print(f"{measurement}: {avg:.1f}")
Performance Benefits
- Calculations automatically skip masked values
- Memory efficient (the mask is a boolean array, adding only one byte per element)
- Preserves original data structure
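- Compatible with most regular NumPy functions (prefer the `numpy.ma` versions where they exist, since those are designed to respect the mask)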
Master Data Quality
Explore data cleaning techniques, learn statistical analysis with missing data, and master robust data processing workflows.
Share this article
Add Comment
No comments yet. Be the first to comment!