Table Of Contents
- From Arrays to Insights
- Essential Aggregation Functions
- Axis-Specific Aggregations
- Advanced Aggregation Patterns
- Handling Missing Data in Aggregations
- Custom Aggregation Functions
- Performance Tips for Large Arrays
- Dive Deeper
From Arrays to Insights
Raw data tells a story, but aggregation functions help you hear it. NumPy's aggregation arsenal turns millions of numbers into meaningful statistics in microseconds.
Essential Aggregation Functions
import numpy as np

# Sample data: a large random matrix plus a small, readable sales table.
data = np.random.rand(1000, 50)
sales_data = np.array([[100, 150, 200],
                       [120, 180, 220],
                       [90, 160, 190]])

# Whole-array reductions, driven from a (label, function) table so every
# statistic prints in the same "<label>: <value>" format.
summary_stats = [
    ("Sum", np.sum),
    ("Mean", np.mean),
    ("Median", np.median),
    ("Standard deviation", np.std),
    ("Variance", np.var),
    ("Minimum", np.min),
    ("Maximum", np.max),
    ("Range", np.ptp),                # peak-to-peak (max - min)
    ("Index of minimum", np.argmin),  # index into the flattened array
    ("Index of maximum", np.argmax),
]
for label, stat in summary_stats:
    print(f"{label}: {stat(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Quartiles: {np.quantile(data, [0.25, 0.5, 0.75])}")
Axis-Specific Aggregations
# 2D aggregations: the `axis` argument names the dimension that gets
# collapsed, not the one that survives.
matrix = np.arange(1, 10).reshape(3, 3)

# Collapsing axis 0 (the row axis) leaves one value per column.
col_sums = matrix.sum(axis=0)
print(col_sums)  # [12 15 18] - sum of each column

# Collapsing axis 1 (the column axis) leaves one value per row.
row_sums = matrix.sum(axis=1)
print(row_sums)  # [ 6 15 24] - sum of each row

# The same idea scales to any number of dimensions.
data_3d = np.random.rand(10, 20, 30)

# Collapse only the first dimension.
result = data_3d.mean(axis=0)  # Shape: (20, 30)

# A tuple collapses several axes at once.
overall_mean = data_3d.mean(axis=(0, 2))  # Shape: (20,)
Advanced Aggregation Patterns
# Conditional aggregations: boolean masks plug straight into reductions.
data = np.array([1, -2, 3, -4, 5, -6])
positive_mask = data > 0

# True counts as 1, so summing the mask counts matches.
positive_count = np.sum(positive_mask)
print(f"Positive values: {positive_count}")

# Indexing with the mask keeps only the positive entries.
positive_avg = np.mean(data[positive_mask])
print(f"Positive average: {positive_avg}")

# Weighted aggregations: np.average accepts per-element weights.
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Cumulative operations keep every running total instead of one scalar.
cumulative_sum = values.cumsum()
cumulative_product = values.cumprod()
print(f"Cumulative sum: {cumulative_sum}")  # [ 1  3  6 10 15]
print(f"Cumulative product: {cumulative_product}")  # [  1   2   6  24 120]
Handling Missing Data in Aggregations
# Data with NaN values (NaN marks missing measurements).
messy_data = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0])

# Regular reductions propagate NaN: one missing value poisons the result.
print(f"Regular mean: {np.mean(messy_data)}")  # nan

# NaN-aware functions ignore missing values.
# Over the present values [1, 2, 4, 6]: mean = 13/4 = 3.25 and the
# population std is sqrt(14.75/4) ~= 1.92 (the old note saying 2.06 was wrong).
print(f"NaN-safe mean: {np.nanmean(messy_data)}")  # 3.25
print(f"NaN-safe std: {np.nanstd(messy_data)}")  # 1.92...
print(f"NaN-safe sum: {np.nansum(messy_data)}")  # 13.0
Custom Aggregation Functions
# Custom reducers: any function mapping a 1-D slice to a scalar works
# with np.apply_along_axis.
def custom_aggregation(arr):
    """Return the sum of squares of the elements of `arr`."""
    return (arr ** 2).sum()

# Feed each row of a small matrix to the reducer.
data = np.array([[1, 2, 3], [4, 5, 6]])
result = np.apply_along_axis(custom_aggregation, axis=1, arr=data)
print(result)  # [14 77] - sum of squares for each row

def root_mean_square(arr):
    """Return the root-mean-square of `arr`."""
    return np.sqrt((arr ** 2).mean())

# axis=0 hands each column to the reducer: one RMS value per column.
rms_values = np.apply_along_axis(root_mean_square, axis=0, arr=data)
Performance Tips for Large Arrays
# For very large arrays, consider:
large_data = np.random.rand(10000, 1000)

# 1. Pin the accumulator dtype so results are consistent across platforms.
result = np.sum(large_data, dtype=np.float64)

# 2. Reducing along one axis keeps the output (and temporaries) small.
axis_result = np.mean(large_data, axis=0, keepdims=True)

# 3. Consider chunked processing for memory efficiency.
def chunked_mean(arr, chunk_size=1000):
    """Return the mean of `arr`, computed chunk-by-chunk along axis 0.

    Each chunk's mean is weighted by the chunk's length, so the result
    equals np.mean(arr) even when len(arr) is not a multiple of
    chunk_size. (The earlier unweighted mean-of-means was biased toward
    the final, shorter chunk in that case.)
    """
    chunks = [arr[i:i + chunk_size] for i in range(0, len(arr), chunk_size)]
    chunk_means = [np.mean(chunk) for chunk in chunks]
    chunk_sizes = [len(chunk) for chunk in chunks]
    return np.average(chunk_means, weights=chunk_sizes)
Dive Deeper
Master statistical analysis techniques, explore data science workflows, and learn about advanced NumPy operations.
Share this article
Add Comment
No comments yet. Be the first to comment!