Navigation

Python

How to Use NumPy Aggregation Functions

Transform arrays of data into meaningful insights with NumPy's powerful aggregation functions that compute statistics along any dimension.

Table Of Contents

From Arrays to Insights

Raw data tells a story, but aggregation functions help you hear it. NumPy's aggregation arsenal turns millions of numbers into meaningful statistics in milliseconds.

Essential Aggregation Functions

import numpy as np

# Sample inputs: a small hand-written sales table and a 1000 x 50 block of
# uniform random values in [0, 1)
sales_data = np.array([
    [100, 150, 200],
    [120, 180, 220],
    [90, 160, 190],
])
data = np.random.rand(1000, 50)

# Whole-array summaries (no axis given, so every element takes part)
print(f"Sum: {data.sum()}")
print(f"Mean: {data.mean()}")
print(f"Median: {np.median(data)}")
print(f"Standard deviation: {data.std()}")
print(f"Variance: {data.var()}")

# Extremes and their spread
print(f"Minimum: {data.min()}")
print(f"Maximum: {data.max()}")
print(f"Range: {np.ptp(data)}")  # Peak-to-peak (max - min)

# Where the extremes live; without an axis the index refers to the
# flattened array, not a (row, column) pair
print(f"Index of minimum: {data.argmin()}")
print(f"Index of maximum: {data.argmax()}")

# Percentiles take q in [0, 100]; quantiles take q in [0, 1]
print(f"25th percentile: {np.percentile(data, q=25)}")
print(f"75th percentile: {np.percentile(data, q=75)}")
print(f"Quartiles: {np.quantile(data, q=[0.25, 0.5, 0.75])}")

Axis-Specific Aggregations

# A 3x3 matrix holding 1..9, used to show what the axis argument collapses
matrix = np.arange(1, 10).reshape(3, 3)

# axis=0 collapses the rows, leaving one total per column
col_sums = matrix.sum(axis=0)
print(col_sums)  # [12 15 18] - sum of each column

# axis=1 collapses the columns, leaving one total per row
row_sums = matrix.sum(axis=1)
print(row_sums)  # [ 6 15 24] - sum of each row

# The same idea in three dimensions
data_3d = np.random.rand(10, 20, 30)
# Averaging over the first dimension removes it from the shape
result = data_3d.mean(axis=0)  # Shape: (20, 30)

# A tuple of axes removes several dimensions in a single call
overall_mean = data_3d.mean(axis=(0, 2))  # Shape: (20,)

Advanced Aggregation Patterns

# Conditional aggregations: a boolean mask selects what gets aggregated
data = np.array([1, -2, 3, -4, 5, -6])

# Summing a boolean mask counts its True entries
positive_count = (data > 0).sum()
print(f"Positive values: {positive_count}")

# Index with the mask first, then average only the selected values
positive_avg = data[data > 0].mean()
print(f"Positive average: {positive_avg}")

# Weighted aggregations: each value contributes in proportion to its weight
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(a=values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Cumulative operations keep every running total instead of only the last
cumulative_sum = values.cumsum()
cumulative_product = values.cumprod()
print(f"Cumulative sum: {cumulative_sum}")      # [ 1  3  6 10 15]
print(f"Cumulative product: {cumulative_product}")  # [  1   2   6  24 120]

Handling Missing Data in Aggregations

# Data with NaN values
messy_data = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0])

# Regular functions propagate NaN: one missing value makes the result NaN
print(f"Regular mean: {np.mean(messy_data)}")    # nan

# NaN-aware functions skip the missing entries and use only [1, 2, 4, 6]
print(f"NaN-safe mean: {np.nanmean(messy_data)}")    # 3.25
print(f"NaN-safe std: {np.nanstd(messy_data)}")      # 1.92... (population std, ddof=0)
print(f"NaN-safe sum: {np.nansum(messy_data)}")      # 13.0

Custom Aggregation Functions

# A plain Python function can serve as a custom aggregation
def custom_aggregation(arr):
    """Return the sum of squares of all elements of *arr*.

    Parameters
    ----------
    arr : array_like
        Values to aggregate. Lists and tuples are accepted as well as
        NumPy arrays.

    Returns
    -------
    scalar
        Sum of the squared elements, in the dtype NumPy infers for the
        input (integers stay integers).
    """
    # Coerce first so that plain sequences support the element-wise power
    values = np.asarray(arr)
    return np.sum(values ** 2)

# Run the custom function once per row (axis=1) of a 2x3 array
data = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
result = np.apply_along_axis(custom_aggregation, 1, data)
print(result)  # [14 77] - sum of squares for each row

# Composing universal functions into a custom aggregation
def root_mean_square(arr):
    """Return the root mean square of all elements of *arr*.

    Parameters
    ----------
    arr : array_like
        Values to aggregate. Lists and tuples are accepted as well as
        NumPy arrays.

    Returns
    -------
    scalar
        Square root of the mean of the squared elements.
    """
    values = np.asarray(arr)
    # Squaring in a narrow integer dtype wraps around silently on overflow
    # (e.g. int32 100000 ** 2), so integer and bool input is promoted to
    # float64 first. Floating and complex input is left in its own dtype.
    if values.dtype.kind in "iub":
        values = values.astype(np.float64)
    return np.sqrt(np.mean(values ** 2))

# axis=0 hands each column of `data` to root_mean_square in turn
rms_values = np.apply_along_axis(root_mean_square, 0, data)

Performance Tips for Large Arrays

# Techniques worth knowing once arrays get large:
large_data = np.random.rand(10000, 1000)

# 1. Name the accumulator dtype explicitly so the result type is predictable
result = large_data.sum(dtype=np.float64)

# 2. Reduce along a single axis; keepdims=True retains the collapsed axis
#    with length 1, giving shape (1, 1000) here
axis_result = large_data.mean(axis=0, keepdims=True)

# 3. Consider chunked processing for memory efficiency
def chunked_mean(arr, chunk_size=1000):
    """Return the mean of all elements of *arr*, computed chunk by chunk.

    The input is sliced along its first dimension into pieces of at most
    ``chunk_size`` entries. Each piece is averaged on its own and the
    per-chunk means are combined weighted by the number of elements in
    each chunk, so a shorter final chunk does not skew the result.

    Parameters
    ----------
    arr : sequence or array_like
        Anything supporting ``len()`` and slicing along its first dimension.
    chunk_size : int, optional
        Maximum number of first-dimension entries per chunk (default 1000).

    Returns
    -------
    numpy.float64
        Mean over every element, or NaN when the input has no elements.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunk_means = []
    chunk_sizes = []
    for start in range(0, len(arr), chunk_size):
        # Only the current slice is converted, never the whole input
        chunk = np.asarray(arr[start:start + chunk_size])
        if chunk.size:
            chunk_means.append(np.mean(chunk))
            chunk_sizes.append(chunk.size)

    if not chunk_sizes:
        # Nothing to average; report NaN rather than dividing by zero
        return np.float64(np.nan)

    # A plain mean of the chunk means would over-weight a short last chunk
    return np.average(chunk_means, weights=chunk_sizes)

Dive Deeper

Master statistical analysis techniques, explore data science workflows, and learn about advanced NumPy operations.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python