How to Use NumPy Aggregation Functions

From Arrays to Insights
Essential Aggregation Functions
Axis-Specific Aggregations
Advanced Aggregation Patterns
Handling Missing Data in Aggregations
Custom Aggregation Functions
Performance Tips for Large Arrays
Dive Deeper

From Arrays to Insights

Raw data tells a story, but aggregation functions help you hear it. NumPy's aggregation arsenal turns millions of numbers into meaningful statistics in milliseconds.

Essential Aggregation Functions

import numpy as np

# Demo inputs: a 1000x50 matrix of random floats and a small sales table
data = np.random.rand(1000, 50)
sales_data = np.array([
    [100, 150, 200],
    [120, 180, 220],
    [90, 160, 190],
])

# Whole-array statistics (no axis given, so every element is reduced)
print(f"Sum: {data.sum()}")
print(f"Mean: {data.mean()}")
print(f"Median: {np.median(data)}")
print(f"Standard deviation: {data.std()}")
print(f"Variance: {data.var()}")

# Extremes and the spread between them
print(f"Minimum: {data.min()}")
print(f"Maximum: {data.max()}")
print(f"Range: {np.ptp(data)}")  # ptp = "peak to peak", i.e. max - min

# Where the extremes sit; without an axis these index the flattened array
print(f"Index of minimum: {data.argmin()}")
print(f"Index of maximum: {data.argmax()}")

# Order statistics: percentile takes 0-100, quantile takes 0-1
print(f"25th percentile: {np.percentile(data, q=25)}")
print(f"75th percentile: {np.percentile(data, q=75)}")
print(f"Quartiles: {np.quantile(data, q=[0.25, 0.5, 0.75])}")

Axis-Specific Aggregations

# A 3x3 grid to show how `axis` selects the direction of a reduction
matrix = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# axis=0 collapses the rows, leaving one total per column
col_sums = matrix.sum(axis=0)
print(col_sums)  # [12 15 18] - sum of each column

# axis=1 collapses the columns, leaving one total per row
row_sums = matrix.sum(axis=1)
print(row_sums)  # [ 6 15 24] - sum of each row

# The same idea extends to higher-dimensional arrays
data_3d = np.random.rand(10, 20, 30)
# Averaging over axis 0 removes that axis from the result
result = data_3d.mean(axis=0)  # Shape: (20, 30)

# A tuple of axes reduces over several dimensions at once
overall_mean = data_3d.mean(axis=(0, 2))  # Shape: (20,)

Advanced Aggregation Patterns

# Aggregating only the elements that satisfy a condition
data = np.array([1, -2, 3, -4, 5, -6])

# Summing a boolean mask counts its True entries
positive_count = (data > 0).sum()
print(f"Positive values: {positive_count}")

# Boolean indexing keeps just the positive entries before averaging
positive_avg = data[data > 0].mean()
print(f"Positive average: {positive_avg}")

# Weighted mean: each value contributes in proportion to its weight
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running totals and running products keep the input's length
cumulative_sum = values.cumsum()
cumulative_product = values.cumprod()
print(f"Cumulative sum: {cumulative_sum}")      # [ 1  3  6 10 15]
print(f"Cumulative product: {cumulative_product}")  # [  1   2   6  24 120]

Handling Missing Data in Aggregations

# Data with NaN values
messy_data = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0])

# Regular functions propagate NaN: a single missing value poisons the result
print(f"Regular mean: {np.mean(messy_data)}")    # nan

# NaN-aware functions ignore missing values, so only [1, 2, 4, 6] are used
print(f"NaN-safe mean: {np.nanmean(messy_data)}")    # 3.25
print(f"NaN-safe std: {np.nanstd(messy_data)}")      # 1.92... (population std, ddof=0)
print(f"NaN-safe sum: {np.nansum(messy_data)}")      # 13.0

Custom Aggregation Functions

# Wrap a reduction in a plain function to get a reusable custom aggregation
def custom_aggregation(arr, axis=None):
    """Calculate the sum of squares of ``arr``.

    Args:
        arr: Array-like of numbers (ndarray, list, or tuple).
        axis: Axis or axes to reduce over, forwarded to ``np.sum``. The
            default ``None`` reduces over every element.

    Returns:
        A scalar when ``axis`` is None, otherwise an array with the reduced
        axes removed.
    """
    # np.asarray lets plain Python sequences through; `list ** 2` would raise.
    squares = np.asarray(arr) ** 2
    return np.sum(squares, axis=axis)


# Apply to different axes: apply_along_axis hands each 1-D slice to the function
data = np.array([[1, 2, 3], [4, 5, 6]])
result = np.apply_along_axis(custom_aggregation, axis=1, arr=data)
print(result)  # [14 77] - sum of squares for each row

# Combine a ufunc (np.sqrt) with a reduction (np.mean) into a custom aggregation
def root_mean_square(arr, axis=None):
    """Return the root mean square of ``arr``.

    Args:
        arr: Array-like of numbers (ndarray, list, or tuple).
        axis: Axis or axes to reduce over, forwarded to ``np.mean``. The
            default ``None`` reduces over every element.

    Returns:
        A scalar when ``axis`` is None, otherwise an array with the reduced
        axes removed.
    """
    values = np.asarray(arr)
    if np.issubdtype(values.dtype, np.integer):
        # Narrow integer dtypes (e.g. int8) wrap around silently when squared,
        # so promote to float64 before squaring.
        values = values.astype(np.float64)
    return np.sqrt(np.mean(values ** 2, axis=axis))

# axis=0 feeds each column of `data` to root_mean_square, one slice at a time
rms_values = np.apply_along_axis(root_mean_square, 0, data)

Performance Tips for Large Arrays

# Tips for very large arrays, shown on a 10,000 x 1,000 random matrix
large_data = np.random.rand(10000, 1000)

# 1. Pin the accumulator dtype so the result type is explicit
result = large_data.sum(dtype=np.float64)

# 2. Reduce along one axis; keepdims=True leaves a length-1 axis in its place
axis_result = large_data.mean(axis=0, keepdims=True)

# 3. Consider chunked processing for memory efficiency
def chunked_mean(arr, chunk_size=1000):
    """Return the mean of every element of ``arr``, computed chunk by chunk.

    The input is sliced along its first axis into pieces of at most
    ``chunk_size`` entries. Each piece is averaged on its own and the partial
    means are combined, weighted by how many elements each piece holds.

    Args:
        arr: Array-like with at least one dimension.
        chunk_size: Maximum number of first-axis entries per chunk.

    Returns:
        The mean over all elements; NaN for empty input, matching ``np.mean``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    arr = np.asarray(arr)
    if arr.size == 0:
        # Defer to np.mean so empty input behaves like the plain call (NaN).
        return np.mean(arr)

    chunk_means = []
    chunk_sizes = []
    for start in range(0, len(arr), chunk_size):
        chunk = arr[start:start + chunk_size]
        chunk_means.append(np.mean(chunk))
        chunk_sizes.append(chunk.size)

    # The final chunk may be shorter than the rest, so a plain mean of the
    # chunk means would over-weight it; weight each mean by its element count.
    return np.average(chunk_means, weights=chunk_sizes)

Dive Deeper

Master statistical analysis techniques, explore data science workflows, and learn about advanced NumPy operations.

Share this article

Navigation

How to Use NumPy Aggregation Functions

Table Of Contents

From Arrays to Insights

Essential Aggregation Functions

Axis-Specific Aggregations

Advanced Aggregation Patterns

Handling Missing Data in Aggregations

Custom Aggregation Functions

Performance Tips for Large Arrays

Dive Deeper

Add Comment

More from Python

Navigation

Table Of Contents

From Arrays to Insights

Essential Aggregation Functions

Axis-Specific Aggregations

Advanced Aggregation Patterns

Handling Missing Data in Aggregations

Custom Aggregation Functions

Performance Tips for Large Arrays

Dive Deeper

Comments

Add Comment

More from Python

Python typing.Callable: Complete Guide to Function Type Hints

How to Handle Multi-level Indexes

How to Use Python's __name__ == "__main__"

How to Use Cross-Validation for Model Evaluation

How to Use Context Managers with "with" Statement

How to Use super() in Python Inheritance

How to Use Python's __name__ == "__main__"