How to Group Data with groupby() and agg()

Transform raw data into actionable insights with pandas' powerful groupby operations - your gateway to advanced data aggregation.

From Data Chaos to Crystal Clarity

Raw data tells scattered stories. GroupBy operations unite those stories into powerful insights, turning overwhelming datasets into clear, actionable intelligence.

Essential GroupBy Operations

import pandas as pd
import numpy as np

# Sample sales data
sales_data = pd.DataFrame({
    'salesperson': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'Alice', 'Charlie'],
    'region': ['North', 'South', 'North', 'East', 'South', 'West', 'East'],
    'product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Laptop', 'Phone', 'Tablet'],
    'quantity': [2, 1, 3, 1, 2, 4, 2],
    'revenue': [2000, 800, 1500, 1000, 2000, 3200, 1000],
    'month': ['Jan', 'Jan', 'Feb', 'Feb', 'Mar', 'Mar', 'Mar']
})

print("Original data:")
print(sales_data)

# Basic groupby operations
print("\n=== Basic GroupBy ===")
# Group by single column
by_salesperson = sales_data.groupby('salesperson')['revenue'].sum()
print("Revenue by salesperson:")
print(by_salesperson)

# Group by multiple columns
by_person_month = sales_data.groupby(['salesperson', 'month'])['revenue'].sum()
print("\nRevenue by salesperson and month:")
print(by_person_month)
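
Before aggregating anything, it is often worth checking how many rows fall into each group; size() gives that count directly. A quick sketch using the same sales_data:

# Number of sales records per salesperson (size() counts rows, NaNs included)
print(sales_data.groupby('salesperson').size())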

Advanced Aggregation with agg()

# Multiple aggregation functions
multi_agg = sales_data.groupby('region').agg({
    'revenue': ['sum', 'mean', 'count'],
    'quantity': ['sum', 'max'],
    'product': lambda x: x.nunique()  # Count unique products
})

print("Multi-function aggregation by region:")
print(multi_agg)
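
Multi-function agg() calls like the one above produce hierarchical (MultiIndex) column labels, which can be awkward to work with downstream. One common cleanup, sketched here with underscore-joined names as just one possible convention:

# Flatten the MultiIndex columns produced by the multi-function agg above
multi_agg.columns = ['_'.join(col).strip('_') for col in multi_agg.columns]
print(multi_agg)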

# Custom aggregation functions
def revenue_range(series):
    return series.max() - series.min()

def top_product(series):
    return series.mode().iloc[0] if not series.mode().empty else 'N/A'

custom_agg = sales_data.groupby('salesperson').agg({
    'revenue': [revenue_range, 'std'],
    'product': top_product,
    'quantity': 'median'
})

print("\nCustom aggregation:")
print(custom_agg)

# Named aggregations (cleaner column names)
named_agg = sales_data.groupby('region').agg(
    total_revenue=('revenue', 'sum'),
    avg_revenue=('revenue', 'mean'),
    total_quantity=('quantity', 'sum'),
    unique_products=('product', 'nunique'),
    transactions=('revenue', 'count')
)

print("\nNamed aggregations:")
print(named_agg)
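
If you would rather keep the group keys as a regular column instead of the index (handy for plotting or merging later), as_index=False combines neatly with named aggregations; a small sketch:

# Keep 'region' as an ordinary column rather than the result's index
flat_named = sales_data.groupby('region', as_index=False).agg(
    total_revenue=('revenue', 'sum'),
    avg_revenue=('revenue', 'mean')
)
print(flat_named)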

Statistical Analysis with GroupBy

# Comprehensive statistics by group
stats_by_region = sales_data.groupby('region')['revenue'].describe()
print("Revenue statistics by region:")
print(stats_by_region)

# Percentiles and quantiles
percentiles = sales_data.groupby('salesperson')['revenue'].quantile([0.25, 0.5, 0.75])
print("\nRevenue percentiles by salesperson:")
print(percentiles.unstack())

# Standard business metrics
business_metrics = sales_data.groupby('salesperson').agg({
    'revenue': ['sum', 'mean', 'std'],
    'quantity': 'sum'
}).round(2)

# Calculate coefficient of variation (risk metric)
business_metrics['cv'] = (business_metrics[('revenue', 'std')] / 
                         business_metrics[('revenue', 'mean')])

print("\nBusiness metrics with risk assessment:")
print(business_metrics)

Time-Based Grouping

# Create time series data
date_range = pd.date_range('2025-01-01', periods=100, freq='D')
time_sales = pd.DataFrame({
    'date': date_range,
    'sales': np.random.randint(100, 1000, 100),
    'costs': np.random.randint(50, 500, 100),
    'region': np.random.choice(['North', 'South', 'East', 'West'], 100)
})

time_sales['profit'] = time_sales['sales'] - time_sales['costs']
time_sales.set_index('date', inplace=True)

# Monthly aggregation
monthly_summary = time_sales.groupby([
    time_sales.index.to_period('M'),  # Group by month
    'region'
]).agg({
    'sales': 'sum',
    'costs': 'sum', 
    'profit': ['sum', 'mean']
})

print("Monthly summary by region:")
print(monthly_summary.head())

# Weekly rolling aggregation
weekly_avg = time_sales.groupby('region')['profit'].rolling(window=7).mean()
print("\n7-day rolling average profit by region (first 10):")
print(weekly_avg.head(10))
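
Because time_sales is indexed by date, resample() offers an alternative, often more readable route to the monthly roll-up; a rough equivalent of the to_period grouping above (the monthly_resampled name is just illustrative):

# Monthly totals per region via groupby + resample
# ('M' is the month-end alias; pandas 2.2+ also accepts 'ME')
monthly_resampled = (
    time_sales
    .groupby('region')[['sales', 'costs', 'profit']]
    .resample('M')
    .sum()
)
print(monthly_resampled.head())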

Transform and Apply Operations

# Transform: return same shape as original
sales_with_pct = sales_data.copy()

# Calculate percentage of total revenue by salesperson
sales_with_pct['pct_of_person_total'] = (
    sales_data.groupby('salesperson')['revenue']
    .transform(lambda x: x / x.sum() * 100)
).round(1)

# Calculate z-score within each region
sales_with_pct['revenue_zscore'] = (
    sales_data.groupby('region')['revenue']
    .transform(lambda x: (x - x.mean()) / x.std())
).round(2)

print("Data with transformations:")
print(sales_with_pct[['salesperson', 'region', 'revenue', 'pct_of_person_total', 'revenue_zscore']])
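
Like transform, groupby rank() returns a result aligned to the original rows, which makes per-group leaderboards easy; a brief sketch:

# Rank each deal within its salesperson's own deals (1 = largest revenue)
sales_with_pct['deal_rank'] = (
    sales_data
    .groupby('salesperson')['revenue']
    .rank(ascending=False, method='dense')
    .astype(int)
)
print(sales_with_pct[['salesperson', 'revenue', 'deal_rank']])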

# Apply: custom functions to groups
def analyze_performance(group):
    """Custom analysis function for each group"""
    return pd.Series({
        'total_revenue': group['revenue'].sum(),
        'avg_deal_size': group['revenue'].mean(),
        'consistency': 1 / (group['revenue'].std() / group['revenue'].mean() + 0.01),
        'top_product': group['product'].mode().iloc[0],
        'active_months': group['month'].nunique()
    })

performance_analysis = sales_data.groupby('salesperson').apply(analyze_performance)
print("\nPerformance analysis:")
print(performance_analysis)
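
On pandas 2.2 and newer, apply emits a FutureWarning because the grouping column is also passed to the function; if you are on such a version, the optional include_groups=False keyword silences it and, for this function, leaves the result unchanged:

# pandas >= 2.2 only: exclude the grouping column from what apply receives
performance_analysis = (
    sales_data
    .groupby('salesperson')
    .apply(analyze_performance, include_groups=False)
)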

Multi-Level Grouping and Unstacking

# Complex multi-level grouping
pivot_analysis = sales_data.groupby(['region', 'product', 'month']).agg({
    'revenue': 'sum',
    'quantity': 'sum'
}).round(0)

print("Multi-level grouping:")
print(pivot_analysis.head(10))

# Unstack for pivot table-like view
revenue_pivot = sales_data.groupby(['region', 'month'])['revenue'].sum().unstack(fill_value=0)
print("\nRevenue by region and month (pivoted):")
print(revenue_pivot)
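
The groupby-plus-unstack pattern above is essentially a pivot table; pivot_table gets to the same result in one call, as a rough equivalent:

# Same table via pivot_table (aggfunc='sum' mirrors the groupby sum above)
revenue_pt = sales_data.pivot_table(
    index='region', columns='month', values='revenue',
    aggfunc='sum', fill_value=0
)
print(revenue_pt)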

# Stack and unstack for reshaping
product_analysis = sales_data.groupby(['product', 'salesperson'])['revenue'].sum().unstack(fill_value=0)
print("\nProduct performance by salesperson:")
print(product_analysis)

Advanced Filtering and Selection

# Filter groups based on group properties
high_volume_regions = (
    sales_data
    .groupby('region')
    .filter(lambda x: x['revenue'].sum() > 3000)
    ['region']
    .unique()
)
print(f"High volume regions: {high_volume_regions}")

# Select top N from each group
top_deals_per_person = (
    sales_data
    .groupby('salesperson')
    .apply(lambda x: x.nlargest(2, 'revenue'))
    .reset_index(drop=True)
)
print("\nTop 2 deals per salesperson:")
print(top_deals_per_person[['salesperson', 'revenue', 'product']])
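
An apply-free way to take the top N rows per group, which also sidesteps the groupby-apply deprecation warnings in recent pandas, is to sort first and then call head() on the groupby; a sketch:

# Sort by revenue, then keep the first 2 rows of each salesperson's group
top_deals_alt = (
    sales_data
    .sort_values('revenue', ascending=False)
    .groupby('salesperson')
    .head(2)
    .sort_values('salesperson')
)
print(top_deals_alt[['salesperson', 'revenue', 'product']])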

# Cumulative operations within groups
sales_with_cumsum = sales_data.copy()
sales_with_cumsum['cumulative_revenue'] = (
    sales_data
    .groupby('salesperson')['revenue']
    .cumsum()
)

sales_with_cumsum['running_avg'] = (
    sales_data
    .groupby('salesperson')['revenue']
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)  # drop the group level so it aligns with the original row index
    .round(1)
)

print("\nCumulative analysis:")
print(sales_with_cumsum[['salesperson', 'revenue', 'cumulative_revenue', 'running_avg']])
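
A related per-group operation is shift(), which compares each row with the previous one inside the same group, for example the change versus a salesperson's prior deal; a brief sketch (revenue_change is just an illustrative column name):

# Change in revenue versus each salesperson's previous deal (NaN for their first deal)
sales_with_cumsum['revenue_change'] = (
    sales_with_cumsum['revenue']
    - sales_data.groupby('salesperson')['revenue'].shift(1)
)
print(sales_with_cumsum[['salesperson', 'revenue', 'revenue_change']])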

Performance Optimization

# For large datasets, optimize groupby operations
large_data = pd.DataFrame({
    'group': np.random.choice(['A', 'B', 'C', 'D'], 1000000),
    'value1': np.random.randn(1000000),
    'value2': np.random.randn(1000000)
})

# Convert the grouping column to categorical: grouping on category dtype is
# faster and uses less memory than grouping on plain strings
large_data['group'] = large_data['group'].astype('category')

# Efficient aggregation; observed=True limits the result to category
# combinations that actually occur in the data, avoiding empty groups
efficient_agg = large_data.groupby('group', observed=True).agg({
    'value1': ['mean', 'std'],
    'value2': 'sum'
})

cat_agg = large_data.groupby('group', observed=True)['value1'].mean()

print("Efficient aggregation completed")

Real-World Business Intelligence

# Create comprehensive business dashboard data
def create_business_dashboard(df):
    """Generate key business metrics"""
    
    dashboard = {}
    
    # Revenue metrics
    dashboard['revenue_by_region'] = df.groupby('region')['revenue'].sum().sort_values(ascending=False)
    
    # Performance metrics
    dashboard['top_performers'] = df.groupby('salesperson')['revenue'].sum().nlargest(3)
    
    # Product insights
    dashboard['product_performance'] = df.groupby('product').agg({
        'revenue': ['sum', 'count'],
        'quantity': 'sum'
    })
    
    # Regional diversity
    dashboard['regional_diversity'] = df.groupby('region')['product'].nunique()
    
    # Monthly trends
    dashboard['monthly_trends'] = df.groupby('month')['revenue'].agg(['sum', 'mean', 'count'])
    
    return dashboard

dashboard = create_business_dashboard(sales_data)
print("=== Business Dashboard ===")
for metric, data in dashboard.items():
    print(f"\n{metric.upper()}:")
    print(data)

Master Advanced Analytics

Explore pivot tables and cross-tabulation, learn time series analysis techniques, and discover advanced data visualization methods.
