Table Of Contents
- Hierarchical Data Made Simple
- Creating Multi-level Indexes
- Navigating Multi-level Indexes
- Advanced Selection Techniques
- Reshaping with Multi-level Indexes
- Groupby Operations with Multi-level Indexes
- Performance Analysis with Multi-level Indexes
- Multi-level Index Manipulation
- Working with Missing Data in Multi-level Indexes
- Time Series with Multi-level Indexes
- Best Practices for Multi-level Indexes
- Master Advanced Data Structures
Hierarchical Data Made Simple
Multi-level indexes unlock powerful data organization patterns that flat structures can't handle. Master hierarchical indexing to transform complex, multi-dimensional datasets into elegant, navigable structures.
Creating Multi-level Indexes
import pandas as pd
import numpy as np
# Method 1: Creating from tuples
multi_index_tuples = [
('USA', 'California', 'Los Angeles'),
('USA', 'California', 'San Francisco'),
('USA', 'Texas', 'Houston'),
('USA', 'Texas', 'Dallas'),
('UK', 'England', 'London'),
('UK', 'England', 'Manchester'),
('UK', 'Scotland', 'Edinburgh')
]
multi_index = pd.MultiIndex.from_tuples(
multi_index_tuples,
names=['Country', 'State/Region', 'City']
)
# Create DataFrame with multi-level index
sales_data = pd.DataFrame({
'Sales': [1200, 800, 950, 750, 1100, 600, 400],
'Population': [4000000, 880000, 2300000, 1300000, 9000000, 550000, 500000]
}, index=multi_index)
print("Multi-level indexed DataFrame:")
print(sales_data)
print(f"\nIndex levels: {sales_data.index.nlevels}")
print(f"Index names: {sales_data.index.names}")
# Method 2: Creating from arrays
countries = ['USA', 'USA', 'USA', 'Canada', 'Canada']
states = ['CA', 'TX', 'NY', 'ON', 'BC']
cities = ['LA', 'Houston', 'NYC', 'Toronto', 'Vancouver']
multi_index2 = pd.MultiIndex.from_arrays(
[countries, states, cities],
names=['Country', 'State', 'City']
)
business_data = pd.DataFrame({
'Revenue': [5000, 4500, 6000, 3500, 3000],
'Employees': [50, 45, 75, 30, 25]
}, index=multi_index2)
print("\nAnother multi-level DataFrame:")
print(business_data)
Navigating Multi-level Indexes
# Accessing data with multi-level indexes
print("\n=== Accessing Multi-level Data ===")
# Access by top level
usa_data = sales_data.loc['USA']
print("USA data:")
print(usa_data)
# Access by multiple levels
california_data = sales_data.loc[('USA', 'California')]
print("\nCalifornia data:")
print(california_data)
# Access specific city
la_data = sales_data.loc[('USA', 'California', 'Los Angeles')]
print(f"\nLos Angeles data:\n{la_data}")
# Slicing with multi-level indexes
uk_data = sales_data.loc['UK']
print("\nUK data:")
print(uk_data)
# Using xs() for cross-section
all_california = sales_data.xs('California', level='State/Region')
print("\nAll California cities (using xs):")
print(all_california)
# Boolean indexing with multi-level
high_sales = sales_data[sales_data['Sales'] > 800]
print("\nCities with high sales:")
print(high_sales)
Advanced Selection Techniques
# Using slicers for complex selection
idx = pd.IndexSlice
# Select multiple levels with slicers
usa_uk_data = sales_data.loc[idx[['USA', 'UK'], :, :], :]
print("USA and UK data:")
print(usa_uk_data)
# Partial slicing
large_cities = sales_data.loc[idx[:, :, ['Los Angeles', 'London']], :]
print("\nLarge cities data:")
print(large_cities)
# Query on multi-level index
query_result = sales_data.query('Sales > 700 and Population > 1000000')
print("\nQuery result (Sales > 700 AND Population > 1M):")
print(query_result)
# Using isin() with multi-level index
target_cities = [('USA', 'California', 'Los Angeles'), ('UK', 'England', 'London')]
selected_cities = sales_data[sales_data.index.isin(target_cities)]
print("\nSelected major cities:")
print(selected_cities)
Reshaping with Multi-level Indexes
# Create sample time series data with multi-level columns
dates = pd.date_range('2024-01-01', periods=5, freq='D')
companies = ['Apple', 'Google', 'Microsoft']
metrics = ['Stock_Price', 'Volume']
# Multi-level column index
column_index = pd.MultiIndex.from_product(
[companies, metrics],
names=['Company', 'Metric']
)
# Create DataFrame
np.random.seed(42)
stock_data = pd.DataFrame(
np.random.randn(5, 6) * 10 + 100,
index=dates,
columns=column_index
)
print("Stock data with multi-level columns:")
print(stock_data.round(2))
# Stack and unstack operations
stacked = stock_data.stack('Company')
print("\nStacked data:")
print(stacked.head())
# Unstack to different level
unstacked = stacked.unstack('Metric')
print("\nUnstacked by metric:")
print(unstacked.head())
# Pivot operations with multi-level
pivoted = stacked.reset_index().pivot_table(
index='Company',
columns=['Metric'],
values=stacked.columns,
aggfunc='mean'
)
print("\nPivoted data:")
print(pivoted.round(2))
Groupby Operations with Multi-level Indexes
# Create comprehensive business dataset
np.random.seed(42)
business_records = []
for country in ['USA', 'UK', 'Germany']:
for region in ['North', 'South']:
for product in ['A', 'B', 'C']:
for quarter in ['Q1', 'Q2', 'Q3', 'Q4']:
record = {
'Country': country,
'Region': region,
'Product': product,
'Quarter': quarter,
'Sales': np.random.randint(100, 1000),
'Profit': np.random.randint(10, 100),
'Units': np.random.randint(10, 50)
}
business_records.append(record)
business_df = pd.DataFrame(business_records)
# Set multi-level index
business_multi = business_df.set_index(['Country', 'Region', 'Product', 'Quarter'])
print("Business data with multi-level index:")
print(business_multi.head(10))
# Groupby operations at different levels
print("\n=== GroupBy Operations ===")
# Group by country
country_summary = business_multi.groupby(level='Country').agg({
'Sales': ['sum', 'mean'],
'Profit': 'sum',
'Units': 'sum'
})
print("Country-level summary:")
print(country_summary)
# Group by multiple levels
country_product = business_multi.groupby(level=['Country', 'Product']).agg({
'Sales': 'sum',
'Profit': 'mean'
})
print("\nCountry-Product summary:")
print(country_product.head())
# Custom aggregations by level
quarterly_analysis = business_multi.groupby(level=['Country', 'Quarter']).apply(
lambda x: pd.Series({
'Total_Sales': x['Sales'].sum(),
'Avg_Profit_Margin': (x['Profit'] / x['Sales']).mean(),
'Product_Diversity': x.index.get_level_values('Product').nunique()
})
)
print("\nQuarterly analysis:")
print(quarterly_analysis.head())
Performance Analysis with Multi-level Indexes
# Performance metrics by hierarchy
def calculate_performance_metrics(data):
"""Calculate comprehensive performance metrics"""
metrics = {}
# Overall performance
metrics['total_sales'] = data['Sales'].sum()
metrics['total_profit'] = data['Profit'].sum()
metrics['profit_margin'] = data['Profit'].sum() / data['Sales'].sum()
metrics['avg_units_per_sale'] = data['Units'].sum() / len(data)
return pd.Series(metrics)
# Apply to different hierarchical levels
print("\n=== Performance Analysis ===")
# Country level performance
country_performance = business_multi.groupby(level='Country').apply(calculate_performance_metrics)
print("Country performance:")
print(country_performance.round(3))
# Regional performance within countries
regional_performance = business_multi.groupby(level=['Country', 'Region']).apply(
calculate_performance_metrics
)
print("\nRegional performance:")
print(regional_performance.round(3))
# Product performance across all markets
product_performance = business_multi.groupby(level='Product').apply(calculate_performance_metrics)
print("\nProduct performance:")
print(product_performance.round(3))
# Seasonal analysis
seasonal_performance = business_multi.groupby(level='Quarter').apply(calculate_performance_metrics)
print("\nSeasonal performance:")
print(seasonal_performance.round(3))
Multi-level Index Manipulation
# Index manipulation operations
print("\n=== Index Manipulation ===")
# Swap index levels
swapped = business_multi.swaplevel('Country', 'Product')
print("After swapping Country and Product levels:")
print(swapped.head())
# Reorder levels
reordered = business_multi.reorder_levels(['Product', 'Country', 'Region', 'Quarter'])
print("\nAfter reordering levels:")
print(reordered.head())
# Sort by index
sorted_multi = business_multi.sort_index()
print("\nSorted multi-level index:")
print(sorted_multi.head())
# Reset index (flatten)
flattened = business_multi.reset_index()
print("\nFlattened to regular DataFrame:")
print(flattened.head())
# Recreate multi-level index
recreated = flattened.set_index(['Country', 'Region', 'Product', 'Quarter'])
print("\nRecreated multi-level index:")
print(recreated.head())
Working with Missing Data in Multi-level Indexes
# Create data with missing combinations
incomplete_data = business_multi.copy()
# Introduce missing data
incomplete_data = incomplete_data.drop([
('USA', 'North', 'A', 'Q1'),
('UK', 'South', 'B', 'Q2'),
('Germany', 'North', 'C', 'Q3')
])
print("Data with missing combinations:")
print(f"Original shape: {business_multi.shape}")
print(f"Incomplete shape: {incomplete_data.shape}")
# Reindex to fill missing combinations
full_index = pd.MultiIndex.from_product([
['USA', 'UK', 'Germany'],
['North', 'South'],
['A', 'B', 'C'],
['Q1', 'Q2', 'Q3', 'Q4']
], names=['Country', 'Region', 'Product', 'Quarter'])
complete_data = incomplete_data.reindex(full_index)
print(f"\nAfter reindexing: {complete_data.shape}")
print(f"Missing values: {complete_data.isnull().sum().sum()}")
# Fill missing values by group
filled_data = complete_data.groupby(level=['Country', 'Product']).apply(
lambda group: group.fillna(group.mean())
)
print("\nAfter filling missing values:")
print(f"Remaining missing: {filled_data.isnull().sum().sum()}")
Time Series with Multi-level Indexes
# Create time series with multi-level structure
dates = pd.date_range('2024-01-01', periods=12, freq='M')
regions = ['North', 'South', 'East', 'West']
# Create multi-level time series
time_regions = pd.MultiIndex.from_product(
[dates, regions],
names=['Date', 'Region']
)
ts_data = pd.DataFrame({
'Temperature': np.random.normal(20, 5, len(time_regions)),
'Rainfall': np.random.exponential(50, len(time_regions)),
'Sales': np.random.randint(1000, 5000, len(time_regions))
}, index=time_regions)
print("Time series with regional breakdown:")
print(ts_data.head(10))
# Time series analysis by region
monthly_by_region = ts_data.groupby(level='Region').resample('Q', level='Date').agg({
'Temperature': 'mean',
'Rainfall': 'sum',
'Sales': 'sum'
})
print("\nQuarterly summary by region:")
print(monthly_by_region.head())
# Rolling calculations within groups
rolling_analysis = ts_data.groupby(level='Region').rolling(3, level='Date').agg({
'Temperature': 'mean',
'Sales': 'sum'
})
print("\n3-month rolling analysis:")
print(rolling_analysis.head(10))
Best Practices for Multi-level Indexes
def multi_index_best_practices():
"""Best practices for working with multi-level indexes"""
practices = {
'Design': [
"Plan your hierarchy logically (general to specific)",
"Use meaningful level names",
"Keep the number of levels manageable (typically 2-4)",
"Consider the most common access patterns"
],
'Performance': [
"Sort your multi-index for better performance",
"Use xs() for cross-sections rather than loc when possible",
"Consider using categorical data for repeated string levels",
"Index only the levels you'll frequently query"
],
'Data Integrity': [
"Handle missing combinations explicitly",
"Validate index completeness after operations",
"Use reindex() to ensure complete hierarchical structure",
"Be careful with operations that might break hierarchy"
],
'Readability': [
"Use clear, descriptive level names",
"Document your index structure",
"Consider flattening for simple operations",
"Use query() method for readable filtering"
]
}
print("=== MULTI-LEVEL INDEX BEST PRACTICES ===")
for category, tips in practices.items():
print(f"\n{category}:")
for tip in tips:
print(f" ✅ {tip}")
multi_index_best_practices()
# Performance comparison
def compare_index_performance():
"""Compare performance of different indexing approaches"""
import time
# Create large dataset
large_data = pd.DataFrame({
'A': np.random.randn(100000),
'B': np.random.choice(['X', 'Y', 'Z'], 100000),
'C': np.random.choice(['P', 'Q'], 100000),
'Value': np.random.randn(100000)
})
# Single index
single_indexed = large_data.set_index('A')
# Multi-index
multi_indexed = large_data.set_index(['B', 'C'])
# Performance test
start = time.time()
result1 = multi_indexed.loc[('X', 'P')]
multi_time = time.time() - start
start = time.time()
result2 = large_data[(large_data['B'] == 'X') & (large_data['C'] == 'P')]
boolean_time = time.time() - start
print(f"\nPerformance comparison:")
print(f"Multi-index access: {multi_time:.6f}s")
print(f"Boolean indexing: {boolean_time:.6f}s")
print(f"Multi-index is {boolean_time/multi_time:.1f}x faster")
compare_index_performance()
Master Advanced Data Structures
Explore advanced pandas indexing patterns, learn hierarchical data modeling, and discover dimensional data analysis techniques.
Share this article
Add Comment
No comments yet. Be the first to comment!