Table Of Contents
- Categories Unlock Hidden Performance
- Understanding Categorical Data
- Creating and Manipulating Categories
- Category Management Operations
- Performance Benefits of Categorical Data
- Ordered Categorical Analysis
- Advanced Categorical Operations
- Missing Values in Categorical Data
- Real-World Business Applications
- Categorical Data Best Practices
- Master Advanced Pandas Features
Categories Unlock Hidden Performance
Repeated string values silently drain memory and slow operations. Pandas' categorical dtype stores each distinct value only once and represents rows as small integer codes, cutting memory use and speeding up common operations such as groupby and value_counts.
Understanding Categorical Data
import pandas as pd
import numpy as np

# Build a demo customer table whose text columns contain heavily repeated
# values — exactly the shape of data the categorical dtype is designed for.
_N_CUSTOMERS = 1000
customer_data = pd.DataFrame({
    'customer_id': range(1, _N_CUSTOMERS + 1),
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], _N_CUSTOMERS),
    'plan_type': np.random.choice(['Basic', 'Premium', 'Enterprise'], _N_CUSTOMERS),
    'status': np.random.choice(['Active', 'Inactive', 'Suspended'], _N_CUSTOMERS),
    'support_level': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], _N_CUSTOMERS),
    'spend_category': np.random.choice(['Low', 'Medium', 'High'], _N_CUSTOMERS),
})
print("Original data with string columns:")
print(customer_data.head())

# Measure the footprint once and reuse it below (deep=True counts the strings).
mem_before = customer_data.memory_usage(deep=True).sum()
print(f"\nMemory usage: {mem_before:,} bytes")
print(f"Data types:\n{customer_data.dtypes}")

# Re-encode every text column as categorical on a copy, then compare footprints.
categorical_data = customer_data.copy()
string_columns = ['city', 'plan_type', 'status', 'support_level', 'spend_category']
for col in string_columns:
    categorical_data[col] = categorical_data[col].astype('category')

mem_after = categorical_data.memory_usage(deep=True).sum()
print(f"\nAfter converting to categorical:")
print(f"Memory usage: {mem_after:,} bytes")
print(f"Memory saved: {mem_before - mem_after:,} bytes")
print(f"Percentage saved: {((mem_before - mem_after) / mem_before * 100):.1f}%")
Creating and Manipulating Categories
# Method 1: Converting existing columns
print("\n=== Creating Categorical Data ===")

# A plain Categorical infers its categories from the data (sorted, unordered).
simple_cat = pd.Categorical(['A', 'B', 'A', 'C', 'B', 'A'])
print(f"Simple categorical: {simple_cat}")
print(f"Categories: {simple_cat.categories}")
print(f"Codes: {simple_cat.codes}")

# Supplying an explicit category list plus ordered=True yields an ordinal scale.
priority_levels = pd.Categorical(
    ['High', 'Low', 'Medium', 'High', 'Low'],
    categories=['Low', 'Medium', 'High'],
    ordered=True,
)
print(f"\nOrdered categorical: {priority_levels}")
print(f"Is ordered: {priority_levels.ordered}")

# Method 2: Creating DataFrame with categorical dtype
grade_values = pd.Categorical(
    ['A', 'B', 'C', 'A', 'B'],
    categories=['A', 'B', 'C', 'D', 'F'],
    ordered=True,
)
dept_values = pd.Categorical(['IT', 'HR', 'Finance', 'IT', 'HR'])
df_with_categories = pd.DataFrame({'grade': grade_values, 'department': dept_values})
print(f"\nDataFrame with categorical columns:")
print(df_with_categories)

grade_cats = df_with_categories['grade'].cat.categories
print(f"Grade categories: {grade_cats}")
# Declared categories that never occur in the actual data (here: 'D' and 'F').
unused_grades = grade_cats[~grade_cats.isin(df_with_categories['grade'])]
print(f"Unused categories: {unused_grades}")
Category Management Operations
# Working with the categorical accessor (.cat)
print("\n=== Category Management ===")

# Add a new category. FIX: `inplace=True` on .cat methods was deprecated in
# pandas 1.3 and removed in pandas 2.0, so the original call raises on modern
# pandas — assign the returned Series back to the column instead.
categorical_data['plan_type'] = categorical_data['plan_type'].cat.add_categories(['VIP'])
print(f"Plan type categories after adding VIP: {categorical_data['plan_type'].cat.categories}")

# Drop categories that no longer occur in the data.
# (Removed the unused `original_categories` variable from the original.)
categorical_data['city'] = categorical_data['city'].cat.remove_unused_categories()
print(f"City categories after removing unused: {categorical_data['city'].cat.categories}")

# Rename categories via an old-name -> new-name mapping.
categorical_data['status'] = categorical_data['status'].cat.rename_categories({
    'Active': 'ACTIVE',
    'Inactive': 'INACTIVE',
    'Suspended': 'SUSPENDED'
})
print(f"Renamed status categories: {categorical_data['status'].cat.categories}")

# Impose an explicit order on the existing categories.
categorical_data['support_level'] = categorical_data['support_level'].cat.reorder_categories(
    ['Bronze', 'Silver', 'Gold', 'Platinum'],
    ordered=True
)
print(f"Support level (ordered): {categorical_data['support_level'].cat.categories}")
print(f"Is ordered: {categorical_data['support_level'].cat.ordered}")

# Replace the category list wholesale; any value absent from the new list
# would become NaN (here all existing values are retained).
new_spend_categories = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
categorical_data['spend_category'] = categorical_data['spend_category'].cat.set_categories(
    new_spend_categories,
    ordered=True
)
print(f"New spend categories: {categorical_data['spend_category'].cat.categories}")
Performance Benefits of Categorical Data
import time

# Performance comparison: categorical vs string operations
print("\n=== Performance Comparison ===")

# A larger frame so the timing differences are actually measurable.
large_string_data = pd.DataFrame({
    'category': np.random.choice(['Category_A', 'Category_B', 'Category_C', 'Category_D'], 100000),
    'value': np.random.randn(100000)
})
large_categorical_data = large_string_data.copy()
large_categorical_data['category'] = large_categorical_data['category'].astype('category')

# Test 1: groupby operations.
# FIX: use time.perf_counter() instead of time.time() — perf_counter is
# monotonic and has much higher resolution, which matters for short benchmarks.
print("Groupby operation performance:")
start_time = time.perf_counter()
string_grouped = large_string_data.groupby('category')['value'].mean()
string_time = time.perf_counter() - start_time
start_time = time.perf_counter()
categorical_grouped = large_categorical_data.groupby('category')['value'].mean()
categorical_time = time.perf_counter() - start_time
print(f"String groupby: {string_time:.6f}s")
print(f"Categorical groupby: {categorical_time:.6f}s")
print(f"Categorical is {string_time/categorical_time:.2f}x faster")

# Test 2: value_counts on the same column.
print("\nValue counts performance:")
start_time = time.perf_counter()
string_counts = large_string_data['category'].value_counts()
string_count_time = time.perf_counter() - start_time
start_time = time.perf_counter()
categorical_counts = large_categorical_data['category'].value_counts()
categorical_count_time = time.perf_counter() - start_time
print(f"String value_counts: {string_count_time:.6f}s")
print(f"Categorical value_counts: {categorical_count_time:.6f}s")
print(f"Categorical is {string_count_time/categorical_count_time:.2f}x faster")
Ordered Categorical Analysis
# Working with ordered categories
print("\n=== Ordered Categorical Analysis ===")

# A five-point Likert scale modelled as an ordered categorical column.
_scale = ['Very Dissatisfied', 'Dissatisfied', 'Neutral', 'Satisfied', 'Very Satisfied']
satisfaction_data = pd.DataFrame({
    'customer_id': range(1, 1001),
    'satisfaction': pd.Categorical(
        np.random.choice(_scale, 1000),
        categories=_scale,
        ordered=True,
    ),
    'purchase_amount': np.random.uniform(10, 1000, 1000),
})
print("Satisfaction data:")
print(satisfaction_data['satisfaction'].value_counts().sort_index())

# Because the scale is ordered, rows can be filtered with comparison operators.
high_satisfaction = satisfaction_data[satisfaction_data['satisfaction'] >= 'Satisfied']
print(f"\nCustomers with high satisfaction: {len(high_satisfaction)}")
low_satisfaction = satisfaction_data[satisfaction_data['satisfaction'] <= 'Dissatisfied']
print(f"Customers with low satisfaction: {len(low_satisfaction)}")

# Per-level purchase statistics.
satisfaction_summary = satisfaction_data.groupby('satisfaction')['purchase_amount'].agg(['count', 'mean', 'std'])
print("\nPurchase analysis by satisfaction level:")
print(satisfaction_summary.round(2))

# The integer codes behind an ordered categorical support numeric correlation.
correlation = satisfaction_data['satisfaction'].cat.codes.corr(satisfaction_data['purchase_amount'])
print(f"\nCorrelation between satisfaction and purchase amount: {correlation:.3f}")
Advanced Categorical Operations
# Combining and manipulating categorical data
print("\n=== Advanced Categorical Operations ===")

# Two small frames whose 'grade' columns declare different category lists.
df1 = pd.DataFrame({
    'grade': pd.Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C', 'D', 'F']),
    'score': [95, 85, 75],
})
df2 = pd.DataFrame({
    'grade': pd.Categorical(['B', 'D', 'A'], categories=['A', 'B', 'C', 'D']),  # Different categories
    'score': [88, 65, 92],
})
print("DataFrame 1 categories:", df1['grade'].cat.categories)
print("DataFrame 2 categories:", df2['grade'].cat.categories)

# Concatenating mismatched categoricals may silently fall back to object
# dtype, in which case the .cat access below raises — demonstrate that.
try:
    combined = pd.concat([df1, df2], ignore_index=True)
    print("\nCombined DataFrame:")
    print(combined)
    print("Combined categories:", combined['grade'].cat.categories)
except Exception as e:
    print(f"Error combining: {e}")

# The safe route: align both frames on the union of the category lists first.
union_categories = df1['grade'].cat.categories.union(df2['grade'].cat.categories)
df1_aligned = df1.copy()
df2_aligned = df2.copy()
for frame in (df1_aligned, df2_aligned):
    frame['grade'] = frame['grade'].cat.set_categories(union_categories)
combined_safe = pd.concat([df1_aligned, df2_aligned], ignore_index=True)
print("\nSafely combined DataFrame:")
print(combined_safe)

# Translate the letter grades into human-readable labels.
grade_mapping = {'A': 'Excellent', 'B': 'Good', 'C': 'Average', 'D': 'Poor', 'F': 'Fail'}
combined_safe['grade_description'] = combined_safe['grade'].map(grade_mapping)
print("\nWith grade descriptions:")
print(combined_safe)
Missing Values in Categorical Data
# Handling missing values with categorical data
print("\n=== Missing Values in Categorical Data ===")

# Build a small frame whose categorical column contains missing entries.
data_with_missing = pd.DataFrame({
    'category': pd.Categorical(['A', 'B', None, 'A', 'C', None, 'B']),
    'value': [1, 2, 3, 4, 5, 6, 7]
})
print("Data with missing categorical values:")
print(data_with_missing)
print(f"Missing values: {data_with_missing['category'].isnull().sum()}")

# A categorical can only hold values from its category list, so 'Unknown'
# must be registered as a category before it can be used as a fill value.
data_with_missing['category'] = data_with_missing['category'].cat.add_categories(['Unknown'])
data_with_missing['category'] = data_with_missing['category'].fillna('Unknown')
print("\nAfter filling missing values:")
print(data_with_missing)
print("Value counts:")
print(data_with_missing['category'].value_counts())

# Drop the sentinel rows again. BUG FIX: the original assigned into a
# boolean-mask selection, which is a chained assignment on a possible view —
# it triggers SettingWithCopyWarning and the write may not stick. Take an
# explicit .copy() before mutating the column.
data_cleaned = data_with_missing[data_with_missing['category'] != 'Unknown'].copy()
data_cleaned['category'] = data_cleaned['category'].cat.remove_unused_categories()
print(f"\nAfter removing 'Unknown': {len(data_cleaned)} rows")
print("Remaining categories:", data_cleaned['category'].cat.categories)
Real-World Business Applications
# E-commerce categorical analysis
print("\n=== Business Application: E-commerce Analysis ===")

# Synthesize a 5,000-customer order table; the segment and rating columns are
# ordered categoricals so they support ranked comparisons further down.
_segments = ['Bronze', 'Silver', 'Gold', 'Platinum']
_ratings = ['1-Poor', '2-Fair', '3-Good', '4-Very Good', '5-Excellent']
ecommerce_data = pd.DataFrame({
    'customer_id': range(1, 5001),
    'customer_segment': pd.Categorical(
        np.random.choice(_segments, 5000, p=[0.4, 0.3, 0.2, 0.1]),
        categories=_segments,
        ordered=True,
    ),
    'product_category': pd.Categorical(
        np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], 5000)
    ),
    'purchase_channel': pd.Categorical(
        np.random.choice(['Online', 'Mobile', 'Store'], 5000)
    ),
    'satisfaction_rating': pd.Categorical(
        np.random.choice(_ratings, 5000),
        categories=_ratings,
        ordered=True,
    ),
    'purchase_amount': np.random.uniform(10, 2000, 5000),
})
print("E-commerce data sample:")
print(ecommerce_data.head())

# 1) Spend statistics per customer tier.
print("\n1. Customer Segment Analysis:")
segment_analysis = ecommerce_data.groupby('customer_segment')['purchase_amount'].agg(['count', 'mean', 'sum'])
print(segment_analysis.round(2))

# 2) Row-normalized segment x channel percentages.
print("\n2. Cross-tabulation: Segment vs Channel:")
segment_channel = pd.crosstab(
    ecommerce_data['customer_segment'],
    ecommerce_data['purchase_channel'],
    normalize='index',
) * 100
print(segment_channel.round(1))

# 3) Row-normalized segment x satisfaction percentages.
print("\n3. Satisfaction by Segment:")
satisfaction_by_segment = pd.crosstab(
    ecommerce_data['customer_segment'],
    ecommerce_data['satisfaction_rating'],
    normalize='index',
) * 100
print(satisfaction_by_segment.round(1))

# Advanced analysis: Segment upgrade potential
print("\n4. Segment Upgrade Analysis:")
def analyze_upgrade_potential(group):
    """Summarize how ripe a customer-segment group is for a tier upgrade.

    Parameters
    ----------
    group : pd.DataFrame
        Must contain a numeric 'purchase_amount' column and an ordered
        categorical 'satisfaction_rating' column (so `>=` comparisons work).

    Returns
    -------
    pd.Series
        Customer count, percentage of top-quintile spenders, percentage of
        highly satisfied customers, the overlap of those two sets, and the
        mean purchase amount.
    """
    total = len(group)
    # FIX: a groupby over a categorical key can hand this function an empty
    # frame for an unobserved category; the original divided by len(group)
    # and would raise ZeroDivisionError. Return a zeroed summary instead.
    if total == 0:
        return pd.Series({
            'total_customers': 0,
            'high_spenders_pct': 0.0,
            'high_satisfaction_pct': 0.0,
            'upgrade_candidates': 0,
            'avg_purchase': float('nan')
        })
    # Top-quintile spenders within this segment.
    high_spenders = group[group['purchase_amount'] > group['purchase_amount'].quantile(0.8)]
    # Ordered-categorical comparison: rated '4-Very Good' or better.
    high_satisfaction = group[group['satisfaction_rating'] >= '4-Very Good']
    return pd.Series({
        'total_customers': total,
        'high_spenders_pct': len(high_spenders) / total * 100,
        'high_satisfaction_pct': len(high_satisfaction) / total * 100,
        # Customers who are both top spenders and highly satisfied.
        'upgrade_candidates': len(high_spenders[high_spenders.index.isin(high_satisfaction.index)]),
        'avg_purchase': group['purchase_amount'].mean()
    })
# Build and display the per-segment upgrade summary.
segment_groups = ecommerce_data.groupby('customer_segment')
upgrade_analysis = segment_groups.apply(analyze_upgrade_potential)
print(upgrade_analysis.round(2))
Categorical Data Best Practices
def categorical_best_practices():
    """Print a checklist of guidelines for working with the categorical dtype."""
    # Ordered (title, tips) pairs; printed in declaration order.
    sections = [
        ('When to Use Categorical', [
            "String columns with repeated values (< 50% unique)",
            "Ordered data (ratings, sizes, priorities)",
            "Fixed set of possible values (status, categories)",
            "Large datasets where memory matters",
            "Frequent groupby operations",
        ]),
        ('Performance Optimization', [
            "Convert to categorical before large operations",
            "Use ordered=True for ordinal data",
            "Remove unused categories periodically",
            "Consider categorical for string columns > 1MB",
            "Use category codes for numerical operations",
        ]),
        ('Data Integrity', [
            "Define all possible categories upfront when known",
            "Handle missing values explicitly",
            "Validate category consistency across datasets",
            "Document category meanings and order",
            "Use union categories when combining datasets",
        ]),
        ('Common Pitfalls', [
            "Don't use categorical for high-cardinality data",
            "Remember to align categories when concatenating",
            "Ordered comparisons only work with ordered=True",
            "Category changes don't automatically update existing data",
            "Be careful with category codes - they can change",
        ]),
    ]
    print("=== CATEGORICAL DATA BEST PRACTICES ===")
    for title, tips in sections:
        print(f"\n{title}:")
        for tip in tips:
            print(f" ✅ {tip}")

categorical_best_practices()
def auto_categorize(df, threshold=0.5, min_size_mb=1):
    """Convert low-cardinality object columns of *df* to the 'category' dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified (an optimized copy is returned).
    threshold : float, default 0.5
        Maximum allowed ratio of unique values to rows for a column to be
        converted.
    min_size_mb : float, default 1
        Only columns whose current deep memory footprint is at least this
        many megabytes are considered.

    Returns
    -------
    tuple[pd.DataFrame, list[dict]]
        The optimized copy and one stats dict per converted column.
    """
    df_optimized = df.copy()
    conversions = []
    n_rows = len(df_optimized)
    # FIX: the original divided by len(df) per column and raised
    # ZeroDivisionError on an empty frame with object columns.
    if n_rows == 0:
        return df_optimized, conversions
    for col in df_optimized.select_dtypes(include=['object']).columns:
        col_size_mb = df_optimized[col].memory_usage(deep=True) / 1024 / 1024
        uniqueness_ratio = df_optimized[col].nunique() / n_rows
        if col_size_mb >= min_size_mb and uniqueness_ratio <= threshold:
            memory_before = df_optimized[col].memory_usage(deep=True)
            df_optimized[col] = df_optimized[col].astype('category')
            memory_after = df_optimized[col].memory_usage(deep=True)
            conversions.append({
                'column': col,
                'uniqueness_ratio': uniqueness_ratio,
                'memory_saved_mb': (memory_before - memory_after) / 1024 / 1024,
                # Categories are unique by construction, so len() suffices
                # (the original's .nunique() did redundant work).
                'categories_count': len(df_optimized[col].cat.categories)
            })
    return df_optimized, conversions
# Run the automatic conversion against the demo customer table.
optimized_data, conversion_results = auto_categorize(customer_data)
print(f"\nAuto-categorization results:")
if not conversion_results:
    print("No columns suitable for automatic categorization")
else:
    conversion_df = pd.DataFrame(conversion_results)
    print(conversion_df.round(3))
Master Advanced Pandas Features
Explore advanced data type optimization, learn memory-efficient data processing, and discover high-performance pandas patterns.
Share this article
Add Comment
No comments yet. Be the first to comment!