Navigation

Python

How to Handle Categorical Data in Pandas

Transform string-heavy datasets into memory-efficient, high-performance categorical data structures with pandas' powerful categorical dtype

Table Of Contents

Categories Unlock Hidden Performance

Repeated string values silently drain memory and slow operations. Pandas' categorical dtype transforms redundant text into lightning-fast, memory-efficient data structures that supercharge your analysis.

Understanding Categorical Data

import pandas as pd
import numpy as np

# Create sample data with repeated string values to demonstrate the
# memory savings of the categorical dtype.
customer_data = pd.DataFrame({
    'customer_id': range(1, 1001),
    'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], 1000),
    'plan_type': np.random.choice(['Basic', 'Premium', 'Enterprise'], 1000),
    'status': np.random.choice(['Active', 'Inactive', 'Suspended'], 1000),
    'support_level': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], 1000),
    'spend_category': np.random.choice(['Low', 'Medium', 'High'], 1000)
})

print("Original data with string columns:")
print(customer_data.head())
# deep=True is needed to count the actual string payloads, not just the
# object pointers. Compute the total once instead of re-summing it in
# every f-string below.
string_bytes = customer_data.memory_usage(deep=True).sum()
print(f"\nMemory usage: {string_bytes:,} bytes")
print(f"Data types:\n{customer_data.dtypes}")

# Convert every repeated-string column to the categorical dtype.
categorical_data = customer_data.copy()
string_columns = ['city', 'plan_type', 'status', 'support_level', 'spend_category']

for col in string_columns:
    categorical_data[col] = categorical_data[col].astype('category')

categorical_bytes = categorical_data.memory_usage(deep=True).sum()
saved_bytes = string_bytes - categorical_bytes
print(f"\nAfter converting to categorical:")
print(f"Memory usage: {categorical_bytes:,} bytes")
print(f"Memory saved: {saved_bytes:,} bytes")
print(f"Percentage saved: {saved_bytes / string_bytes * 100:.1f}%")

Creating and Manipulating Categories

# Method 1: Converting existing columns
print("\n=== Creating Categorical Data ===")

# Simple conversion: the distinct labels are stored once as categories;
# each value becomes a small integer code into that index.
simple_cat = pd.Categorical(['A', 'B', 'A', 'C', 'B', 'A'])
print(f"Simple categorical: {simple_cat}")
print(f"Categories: {simple_cat.categories}")
print(f"Codes: {simple_cat.codes}")

# Creating with explicitly specified categories and a meaningful order,
# which enables <, <=, min/max comparisons on the labels.
priority_levels = pd.Categorical(
    ['High', 'Low', 'Medium', 'High', 'Low'],
    categories=['Low', 'Medium', 'High'],
    ordered=True
)
print(f"\nOrdered categorical: {priority_levels}")
print(f"Is ordered: {priority_levels.ordered}")

# Method 2: Creating a DataFrame with categorical columns up front.
# Declaring 'D' and 'F' keeps them valid grades even though no row uses
# them yet.
df_with_categories = pd.DataFrame({
    'grade': pd.Categorical(['A', 'B', 'C', 'A', 'B'],
                            categories=['A', 'B', 'C', 'D', 'F'],
                            ordered=True),
    'department': pd.Categorical(['IT', 'HR', 'Finance', 'IT', 'HR'])
})

print(f"\nDataFrame with categorical columns:")
print(df_with_categories)
print(f"Grade categories: {df_with_categories['grade'].cat.categories}")
# Unused categories = declared categories minus the values actually
# present; Index.difference is clearer than a boolean-mask lookup.
unused_grades = df_with_categories['grade'].cat.categories.difference(
    df_with_categories['grade'].dropna().tolist()
)
print(f"Unused categories: {unused_grades}")

Category Management Operations

# Working with the categorical accessor (.cat)
print("\n=== Category Management ===")

# Add a new category. NOTE: the `inplace=True` keyword was removed from
# the .cat methods in pandas 2.0, so assign the result back instead.
categorical_data['plan_type'] = categorical_data['plan_type'].cat.add_categories(['VIP'])
print(f"Plan type categories after adding VIP: {categorical_data['plan_type'].cat.categories}")

# Remove categories that no longer appear in the data.
original_categories = categorical_data['city'].cat.categories
categorical_data['city'] = categorical_data['city'].cat.remove_unused_categories()
print(f"City categories after removing unused: {categorical_data['city'].cat.categories}")

# Rename categories via an old-name -> new-name mapping.
categorical_data['status'] = categorical_data['status'].cat.rename_categories({
    'Active': 'ACTIVE',
    'Inactive': 'INACTIVE',
    'Suspended': 'SUSPENDED'
})
print(f"Renamed status categories: {categorical_data['status'].cat.categories}")

# Reorder categories and mark the column as ordered so comparisons work.
categorical_data['support_level'] = categorical_data['support_level'].cat.reorder_categories(
    ['Bronze', 'Silver', 'Gold', 'Platinum'],
    ordered=True
)
print(f"Support level (ordered): {categorical_data['support_level'].cat.categories}")
print(f"Is ordered: {categorical_data['support_level'].cat.ordered}")

# Replace the category set entirely. Values not in the new set would
# become NaN; here all three existing labels are kept.
new_spend_categories = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
categorical_data['spend_category'] = categorical_data['spend_category'].cat.set_categories(
    new_spend_categories,
    ordered=True
)
print(f"New spend categories: {categorical_data['spend_category'].cat.categories}")

Performance Benefits of Categorical Data

import time

# Performance comparison: categorical vs string operations.
# time.perf_counter() is the right clock for benchmarking: it is
# monotonic and has higher resolution than time.time(), which can even
# jump backwards if the wall clock is adjusted.
print("\n=== Performance Comparison ===")

# Create a larger dataset so the timings are meaningful.
large_string_data = pd.DataFrame({
    'category': np.random.choice(['Category_A', 'Category_B', 'Category_C', 'Category_D'], 100000),
    'value': np.random.randn(100000)
})

large_categorical_data = large_string_data.copy()
large_categorical_data['category'] = large_categorical_data['category'].astype('category')

# Test 1: Groupby operations
print("Groupby operation performance:")

start_time = time.perf_counter()
string_grouped = large_string_data.groupby('category')['value'].mean()
string_time = time.perf_counter() - start_time

start_time = time.perf_counter()
categorical_grouped = large_categorical_data.groupby('category')['value'].mean()
categorical_time = time.perf_counter() - start_time

print(f"String groupby: {string_time:.6f}s")
print(f"Categorical groupby: {categorical_time:.6f}s")
print(f"Categorical is {string_time/categorical_time:.2f}x faster")

# Test 2: Value counts
print("\nValue counts performance:")

start_time = time.perf_counter()
string_counts = large_string_data['category'].value_counts()
string_count_time = time.perf_counter() - start_time

start_time = time.perf_counter()
categorical_counts = large_categorical_data['category'].value_counts()
categorical_count_time = time.perf_counter() - start_time

print(f"String value_counts: {string_count_time:.6f}s")
print(f"Categorical value_counts: {categorical_count_time:.6f}s")
print(f"Categorical is {string_count_time/categorical_count_time:.2f}x faster")

Ordered Categorical Analysis

# Working with ordered categories
print("\n=== Ordered Categorical Analysis ===")

# Build a survey-style dataset whose satisfaction column is an ordered
# categorical, so the labels themselves can be compared with >= and <=.
satisfaction_levels = ['Very Dissatisfied', 'Dissatisfied', 'Neutral', 'Satisfied', 'Very Satisfied']
satisfaction_data = pd.DataFrame({
    'customer_id': range(1, 1001),
    'satisfaction': pd.Categorical(
        np.random.choice(satisfaction_levels, 1000),
        categories=satisfaction_levels,
        ordered=True
    ),
    'purchase_amount': np.random.uniform(10, 1000, 1000)
})

print("Satisfaction data:")
print(satisfaction_data['satisfaction'].value_counts().sort_index())

# Because the dtype is ordered, label comparisons are meaningful.
high_mask = satisfaction_data['satisfaction'] >= 'Satisfied'
high_satisfaction = satisfaction_data[high_mask]
print(f"\nCustomers with high satisfaction: {len(high_satisfaction)}")

low_mask = satisfaction_data['satisfaction'] <= 'Dissatisfied'
low_satisfaction = satisfaction_data[low_mask]
print(f"Customers with low satisfaction: {len(low_satisfaction)}")

# Summary statistics of spend per satisfaction level.
satisfaction_summary = satisfaction_data.groupby('satisfaction')['purchase_amount'].agg(['count', 'mean', 'std'])
print("\nPurchase analysis by satisfaction level:")
print(satisfaction_summary.round(2))

# The integer codes of an ordered categorical provide a rank that can be
# correlated against a numeric column.
correlation = satisfaction_data['satisfaction'].cat.codes.corr(satisfaction_data['purchase_amount'])
print(f"\nCorrelation between satisfaction and purchase amount: {correlation:.3f}")

Advanced Categorical Operations

# Combining and manipulating categorical data
print("\n=== Advanced Categorical Operations ===")

# Two frames whose 'grade' columns declare different category sets.
df1 = pd.DataFrame({
    'grade': pd.Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C', 'D', 'F']),
    'score': [95, 85, 75]
})

df2 = pd.DataFrame({
    'grade': pd.Categorical(['B', 'D', 'A'], categories=['A', 'B', 'C', 'D']),  # Different categories
    'score': [88, 65, 92]
})

print("DataFrame 1 categories:", df1['grade'].cat.categories)
print("DataFrame 2 categories:", df2['grade'].cat.categories)

# Demonstrate what happens when categoricals with mismatched category
# sets are concatenated directly (the result may not stay categorical,
# in which case the .cat access below fails and is reported).
try:
    combined = pd.concat([df1, df2], ignore_index=True)
    print("\nCombined DataFrame:")
    print(combined)
    print("Combined categories:", combined['grade'].cat.categories)
except Exception as e:
    print(f"Error combining: {e}")

# Align both frames on the union of their category sets so the
# concatenated result keeps a single consistent categorical dtype.
union_categories = df1['grade'].cat.categories.union(df2['grade'].cat.categories)
df1_aligned, df2_aligned = df1.copy(), df2.copy()
for frame in (df1_aligned, df2_aligned):
    frame['grade'] = frame['grade'].cat.set_categories(union_categories)

combined_safe = pd.concat([df1_aligned, df2_aligned], ignore_index=True)
print("\nSafely combined DataFrame:")
print(combined_safe)

# Map grades to human-readable descriptions.
grade_mapping = {'A': 'Excellent', 'B': 'Good', 'C': 'Average', 'D': 'Poor', 'F': 'Fail'}
combined_safe['grade_description'] = combined_safe['grade'].map(grade_mapping)
print("\nWith grade descriptions:")
print(combined_safe)

Missing Values in Categorical Data

# Handling missing values with categorical data
print("\n=== Missing Values in Categorical Data ===")

# Create data with missing values. In a categorical, NaN is never itself
# a category.
data_with_missing = pd.DataFrame({
    'category': pd.Categorical(['A', 'B', None, 'A', 'C', None, 'B']),
    'value': [1, 2, 3, 4, 5, 6, 7]
})

print("Data with missing categorical values:")
print(data_with_missing)
print(f"Missing values: {data_with_missing['category'].isnull().sum()}")

# fillna() on a categorical only accepts an existing category, so
# register 'Unknown' first, then fill.
data_with_missing['category'] = data_with_missing['category'].cat.add_categories(['Unknown'])
data_with_missing['category'] = data_with_missing['category'].fillna('Unknown')

print("\nAfter filling missing values:")
print(data_with_missing)
print("Value counts:")
print(data_with_missing['category'].value_counts())

# Drop the placeholder rows again. Use .copy() so the filtered frame
# owns its data -- assigning a column into a slice of another frame
# triggers SettingWithCopyWarning and the write may not stick.
data_cleaned = data_with_missing[data_with_missing['category'] != 'Unknown'].copy()
data_cleaned['category'] = data_cleaned['category'].cat.remove_unused_categories()

print(f"\nAfter removing 'Unknown': {len(data_cleaned)} rows")
print("Remaining categories:", data_cleaned['category'].cat.categories)

Real-World Business Applications

# E-commerce categorical analysis
print("\n=== Business Application: E-commerce Analysis ===")

# Create realistic e-commerce data; segment and rating are ordered
# categoricals so they sort and compare meaningfully.
ecommerce_data = pd.DataFrame({
    'customer_id': range(1, 5001),
    'customer_segment': pd.Categorical(
        np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], 5000, p=[0.4, 0.3, 0.2, 0.1]),
        categories=['Bronze', 'Silver', 'Gold', 'Platinum'],
        ordered=True
    ),
    'product_category': pd.Categorical(
        np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], 5000)
    ),
    'purchase_channel': pd.Categorical(
        np.random.choice(['Online', 'Mobile', 'Store'], 5000)
    ),
    'satisfaction_rating': pd.Categorical(
        np.random.choice(['1-Poor', '2-Fair', '3-Good', '4-Very Good', '5-Excellent'], 5000),
        categories=['1-Poor', '2-Fair', '3-Good', '4-Very Good', '5-Excellent'],
        ordered=True
    ),
    'purchase_amount': np.random.uniform(10, 2000, 5000)
})

print("E-commerce data sample:")
print(ecommerce_data.head())

# Business analysis with categorical data. observed=False is passed
# explicitly: the default for categorical groupers is changing across
# pandas versions, and we want every segment listed.
print("\n1. Customer Segment Analysis:")
segment_analysis = ecommerce_data.groupby('customer_segment', observed=False)['purchase_amount'].agg(['count', 'mean', 'sum'])
print(segment_analysis.round(2))

print("\n2. Cross-tabulation: Segment vs Channel:")
segment_channel = pd.crosstab(ecommerce_data['customer_segment'],
                              ecommerce_data['purchase_channel'],
                              normalize='index') * 100
print(segment_channel.round(1))

print("\n3. Satisfaction by Segment:")
satisfaction_by_segment = pd.crosstab(ecommerce_data['customer_segment'],
                                      ecommerce_data['satisfaction_rating'],
                                      normalize='index') * 100
print(satisfaction_by_segment.round(1))

# Advanced analysis: Segment upgrade potential
print("\n4. Segment Upgrade Analysis:")
def analyze_upgrade_potential(group):
    """Summarize one customer segment's upgrade potential.

    Returns counts/percentages of high spenders (top spend quintile
    within the segment) and highly satisfied customers, plus the overlap
    of the two groups as 'upgrade_candidates'.
    """
    high_spenders = group[group['purchase_amount'] > group['purchase_amount'].quantile(0.8)]
    high_satisfaction = group[group['satisfaction_rating'] >= '4-Very Good']

    return pd.Series({
        'total_customers': len(group),
        'high_spenders_pct': len(high_spenders) / len(group) * 100,
        'high_satisfaction_pct': len(high_satisfaction) / len(group) * 100,
        'upgrade_candidates': len(high_spenders[high_spenders.index.isin(high_satisfaction.index)]),
        'avg_purchase': group['purchase_amount'].mean()
    })

# Select only the needed columns before apply() so the grouping column
# is not passed into the function (avoids pandas' include_groups
# deprecation warning).
upgrade_analysis = ecommerce_data.groupby('customer_segment', observed=False)[
    ['satisfaction_rating', 'purchase_amount']
].apply(analyze_upgrade_potential)
print(upgrade_analysis.round(2))

Categorical Data Best Practices

def categorical_best_practices():
    """Print a grouped checklist of best practices for categorical data."""

    sections = [
        ('When to Use Categorical', [
            "String columns with repeated values (< 50% unique)",
            "Ordered data (ratings, sizes, priorities)",
            "Fixed set of possible values (status, categories)",
            "Large datasets where memory matters",
            "Frequent groupby operations",
        ]),
        ('Performance Optimization', [
            "Convert to categorical before large operations",
            "Use ordered=True for ordinal data",
            "Remove unused categories periodically",
            "Consider categorical for string columns > 1MB",
            "Use category codes for numerical operations",
        ]),
        ('Data Integrity', [
            "Define all possible categories upfront when known",
            "Handle missing values explicitly",
            "Validate category consistency across datasets",
            "Document category meanings and order",
            "Use union categories when combining datasets",
        ]),
        ('Common Pitfalls', [
            "Don't use categorical for high-cardinality data",
            "Remember to align categories when concatenating",
            "Ordered comparisons only work with ordered=True",
            "Category changes don't automatically update existing data",
            "Be careful with category codes - they can change",
        ]),
    ]

    print("=== CATEGORICAL DATA BEST PRACTICES ===")
    for heading, tips in sections:
        print(f"\n{heading}:")
        for tip in tips:
            print(f"  ✅ {tip}")

categorical_best_practices()

# Utility function for automatic categorical conversion
def auto_categorize(df, threshold=0.5, min_size_mb=1):
    """Convert suitable object columns of ``df`` to the categorical dtype.

    A column is converted when its deep memory footprint is at least
    ``min_size_mb`` megabytes AND its unique-values-to-rows ratio is at
    most ``threshold`` (high-cardinality columns gain little from the
    categorical dtype).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.5
        Maximum allowed nunique/len ratio for a column to be converted.
    min_size_mb : float, default 1
        Minimum deep column size in megabytes worth converting.

    Returns
    -------
    tuple[pandas.DataFrame, list[dict]]
        The optimized copy and one stats dict per converted column.
    """
    df_optimized = df.copy()
    conversions = []

    # An empty frame has nothing to convert (and the uniqueness ratio
    # below would divide by zero).
    if len(df_optimized) == 0:
        return df_optimized, conversions

    for col in df_optimized.select_dtypes(include=['object']).columns:
        col_size_mb = df_optimized[col].memory_usage(deep=True) / 1024 / 1024
        uniqueness_ratio = df_optimized[col].nunique() / len(df_optimized)

        if col_size_mb >= min_size_mb and uniqueness_ratio <= threshold:
            memory_before = df_optimized[col].memory_usage(deep=True)
            df_optimized[col] = df_optimized[col].astype('category')
            memory_after = df_optimized[col].memory_usage(deep=True)

            conversions.append({
                'column': col,
                'uniqueness_ratio': uniqueness_ratio,
                'memory_saved_mb': (memory_before - memory_after) / 1024 / 1024,
                # Categories are unique by construction, so len() suffices
                # (no need for nunique()).
                'categories_count': len(df_optimized[col].cat.categories)
            })

    return df_optimized, conversions

# Exercise the auto-categorization helper on the customer dataset.
optimized_data, conversion_results = auto_categorize(customer_data)

print(f"\nAuto-categorization results:")
if not conversion_results:
    print("No columns suitable for automatic categorization")
else:
    conversion_df = pd.DataFrame(conversion_results)
    print(conversion_df.round(3))

Master Advanced Pandas Features

Explore advanced data type optimization, learn memory-efficient data processing, and discover high-performance pandas patterns.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python