Navigation

Python

How to Convert Data Types in Pandas

Transform your data with precision using pandas' comprehensive type conversion toolkit - from strings to numbers, dates to categories.

Table Of Contents

Type Precision Drives Performance

Wrong data types silently sabotage analysis and waste memory. Master pandas' type conversion functions to optimize both performance and accuracy in your data workflows.

Understanding Current Data Types

import pandas as pd
import numpy as np

# Everything arrives as strings here, so every column starts out as object dtype.
_raw_columns = {
    'user_id': ['001', '002', '003', '004', '005'],
    'age': ['25', '30', '35', 'unknown', '28'],
    'salary': ['50000.50', '60000', '75000.25', '55000', '68000.75'],
    'join_date': ['2023-01-15', '2022-06-20', '2024-03-10', '2023-08-05', '2024-01-20'],
    'is_active': ['True', 'False', 'True', 'True', 'False'],
    'department': ['IT', 'HR', 'IT', 'Finance', 'HR'],
}
mixed_data = pd.DataFrame(_raw_columns)

print("Original data types:")
print(mixed_data.dtypes)
print("\nData preview:")
print(mixed_data)

# Baseline footprint; deep=True also counts the Python string objects.
_before_bytes = mixed_data.memory_usage(deep=True).sum()
print(f"\nMemory usage before conversion: {_before_bytes} bytes")

Essential Type Conversions

# Convert to numeric types
df_converted = mixed_data.copy()

# user_id: digits only, so a plain conversion is safe (leading zeros are dropped).
df_converted['user_id'] = pd.to_numeric(df_converted['user_id'])

# salary: coerce anything unparseable to NaN instead of raising.
df_converted['salary'] = pd.to_numeric(df_converted['salary'], errors='coerce')

# age: 'unknown' cannot be parsed, so errors='coerce' turns it into NaN.
df_converted['age'] = pd.to_numeric(df_converted['age'], errors='coerce')

print("After numeric conversions:")
print(df_converted.dtypes)
print(df_converted)

# Convert boolean column.
# BUG FIX: astype('bool') on a string column marks EVERY non-empty string as
# True (bool('False') is True), silently corrupting the data. Map the two
# string literals explicitly instead.
df_converted['is_active'] = df_converted['is_active'].map({'True': True, 'False': False})

# Convert to datetime
df_converted['join_date'] = pd.to_datetime(df_converted['join_date'])

print("\nAfter all basic conversions:")
print(df_converted.dtypes)
print(f"Memory usage after conversion: {df_converted.memory_usage(deep=True).sum()} bytes")

Advanced Type Optimization

# Optimize integer types based on data range
def optimize_integers(df, column):
    """Return the name of the smallest integer dtype that can hold df[column].

    NaN values are ignored when computing the range. Returns 'int64' for an
    empty / all-NaN column (the original code's min()/max() would raise
    there), which is a safe, backward-compatible default.
    """
    col_data = df[column].dropna()
    if col_data.empty:
        return 'int64'

    lo, hi = col_data.min(), col_data.max()

    if lo >= 0:
        # Unsigned candidates, narrowest first; np.iinfo supplies the exact
        # bounds instead of hard-coded magic numbers.
        for dtype in ('uint8', 'uint16', 'uint32'):
            if hi <= np.iinfo(dtype).max:
                return dtype
        return 'uint64'

    # Signed candidates, narrowest first.
    for dtype in ('int8', 'int16', 'int32'):
        info = np.iinfo(dtype)
        if lo >= info.min and hi <= info.max:
            return dtype
    return 'int64'

# Create sample data for optimization
# Random integer columns whose value ranges allow much narrower dtypes than
# the int64 default that randint produces.
large_data = pd.DataFrame({
    'small_int': np.random.randint(0, 100, 10000),      # fits in uint8
    'medium_int': np.random.randint(0, 50000, 10000),   # fits in uint16
    'scores': np.random.randint(0, 1000, 10000),        # fits in uint16
    'negative_vals': np.random.randint(-50, 50, 10000)  # fits in int8
})

print("Before optimization:")
print(large_data.dtypes)
print(f"Memory usage: {large_data.memory_usage(deep=True).sum():,} bytes")

# Downcast every column to the narrowest integer dtype that holds its range.
for column_name in large_data.columns:
    best_dtype = optimize_integers(large_data, column_name)
    large_data[column_name] = large_data[column_name].astype(best_dtype)

print("\nAfter optimization:")
print(large_data.dtypes)
print(f"Memory usage: {large_data.memory_usage(deep=True).sum():,} bytes")

Category Type for Repeated Strings

# Sample data with repeated categorical values
# Sample data with repeated categorical values.
# BUG FIX: 'name' originally held only 5 values while the other columns held
# 5 * 2000 = 10,000, so the DataFrame constructor raised
# "ValueError: All arrays must be of the same length". Repeat 'name' too.
employee_data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'] * 2000,
    'department': ['IT', 'HR', 'IT', 'Finance', 'HR'] * 2000,  # Repeated values
    'level': ['Junior', 'Senior', 'Mid', 'Senior', 'Junior'] * 2000,
    'location': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'] * 2000
})

print("String columns memory usage:")
string_memory = employee_data.select_dtypes(include=['object']).memory_usage(deep=True).sum()
print(f"String columns: {string_memory:,} bytes")

# Convert the low-cardinality columns to category: each cell then stores a
# small integer code instead of a full Python string object.
categorical_data = employee_data.copy()
for col in ['department', 'level', 'location']:
    categorical_data[col] = categorical_data[col].astype('category')

print("\nAfter converting to categorical:")
print(categorical_data.dtypes)
categorical_memory = categorical_data.memory_usage(deep=True).sum()
print(f"Total memory: {categorical_memory:,} bytes")
print(f"Memory saved: {string_memory - categorical_memory:,} bytes ({((string_memory - categorical_memory) / string_memory * 100):.1f}%)")

# Category information
print(f"\nDepartment categories: {categorical_data['department'].cat.categories.tolist()}")
print(f"Number of categories: {len(categorical_data['department'].cat.categories)}")

Date and Time Conversions

# Various date formats
# Three string encodings of the same calendar dates, plus a full timestamp.
date_formats = pd.DataFrame({
    'date_str1': ['2025-01-15', '2025-02-20', '2025-03-25'],
    'date_str2': ['15/01/2025', '20/02/2025', '25/03/2025'],
    'date_str3': ['Jan 15, 2025', 'Feb 20, 2025', 'Mar 25, 2025'],
    'timestamp': ['2025-01-15 14:30:00', '2025-02-20 09:15:30', '2025-03-25 18:45:15']
})

dates_converted = date_formats.copy()

# ISO dates, "Month day, year" text and full timestamps are all recognised
# automatically; only the day-first European form needs an explicit format.
dates_converted['date_str1'] = pd.to_datetime(dates_converted['date_str1'])
dates_converted['date_str2'] = pd.to_datetime(dates_converted['date_str2'], format='%d/%m/%Y')
dates_converted['date_str3'] = pd.to_datetime(dates_converted['date_str3'])
dates_converted['timestamp'] = pd.to_datetime(dates_converted['timestamp'])

print("Converted date columns:")
print(dates_converted.dtypes)
print(dates_converted)

# Pull calendar components out through the .dt accessor.
dates_converted['year'] = dates_converted['timestamp'].dt.year
dates_converted['month'] = dates_converted['timestamp'].dt.month
dates_converted['day_of_week'] = dates_converted['timestamp'].dt.day_name()

print("\nWith extracted components:")
print(dates_converted[['timestamp', 'year', 'month', 'day_of_week']])

Error Handling in Type Conversion

# Data with conversion errors
# Strings that trip up naive conversions: non-numeric tokens, impossible
# dates, empty cells, and free-form boolean spellings.
problematic_data = pd.DataFrame({
    'numbers': ['123', '456', 'not_a_number', '789', ''],
    'dates': ['2025-01-01', 'invalid_date', '2025-12-31', '2025-02-30', ''],
    'booleans': ['True', 'False', 'maybe', 'Yes', '1']
})

print("Problematic data:")
print(problematic_data)

# Safe numeric conversion
def safe_numeric_conversion(series, default_value=np.nan):
    """Convert *series* to numeric, printing any values that failed to parse.

    Unparseable entries are coerced to NaN and then replaced with
    *default_value*. BUG FIX: the parameter was previously accepted but never
    used; applying it with the default NaN keeps the original behavior.
    """
    converted = pd.to_numeric(series, errors='coerce')

    # An entry that was present in the input but came back NaN failed to parse.
    failures = series[converted.isna() & series.notna()]
    if len(failures) > 0:
        print(f"Conversion errors found: {failures.tolist()}")

    if not pd.isna(default_value):
        converted = converted.fillna(default_value)
    return converted

# Safe date conversion
def safe_date_conversion(series, default_value=pd.NaT):
    """Convert *series* to datetime, printing any values that failed to parse.

    Unparseable entries are coerced to NaT and then replaced with
    *default_value*. BUG FIX: the parameter was previously accepted but never
    used; applying it with the default NaT keeps the original behavior.
    """
    converted = pd.to_datetime(series, errors='coerce')

    # An entry that was present in the input but came back NaT failed to parse.
    failures = series[converted.isna() & series.notna()]
    if len(failures) > 0:
        print(f"Date conversion errors: {failures.tolist()}")

    if not pd.isna(default_value):
        converted = converted.fillna(default_value)
    return converted

# Run the matching helper over each messy column; bad cells surface as NaN / NaT.
safe_converted = problematic_data.copy()
for column_name, converter in (('numbers', safe_numeric_conversion),
                               ('dates', safe_date_conversion)):
    safe_converted[column_name] = converter(safe_converted[column_name])

print("\nAfter safe conversion:")
print(safe_converted)
print(safe_converted.dtypes)

Custom Type Conversion Functions

def smart_type_inference(df):
    """Automatically infer and convert optimal data types.

    Tries, in order, for each non-numeric/non-datetime column: numeric
    (accepted when over 80% of entries parse, with integer downcasting for
    gap-free whole numbers), datetime (same threshold), boolean (all entries
    must be recognised spellings), then category (under 50% unique values).
    """
    result = df.copy()

    for name in result.columns:
        values = result[name]

        # Columns already parsed as numbers or timestamps are left alone.
        if values.dtype in ['int64', 'float64', 'datetime64[ns]']:
            continue

        # Numeric first: accept when more than 80% of entries parse.
        as_numbers = pd.to_numeric(values, errors='coerce')
        if as_numbers.notna().sum() > len(values) * 0.8:
            result[name] = as_numbers
            # Whole-number columns with no missing values get downcast to
            # the narrowest integer dtype that fits.
            if as_numbers.notna().all() and (as_numbers % 1 == 0).all():
                result[name] = result[name].astype(optimize_integers(result, name))
            continue

        # Datetime next, with the same 80% acceptance threshold.
        as_dates = pd.to_datetime(values, errors='coerce')
        if as_dates.notna().sum() > len(values) * 0.8:
            result[name] = as_dates
            continue

        # Boolean: only when every entry is a recognised true/false spelling.
        truthy_falsy = {'True': True, 'true': True, '1': True, 'Yes': True,
                        'False': False, 'false': False, '0': False, 'No': False}
        if values.isin(list(truthy_falsy)).all():
            result[name] = values.map(truthy_falsy)
            continue

        # Category: worthwhile when fewer than half of the entries are unique.
        if len(values.unique()) < len(values) * 0.5:
            result[name] = values.astype('category')

    return result

# Test with mixed data
def _report(label, frame):
    # Print dtypes plus the deep memory footprint under the given label.
    print(label)
    print(frame.dtypes)
    print(f"Memory: {frame.memory_usage(deep=True).sum()} bytes")

# One column of each convertible kind to exercise the inference helper.
test_data = pd.DataFrame({
    'integers': ['1', '2', '3', '4', '5'],
    'floats': ['1.5', '2.7', '3.8', '4.2', '5.1'],
    'dates': ['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04', '2025-01-05'],
    'categories': ['A', 'B', 'A', 'C', 'B'],
    'booleans': ['True', 'False', 'True', 'False', 'True']
})

_report("Before smart inference:", test_data)

smart_converted = smart_type_inference(test_data)
_report("\nAfter smart inference:", smart_converted)

Performance Impact Analysis

import time

# Deterministic 100k-row frame for timing a groupby on string vs category keys.
np.random.seed(42)
large_test_data = pd.DataFrame({
    'category_col': np.random.choice(['A', 'B', 'C', 'D'], 100000),
    'float_col': np.random.randn(100000),
    'int_col': np.random.randint(0, 1000, 100000)
})

# Performance test: string vs category
print("Performance comparison: string vs category")

# FIX: time.perf_counter() instead of time.time() — time.time() has coarse
# resolution on some platforms (~15 ms on Windows), so a fast groupby can
# measure as exactly 0 and make the ratio below raise ZeroDivisionError.
start_time = time.perf_counter()
string_groupby = large_test_data.groupby('category_col')['float_col'].mean()
string_time = time.perf_counter() - start_time

# Same aggregation after converting the key column to category.
large_test_data['category_col'] = large_test_data['category_col'].astype('category')
start_time = time.perf_counter()
category_groupby = large_test_data.groupby('category_col')['float_col'].mean()
category_time = time.perf_counter() - start_time

print(f"String groupby time: {string_time:.4f}s")
print(f"Category groupby time: {category_time:.4f}s")
print(f"Performance improvement: {string_time/category_time:.2f}x faster")

Type Conversion Best Practices

  1. Analyze data first - understand your data before converting
  2. Handle errors gracefully - use errors='coerce' for safe conversion
  3. Optimize memory usage - choose appropriate integer and float types
  4. Use categories for repeated strings - significant memory savings
  5. Validate after conversion - ensure data integrity is maintained

Master Data Optimization

Explore advanced pandas performance tuning, learn memory profiling techniques, and discover data pipeline optimization strategies.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python