Table Of Contents
- Categories to Numbers
- Label Encoding
- One-Hot Encoding
- Pandas get_dummies
- Ordinal Encoding
- Target Encoding
- Handling High Cardinality
- Complete Preprocessing Pipeline
- Binary Encoding
- Best Practices
- Master Feature Engineering
Categories to Numbers
Machine learning algorithms need numbers, not categories. Master encoding techniques to convert categorical variables into ML-ready numerical features.
Label Encoding
# Label encoding demo: map each category string to an integer code.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Small example frame: two categorical columns plus one numeric column.
data = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'size': ['S', 'M', 'L', 'M', 'S'],
    'price': [10, 15, 20, 12, 8]
})

# LabelEncoder assigns codes in sorted (alphabetical) class order,
# which makes it a reasonable fit for ordinal-style columns.
size_encoder = LabelEncoder()
data['size_encoded'] = size_encoder.fit_transform(data['size'])
print("Label encoding (size):")
print(data[['size', 'size_encoded']])
print("Classes:", size_encoder.classes_)
One-Hot Encoding
# One-hot encode the nominal 'color' column: one binary column per category.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)  # dense ndarray instead of sparse matrix
one_hot = encoder.fit_transform(data[['color']])

# Wrap the array in a frame with readable column names for display.
color_df = pd.DataFrame(one_hot, columns=encoder.get_feature_names_out(['color']))
print("One-hot encoding (color):")
print(pd.concat([data[['color']], color_df], axis=1))
Pandas get_dummies
# pd.get_dummies is the quickest route to one-hot columns.
print("Pandas get_dummies:")
print(pd.get_dummies(data, columns=['color', 'size']))

# The prefix argument controls how the generated column names begin.
print("\nWith prefix:")
print(pd.get_dummies(data, columns=['color'], prefix='color').head())
Ordinal Encoding
# OrdinalEncoder lets you state the category order explicitly,
# unlike LabelEncoder's alphabetical codes.
from sklearn.preprocessing import OrdinalEncoder

size_ordinal_enc = OrdinalEncoder(categories=[['S', 'M', 'L']])  # S < M < L
data['size_ordinal'] = size_ordinal_enc.fit_transform(data[['size']])
print("Ordinal encoding:")
print(data[['size', 'size_ordinal']])
Target Encoding
# Target encoding (mean encoding)
def target_encode(X_train, y_train, X_test, column):
    """Mean-encode a categorical column using training-fold statistics.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame containing `column`.
    y_train : pandas.Series of target values, index-aligned with X_train.
    column : str, name of the categorical column to encode.

    Returns
    -------
    (train_encoded, test_encoded) : pair of pandas.Series where each
    category is replaced by the mean target observed for it in training.

    Statistics are fitted on the training split only, so no target
    information leaks into the test encoding; categories unseen during
    training fall back to the overall training mean.
    """
    # Group the target by the category values. This works whether or not
    # the target happens to be a column of X_train — the original
    # `X_train.groupby(column)[y_train.name]` required it to be.
    means = y_train.groupby(X_train[column]).mean()
    overall_mean = y_train.mean()
    X_train_encoded = X_train[column].map(means)
    # Unseen test categories map to NaN -> replace with the global mean.
    X_test_encoded = X_test[column].map(means).fillna(overall_mean)
    return X_train_encoded, X_test_encoded

# Example with target variable
data_with_target = pd.DataFrame({
    'category': ['A', 'B', 'A', 'C', 'B', 'A'],
    'target': [10, 20, 15, 30, 25, 12]
})

# Simple positional split: first four rows train, remaining rows test.
train_data = data_with_target[:4]
test_data = data_with_target[4:]

# Apply target encoding
train_encoded, test_encoded = target_encode(
    train_data, train_data['target'], test_data, 'category'
)
print("Target encoding:")
print("Train:", train_encoded.values)
print("Test:", test_encoded.values)
Handling High Cardinality
# High cardinality: keep the most frequent categories, collapse the rest.
high_card_data = pd.DataFrame({
    'city': np.random.choice([f'City_{i}' for i in range(100)], 1000),
    'target': np.random.randn(1000)
})

# Top categories + "Other"
top_cities = high_card_data['city'].value_counts().head(10).index
# Vectorized replacement: isin + where runs in O(n) with hashed lookups,
# unlike the original per-row apply with an `in` test against an Index.
high_card_data['city_grouped'] = high_card_data['city'].where(
    high_card_data['city'].isin(top_cities), 'Other'
)
print(f"Original categories: {high_card_data['city'].nunique()}")
print(f"Grouped categories: {high_card_data['city_grouped'].nunique()}")
Complete Preprocessing Pipeline
# Build one preprocessing pipeline covering numeric, ordinal and nominal
# columns at the same time.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Synthetic dataset mixing the three feature kinds plus a binary target.
mixed_data = pd.DataFrame({
    'numerical': np.random.randn(1000),
    'ordinal': np.random.choice(['Low', 'Medium', 'High'], 1000),
    'nominal': np.random.choice(['A', 'B', 'C', 'D'], 1000),
    'target': np.random.randint(0, 2, 1000)
})

# Separate the target from the feature matrix.
y = mixed_data['target']
X = mixed_data.drop('target', axis=1)

# Hold out 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each transformer only touches its own column list: scale the numeric
# column, order-encode the ordinal one, one-hot the nominal one
# (dropping the first level to avoid collinearity).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['numerical']),
        ('ord', OrdinalEncoder(categories=[['Low', 'Medium', 'High']]), ['ordinal']),
        ('nom', OneHotEncoder(drop='first'), ['nominal'])
    ]
)

# Fit the statistics on the training split only, then reuse them on test.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Original shape: {X_train.shape}")
print(f"Processed shape: {X_train_processed.shape}")
Binary Encoding
# Binary encoding: write each category's integer code in base 2, giving
# ceil(log2(k)) columns instead of k one-hot columns.
def binary_encode(series):
    """Binary-encode a sequence of categorical values.

    Each distinct value is assigned an integer code in sorted order
    (the same alphabetical scheme LabelEncoder uses, without needing
    sklearn), and the code is written out as a fixed-width row of bits.

    Returns a numpy int array of shape (len(series), n_bits); an empty
    input yields an empty array instead of raising on max() of an
    empty sequence, as the original did.
    """
    code_for = {value: i for i, value in enumerate(sorted(set(series)))}
    encoded = [code_for[value] for value in series]
    # Width of the widest code; keep at least one bit so a single
    # category (code 0) still produces a column.
    n_bits = max(max(encoded, default=0).bit_length(), 1)
    binary_matrix = np.array([
        [int(b) for b in format(val, f'0{n_bits}b')]
        for val in encoded
    ])
    # reshape is a no-op for non-empty input; it gives empty input a
    # well-defined 2-D shape.
    return binary_matrix.reshape(len(encoded), n_bits)
# Example: five categories fit in three bits.
labels = ['A', 'B', 'C', 'D', 'E']
rows = binary_encode(labels)
print("Binary encoding:")
for label, row in zip(labels, rows):
    print(f"{label}: {row}")
Best Practices
- Ordinal data: Use OrdinalEncoder or LabelEncoder
- Nominal data: Use OneHotEncoder or get_dummies
- High cardinality: Group rare categories or use target encoding
- Always fit on training data only
- Handle unseen categories in test data
Master Feature Engineering
Explore advanced preprocessing techniques, learn feature selection methods, and discover automated ML pipelines.
Share this article
Add Comment
No comments yet. Be the first to comment!