Table Of Contents
- Missing Data, Not Missing Models
- Simple Imputation Strategies
- Numeric Imputation Pipeline
- Different Imputation Strategies
- Advanced Imputation Methods
- Mixed Data Type Imputation
- Custom Imputation Functions
- Missing Value Indicators
- Handling Missing Values in Cross-Validation
- Domain-Specific Imputation
- Evaluating Imputation Quality
- Production Pipeline Template
- Best Practices
- Master Data Quality
Missing Data, Not Missing Models
Missing values can break ML pipelines and bias results. Learn to handle missing data systematically within your ML workflows for reliable predictions.
Simple Imputation Strategies
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
# Build a 1000-row synthetic dataset, then knock out values so the
# imputation examples below have something to repair.
np.random.seed(42)

n_rows = 1000
data = pd.DataFrame({
    'numeric1': np.random.randn(n_rows),
    'numeric2': np.random.randn(n_rows),
    'categorical': np.random.choice(['A', 'B', 'C'], n_rows),
    'target': np.random.randint(0, 2, n_rows),
})

# Punch 150 holes: 50 per feature column, drawn from non-overlapping rows.
missing_indices = np.random.choice(n_rows, 150, replace=False)
for column, rows in zip(['numeric1', 'numeric2', 'categorical'],
                        np.split(missing_indices, 3)):
    data.loc[rows, column] = np.nan

print(f"Missing values per column:")
print(data.isnull().sum())

# Features vs. label.
X = data.drop('target', axis=1)
y = data['target']
Numeric Imputation Pipeline
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Median-impute, then fit a logistic regression — one estimator end to end.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # or 'mean', 'most_frequent'
    ('classifier', LogisticRegression(random_state=42)),
])

# Restrict to the numeric columns; the categorical column is handled later.
X_train_numeric = X_train[['numeric1', 'numeric2']]
X_test_numeric = X_test[['numeric1', 'numeric2']]

numeric_pipeline.fit(X_train_numeric, y_train)
score = numeric_pipeline.score(X_test_numeric, y_test)
print(f"Numeric pipeline accuracy: {score:.3f}")
Different Imputation Strategies
# Fit the same classifier under each imputation strategy and compare accuracy.
strategies = ['mean', 'median', 'most_frequent', 'constant']
results = {}
for strategy in strategies:
    # 'constant' needs an explicit fill value; the others are parameter-free.
    extra = {'fill_value': 0} if strategy == 'constant' else {}
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy=strategy, **extra)),
        ('classifier', LogisticRegression(random_state=42)),
    ])
    pipeline.fit(X_train_numeric, y_train)
    results[strategy] = pipeline.score(X_test_numeric, y_test)

print("Imputation strategy comparison:")
for strategy, score in results.items():
    print(f"{strategy}: {score:.3f}")
Advanced Imputation Methods
# IterativeImputer is experimental: the enable_* import must come first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

# Model-based imputer: round-robin regression over features (MICE-like).
iterative_pipeline = Pipeline([
    ('imputer', IterativeImputer(random_state=42)),
    ('classifier', LogisticRegression(random_state=42)),
])

# Distance-based imputer: average of the 5 nearest complete neighbors.
knn_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('classifier', LogisticRegression(random_state=42)),
])

# Fit and score both methods side by side.
advanced_methods = {
    'Iterative': iterative_pipeline,
    'KNN': knn_pipeline,
}
for name, pipeline in advanced_methods.items():
    pipeline.fit(X_train_numeric, y_train)
    score = pipeline.score(X_test_numeric, y_test)
    print(f"{name} imputation: {score:.3f}")
Mixed Data Type Imputation
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route each column group through its own imputation path.
numeric_features = ['numeric1', 'numeric2']
categorical_features = ['categorical']

# Categoricals: fill with the most frequent level, then one-hot encode.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', categorical_steps, categorical_features),
    ]
)

# Preprocessing + classifier as a single fit/score-able estimator.
mixed_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42)),
])
mixed_pipeline.fit(X_train, y_train)
mixed_score = mixed_pipeline.score(X_test, y_test)
print(f"Mixed data pipeline: {mixed_score:.3f}")
Custom Imputation Functions
from sklearn.base import BaseEstimator, TransformerMixin
class CustomImputer(BaseEstimator, TransformerMixin):
    """Impute missing values in a pandas DataFrame with per-column statistics.

    Parameters
    ----------
    strategy : str, default='median'
        One of 'median', 'mode', or 'interpolate'. 'interpolate' currently
        falls back to the column mean (demo placeholder).
    """

    def __init__(self, strategy='median'):
        self.strategy = strategy

    def fit(self, X, y=None):
        """Learn one fill value per column from the training frame."""
        if self.strategy == 'median':
            self.fill_values_ = X.median()
        elif self.strategy == 'mode':
            # mode() can return several rows; keep the first (most frequent).
            self.fill_values_ = X.mode().iloc[0]
        elif self.strategy == 'interpolate':
            # Simple linear interpolation
            self.fill_values_ = X.mean()  # Fallback for demo
        return self

    def transform(self, X):
        """Return a copy of X with NaNs replaced by the fitted fill values.

        BUGFIX: the original looped `X_imputed[col].fillna(..., inplace=True)`,
        which is chained assignment on a column view — deprecated in pandas
        2.x and silently a no-op under copy-on-write (pandas 3). Filling the
        whole frame with the fitted Series does the same per-column fill
        without mutating through a temporary.
        """
        return X.fillna(self.fill_values_)
# Drop the custom transformer into a standard sklearn Pipeline.
custom_pipeline = Pipeline([
    ('imputer', CustomImputer(strategy='median')),
    ('classifier', LogisticRegression(random_state=42)),
])
custom_pipeline.fit(X_train_numeric, y_train)
custom_score = custom_pipeline.score(X_test_numeric, y_test)
print(f"Custom imputation: {custom_score:.3f}")
Missing Value Indicators
from sklearn.impute import MissingIndicator
from sklearn.compose import make_column_transformer
# add_indicator=True appends one binary "was missing" column per feature,
# so the model can learn from the missingness pattern itself.
imputer_with_indicator = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('classifier', LogisticRegression(random_state=42)),
])
imputer_with_indicator.fit(X_train_numeric, y_train)
indicator_score = imputer_with_indicator.score(X_test_numeric, y_test)
print(f"Imputation with indicators: {indicator_score:.3f}")

# The indicator transformer can also be used stand-alone.
missing_indicator = MissingIndicator()
X_train_indicators = missing_indicator.fit_transform(X_train_numeric)
print(f"Missing indicators shape: {X_train_indicators.shape}")
Handling Missing Values in Cross-Validation
from sklearn.model_selection import cross_val_score

# Because imputation lives inside the pipeline, each CV fold re-fits the
# imputer on its own training split — no leakage across folds.
cv_scores = cross_val_score(
    mixed_pipeline, X_train, y_train, cv=5, scoring='accuracy'
)
print(f"CV scores with imputation: {cv_scores.round(3)}")
print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
Domain-Specific Imputation
def domain_specific_imputer(X, domain_rules=None):
    """Apply domain-specific imputation rules to a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data; it is copied, never mutated.
    domain_rules : dict or None, default=None
        Maps column name -> rule dict. Supported rule types:
        * ``{'type': 'business_logic', 'group_by': <col>}`` — fill NaNs
          with the median of the column computed within each group.
        * ``{'type': 'seasonal'}`` — forward-fill (carry last observation).

    Returns
    -------
    pandas.DataFrame
        Copy of X with the rules applied; columns without a rule keep
        their missing values.
    """
    X_imputed = X.copy()
    if domain_rules:
        for column, rule in domain_rules.items():
            if rule['type'] == 'business_logic':
                # Example: replace missing values with the per-group median.
                if 'group_by' in rule:
                    group_medians = X_imputed.groupby(rule['group_by'])[column].median()
                    for group, median in group_medians.items():
                        mask = (X_imputed[rule['group_by']] == group) & X_imputed[column].isnull()
                        X_imputed.loc[mask, column] = median
            elif rule['type'] == 'seasonal':
                # BUGFIX: the original `fillna(method='ffill', inplace=True)`
                # used the deprecated (now removed) `method=` keyword and
                # chained inplace assignment on a column view. Assigning
                # the result of .ffill() back gives identical behavior.
                X_imputed[column] = X_imputed[column].ffill()
    return X_imputed
# Example rule set: fill numeric1 with the median of its categorical group.
domain_rules = {
    'numeric1': {
        'type': 'business_logic',
        'group_by': 'categorical',
    },
}

# Apply the rules; numeric2 has no rule, so its NaNs remain.
X_domain = domain_specific_imputer(X_train, domain_rules)
print(f"Domain imputation - remaining missing: {X_domain.isnull().sum().sum()}")
Evaluating Imputation Quality
from sklearn.metrics import mean_squared_error
def evaluate_imputation_quality(X_original, X_imputed, missing_mask):
    """Score how well imputation recovered known values (lower is better).

    Parameters
    ----------
    X_original : pandas.DataFrame or array-like
        Ground-truth values (before cells were masked out).
    X_imputed : pandas.DataFrame or array-like
        Same shape as X_original, with the masked cells imputed.
    missing_mask : boolean DataFrame or array-like
        True at the cells that were artificially set to missing.

    Returns
    -------
    float
        Mean squared error over the masked cells only.
    """
    # BUGFIX: `X_original[missing_mask]` with a boolean DataFrame keeps the
    # full frame shape and fills unmasked cells with NaN, so the MSE would
    # be NaN / raise. Select only the masked entries as flat arrays instead.
    mask = np.asarray(missing_mask, dtype=bool)
    original_values = np.asarray(X_original)[mask]
    imputed_values = np.asarray(X_imputed)[mask]
    # Calculate MSE for continuous variables (same value sklearn's
    # mean_squared_error would return on these 1-D arrays).
    mse = float(np.mean((original_values - imputed_values) ** 2))
    return mse
# Create test case with known missing values
# NOTE(review): X_complete is built by median-filling the *same* frame the
# imputers are then run on, so it is not a true ground truth — the 'median'
# imputer will trivially score MSE 0.0 against it. Fine for demonstrating
# the mechanics; a real evaluation would mask cells of a fully observed
# frame and compare against the held-out originals.
X_complete = X_train_numeric.copy().fillna(X_train_numeric.median())
missing_mask = X_train_numeric.isnull()  # True at the originally-missing cells

# Test different imputation methods
imputers = {
    'median': SimpleImputer(strategy='median'),
    'knn': KNNImputer(n_neighbors=5)
}
for name, imputer in imputers.items():
    # fit_transform returns a plain ndarray, losing column names and index
    X_imputed = imputer.fit_transform(X_train_numeric)
    # Convert back to DataFrame for evaluation
    X_imputed_df = pd.DataFrame(X_imputed, columns=X_train_numeric.columns,
                                index=X_train_numeric.index)
    print(f"{name} imputation quality:")
    # Score each column only at the cells that were actually missing
    for col in X_train_numeric.columns:
        if missing_mask[col].any():
            col_mse = mean_squared_error(
                X_complete.loc[missing_mask[col], col],
                X_imputed_df.loc[missing_mask[col], col]
            )
            print(f" {col} MSE: {col_mse:.3f}")
Production Pipeline Template
def create_robust_imputation_pipeline():
    """Build a production-ready classification pipeline with imputation baked in.

    Numeric columns are median-imputed; categorical columns are filled with
    the most frequent level and one-hot encoded. Returns an unfitted
    sklearn Pipeline ready for fit/score.
    """
    # Column groups this pipeline expects in the input frame.
    numeric_columns = ['numeric1', 'numeric2']
    categorical_columns = ['categorical']

    # Per-type preprocessing branches.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        # Add scaling if needed
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ])

    # Route each column group through its branch.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat', categorical_transformer, categorical_columns),
        ]
    )

    # max_iter raised so the solver converges without feature scaling.
    return Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
    ])
# Fit the production pipeline end-to-end and report held-out accuracy.
production_pipeline = create_robust_imputation_pipeline()
production_pipeline.fit(X_train, y_train)
production_score = production_pipeline.score(X_test, y_test)
print(f"Production pipeline accuracy: {production_score:.3f}")
Best Practices
- Fit imputers only on training data to prevent data leakage
- Consider missing indicators for tree-based models
- Use domain knowledge for imputation strategy selection
- Validate imputation quality before model training
- Document imputation choices for reproducibility
Master Data Quality
Explore advanced missing data patterns, learn multiple imputation techniques, and discover automated data cleaning pipelines.
Share this article
Add Comment
No comments yet. Be the first to comment!