Navigation

Python

How to Handle Missing Values in ML Pipelines

Integrate missing value handling seamlessly into machine learning pipelines using scikit-learn's imputation strategies for robust model training.

Table of Contents

Missing Data, Not Missing Models

Missing values can break ML pipelines and bias results. Learn to handle missing data systematically within your ML workflows for reliable predictions.

Simple Imputation Strategies

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Build a synthetic dataset, then knock out values at random so the
# imputation examples below have real gaps to fill.
np.random.seed(42)
n_rows = 1000
data = pd.DataFrame({
    'numeric1': np.random.randn(n_rows),
    'numeric2': np.random.randn(n_rows),
    'categorical': np.random.choice(['A', 'B', 'C'], n_rows),
    'target': np.random.randint(0, 2, n_rows)
})

# Blank out 50 cells per feature column, drawn from disjoint row sets.
missing_indices = np.random.choice(n_rows, 150, replace=False)
for start, stop, column in [(0, 50, 'numeric1'),
                            (50, 100, 'numeric2'),
                            (100, 150, 'categorical')]:
    data.loc[missing_indices[start:stop], column] = np.nan

print(f"Missing values per column:")
print(data.isnull().sum())

# Features vs. label.
X = data.drop('target', axis=1)
y = data['target']

Numeric Imputation Pipeline

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Restrict to the numeric columns for this first example.
numeric_cols = ['numeric1', 'numeric2']
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

# Impute-then-classify inside a single estimator, so the imputer is
# fitted only on training data (no leakage into the test split).
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # or 'mean', 'most_frequent'
    ('classifier', LogisticRegression(random_state=42))
])

numeric_pipeline.fit(X_train_numeric, y_train)
score = numeric_pipeline.score(X_test_numeric, y_test)
print(f"Numeric pipeline accuracy: {score:.3f}")

Different Imputation Strategies

# Benchmark the four SimpleImputer strategies on the same split.
results = {}
for strategy in ['mean', 'median', 'most_frequent', 'constant']:
    # 'constant' needs an explicit fill value; the others infer theirs.
    extra = {'fill_value': 0} if strategy == 'constant' else {}
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy=strategy, **extra)),
        ('classifier', LogisticRegression(random_state=42))
    ])
    pipeline.fit(X_train_numeric, y_train)
    results[strategy] = pipeline.score(X_test_numeric, y_test)

print("Imputation strategy comparison:")
for strategy, score in results.items():
    print(f"{strategy}: {score:.3f}")

Advanced Imputation Methods

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

# Model-based imputers: IterativeImputer regresses each feature on the
# others (MICE-style); KNNImputer averages the nearest neighbours.
advanced_methods = {
    'Iterative': Pipeline([
        ('imputer', IterativeImputer(random_state=42)),
        ('classifier', LogisticRegression(random_state=42))
    ]),
    'KNN': Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('classifier', LogisticRegression(random_state=42))
    ]),
}

for name, pipeline in advanced_methods.items():
    pipeline.fit(X_train_numeric, y_train)
    score = pipeline.score(X_test_numeric, y_test)
    print(f"{name} imputation: {score:.3f}")

Mixed Data Type Imputation

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route each column type through its own imputation path, then feed the
# combined matrix to the classifier.
numeric_features = ['numeric1', 'numeric2']
categorical_features = ['categorical']

# Categoricals: mode-impute, then one-hot encode.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numeric_features),
    ('cat', categorical_steps, categorical_features),
])

# Full preprocessing + model in one estimator.
mixed_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

mixed_pipeline.fit(X_train, y_train)
mixed_score = mixed_pipeline.score(X_test, y_test)
print(f"Mixed data pipeline: {mixed_score:.3f}")

Custom Imputation Functions

from sklearn.base import BaseEstimator, TransformerMixin

class CustomImputer(BaseEstimator, TransformerMixin):
    """Minimal pandas-aware imputer with a pluggable fill strategy.

    Parameters
    ----------
    strategy : str, default 'median'
        One of 'median', 'mode', or 'interpolate'.
    """

    def __init__(self, strategy='median'):
        self.strategy = strategy

    def fit(self, X, y=None):
        """Learn one fill value per column from the training frame."""
        if self.strategy == 'median':
            self.fill_values_ = X.median()
        elif self.strategy == 'mode':
            self.fill_values_ = X.mode().iloc[0]
        elif self.strategy == 'interpolate':
            # Simple linear interpolation
            self.fill_values_ = X.mean()  # Fallback for demo
        else:
            # Fail loudly at fit time instead of with a confusing
            # AttributeError later in transform().
            raise ValueError(f"Unknown strategy: {self.strategy!r}")
        return self

    def transform(self, X):
        """Return a copy of X with NaNs replaced by the fitted values.

        Bug fix: the previous per-column loop used chained
        `X_imputed[column].fillna(..., inplace=True)`, which mutates a
        temporary Series — it raises SettingWithCopyWarning and becomes a
        silent no-op under pandas copy-on-write.  Passing the fitted
        Series to DataFrame.fillna aligns on column labels and fills
        every column in one pass.
        """
        return X.fillna(self.fill_values_)

# Plug the custom transformer into an ordinary Pipeline.
steps = [
    ('imputer', CustomImputer(strategy='median')),
    ('classifier', LogisticRegression(random_state=42)),
]
custom_pipeline = Pipeline(steps)

custom_pipeline.fit(X_train_numeric, y_train)
custom_score = custom_pipeline.score(X_test_numeric, y_test)
print(f"Custom imputation: {custom_score:.3f}")

Missing Value Indicators

from sklearn.impute import MissingIndicator
from sklearn.compose import make_column_transformer

# add_indicator=True appends a binary "was missing" column per feature,
# letting the classifier exploit the missingness pattern itself.
imputer_with_indicator = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('classifier', LogisticRegression(random_state=42))
])

indicator_score = (imputer_with_indicator
                   .fit(X_train_numeric, y_train)
                   .score(X_test_numeric, y_test))
print(f"Imputation with indicators: {indicator_score:.3f}")

# A stand-alone MissingIndicator exposes the mask directly.
missing_indicator = MissingIndicator()
X_train_indicators = missing_indicator.fit_transform(X_train_numeric)
print(f"Missing indicators shape: {X_train_indicators.shape}")

Handling Missing Values in Cross-Validation

from sklearn.model_selection import cross_val_score

# Because imputation lives inside the pipeline, every CV fold re-fits the
# imputer on its own training portion — no leakage across folds.
cv_scores = cross_val_score(mixed_pipeline, X_train, y_train,
                            cv=5, scoring='accuracy')

print(f"CV scores with imputation: {cv_scores.round(3)}")
print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Domain-Specific Imputation

def domain_specific_imputer(X, domain_rules=None):
    """Apply domain-specific imputation rules.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to impute; never modified (a copy is returned).
    domain_rules : dict or None
        Maps column name -> rule dict. Supported rule types:
        - 'business_logic' with optional 'group_by': fill NaNs in the
          column with the median of rows sharing the same group label.
        - 'seasonal': forward-fill along row order (time-series style).

    Returns
    -------
    pd.DataFrame
        Copy of X with the rules applied.
    """
    X_imputed = X.copy()

    if domain_rules:
        for column, rule in domain_rules.items():
            if rule['type'] == 'business_logic':
                # Example: Replace missing age with median by category
                if 'group_by' in rule:
                    group_medians = X_imputed.groupby(rule['group_by'])[column].median()
                    for group, median in group_medians.items():
                        mask = (X_imputed[rule['group_by']] == group) & X_imputed[column].isnull()
                        X_imputed.loc[mask, column] = median

            elif rule['type'] == 'seasonal':
                # Bug fix: fillna(method='ffill', inplace=True) is
                # deprecated (the method= parameter is removed in pandas
                # 3.0), and the chained inplace call mutated a temporary
                # Series.  .ffill() plus assignment is the supported form.
                X_imputed[column] = X_imputed[column].ffill()

    return X_imputed

# Fill numeric1 with the per-category median (see rule format above).
domain_rules = {'numeric1': {'type': 'business_logic',
                             'group_by': 'categorical'}}

X_domain = domain_specific_imputer(X_train, domain_rules)
print(f"Domain imputation - remaining missing: {X_domain.isnull().sum().sum()}")

Evaluating Imputation Quality

from sklearn.metrics import mean_squared_error

def evaluate_imputation_quality(X_original, X_imputed, missing_mask):
    """Evaluate how well imputation recovered original values.

    Parameters
    ----------
    X_original : array-like or DataFrame
        Ground-truth values (fully observed).
    X_imputed : array-like or DataFrame
        Same shape as X_original, with the masked cells filled in.
    missing_mask : boolean array-like or DataFrame
        True where values were artificially removed before imputation.

    Returns
    -------
    float
        Mean squared error over the masked cells only.
    """
    mask = np.asarray(missing_mask, dtype=bool)
    # Bug fix: indexing a DataFrame with a boolean *DataFrame*
    # (X_original[missing_mask]) keeps the full shape and fills the
    # unmasked cells with NaN, so the MSE computation would see NaNs and
    # fail.  Flattening through NumPy selects only the masked cells.
    original_values = np.asarray(X_original, dtype=float)[mask]
    imputed_values = np.asarray(X_imputed, dtype=float)[mask]

    # MSE computed directly in NumPy (identical to sklearn's
    # mean_squared_error for 1-D inputs).
    return float(np.mean((original_values - imputed_values) ** 2))

# Build a ground-truth reference from the fully observed rows, then hide a
# random 10% of the known values so imputed estimates can be compared
# against true values.  (The previous version compared against a
# median-filled copy of the data, so the median imputer was scored against
# itself and trivially achieved an MSE of 0.)
X_complete = X_train_numeric.dropna()
rng = np.random.RandomState(42)
hidden = pd.DataFrame(rng.rand(*X_complete.shape) < 0.1,
                      columns=X_complete.columns, index=X_complete.index)
X_with_gaps = X_complete.mask(hidden)
missing_mask = X_with_gaps.isnull()

# Test different imputation methods
imputers = {
    'median': SimpleImputer(strategy='median'),
    'knn': KNNImputer(n_neighbors=5)
}

for name, imputer in imputers.items():
    X_imputed = imputer.fit_transform(X_with_gaps)
    # Convert back to DataFrame so cells can be selected by label below.
    X_imputed_df = pd.DataFrame(X_imputed, columns=X_with_gaps.columns,
                                index=X_with_gaps.index)

    print(f"{name} imputation quality:")
    for col in X_with_gaps.columns:
        if missing_mask[col].any():
            # MSE between the true (hidden) values and the imputed ones.
            col_mse = mean_squared_error(
                X_complete.loc[missing_mask[col], col],
                X_imputed_df.loc[missing_mask[col], col]
            )
            print(f"  {col} MSE: {col_mse:.3f}")

Production Pipeline Template

def create_robust_imputation_pipeline():
    """Create production-ready pipeline with imputation.

    Returns a Pipeline that median-imputes the numeric columns,
    mode-imputes and one-hot encodes the categorical column, then fits a
    logistic-regression classifier.
    """
    # Per-type preprocessing, combined by column name.
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            # Add scaling if needed
        ]), ['numeric1', 'numeric2']),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), ['categorical']),
    ])

    # Preprocessing + model as one estimator.
    return Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ])

# Build, fit, and evaluate the production pipeline end to end.
production_pipeline = create_robust_imputation_pipeline().fit(X_train, y_train)
production_score = production_pipeline.score(X_test, y_test)
print(f"Production pipeline accuracy: {production_score:.3f}")

Best Practices

  • Fit imputers only on training data to prevent data leakage
  • Consider missing indicators for tree-based models
  • Use domain knowledge for imputation strategy selection
  • Validate imputation quality before model training
  • Document imputation choices for reproducibility

Master Data Quality

Explore advanced missing data patterns, learn multiple imputation techniques, and discover automated data cleaning pipelines.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python