How to Handle Imbalanced Datasets

Address class imbalance in machine learning with resampling techniques and specialized algorithms for better minority class prediction.

When Classes Aren't Equal

When one class dominates a dataset, models learn to favor the majority class: a classifier can post high accuracy while rarely, if ever, predicting the minority class. The techniques below address this imbalance at the data, algorithm, and evaluation levels.
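
To see the problem concretely, here is a minimal, self-contained sketch (variable names are illustrative) using scikit-learn's DummyClassifier: a baseline that always predicts the majority class reaches 90% accuracy while catching zero minority cases.

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score

# 90/10 imbalanced labels; features don't matter for this baseline
y_demo = np.array([0] * 900 + [1] * 100)
X_demo = np.zeros((1000, 1))

# Always predicts the majority class
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(X_demo, y_demo)
y_pred_demo = baseline.predict(X_demo)

print(f"Accuracy: {accuracy_score(y_demo, y_pred_demo):.2f}")        # 0.90
print(f"Minority recall: {recall_score(y_demo, y_pred_demo):.2f}")   # 0.00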

Understanding Class Imbalance

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from collections import Counter

# Create imbalanced dataset
X, y = make_classification(
    n_samples=1000, n_features=10, n_classes=2, 
    weights=[0.9, 0.1], random_state=42
)

print("Class distribution:")
print(Counter(y))
print(f"Imbalance ratio: {Counter(y)[0] / Counter(y)[1]:.1f}:1")

SMOTE (Synthetic Minority Oversampling)

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split data first
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Before SMOTE:")
print(Counter(y_train))

# Apply SMOTE to the training set only, so no synthetic samples leak into the test set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE:")
print(Counter(y_train_smote))
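
Under the hood, SMOTE creates each synthetic point by interpolating between a minority sample and one of its k nearest minority neighbors. The following is a hand-rolled sketch of that interpolation idea, not the library's implementation; the "neighbor" here is just another minority sample standing in for a true nearest neighbor.

# Illustrative interpolation step behind SMOTE
rng = np.random.default_rng(42)

minority = X_train[y_train == 1]
x_i = minority[0]    # a minority sample
x_nn = minority[1]   # stand-in for one of its nearest neighbors

# The synthetic point lies on the line segment between the two
lam = rng.uniform(0, 1)
x_new = x_i + lam * (x_nn - x_i)
print(x_new)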

Random Oversampling and Undersampling

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Random Oversampling
ros = RandomOverSampler(random_state=42)
X_train_over, y_train_over = ros.fit_resample(X_train, y_train)

# Random Undersampling  
rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

print("Oversampling:", Counter(y_train_over))
print("Undersampling:", Counter(y_train_under))

Class Weight Adjustment

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Model with balanced class weights
model_balanced = LogisticRegression(class_weight='balanced', random_state=42)
model_balanced.fit(X_train, y_train)

# Regular model
model_regular = LogisticRegression(random_state=42)
model_regular.fit(X_train, y_train)

# Compare predictions
y_pred_balanced = model_balanced.predict(X_test)
y_pred_regular = model_regular.predict(X_test)

print("Balanced model:")
print(classification_report(y_test, y_pred_balanced))
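
For reference, class_weight='balanced' assigns each class the weight n_samples / (n_classes * n_c), where n_c is that class's count, so rarer classes weigh more. scikit-learn exposes the same computation directly:

from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
print(dict(zip(np.unique(y_train), weights)))  # roughly {0: 0.56, 1: 5.0}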

Evaluation Metrics for Imbalanced Data

from sklearn.metrics import (
    confusion_matrix, precision_recall_curve, 
    roc_auc_score, f1_score, balanced_accuracy_score
)

def evaluate_imbalanced(y_true, y_pred, y_prob=None):
    """Comprehensive evaluation for imbalanced datasets"""
    
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    
    print(f"\nBalanced Accuracy: {balanced_accuracy_score(y_true, y_pred):.3f}")
    print(f"F1-Score: {f1_score(y_true, y_pred):.3f}")
    
    if y_prob is not None:
        print(f"ROC-AUC: {roc_auc_score(y_true, y_prob):.3f}")

# Evaluate models
y_prob_balanced = model_balanced.predict_proba(X_test)[:, 1]
evaluate_imbalanced(y_test, y_pred_balanced, y_prob_balanced)

Combined Sampling Strategies

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline

# SMOTE + Tomek links (clean overlapping samples)
smote_tomek = SMOTETomek(random_state=42)
X_train_combined, y_train_combined = smote_tomek.fit_resample(X_train, y_train)

# Pipeline with resampling
pipeline = ImbPipeline([
    ('sampler', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42))
])

# Fit pipeline
pipeline.fit(X_train, y_train)
y_pred_pipeline = pipeline.predict(X_test)

print("Combined sampling:")
print(Counter(y_train_combined))

Threshold Tuning

from sklearn.metrics import precision_recall_curve

# Get prediction probabilities
y_prob = model_balanced.predict_proba(X_test)[:, 1]

# Find the threshold that maximizes F1 (tuning on the test set is shown
# for brevity; in practice, tune on a separate validation set)
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# precisions/recalls have one more entry than thresholds, so drop the last
# point; the small epsilon guards against division by zero
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-10)

# Best threshold for F1-score
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]

print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1-score: {f1_scores[best_threshold_idx]:.3f}")

# Apply threshold
y_pred_tuned = (y_prob >= best_threshold).astype(int)
evaluate_imbalanced(y_test, y_pred_tuned, y_prob)

Ensemble Methods for Imbalanced Data

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Regular Random Forest
rf_regular = RandomForestClassifier(n_estimators=100, random_state=42)
rf_regular.fit(X_train, y_train)

# Balanced Random Forest
rf_balanced = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
rf_balanced.fit(X_train, y_train)

# Compare performance
y_pred_rf_reg = rf_regular.predict(X_test)
y_pred_rf_bal = rf_balanced.predict(X_test)

print("Regular RF F1-score:", f1_score(y_test, y_pred_rf_reg))
print("Balanced RF F1-score:", f1_score(y_test, y_pred_rf_bal))

Cost-Sensitive Learning

from sklearn.svm import SVC

# Weight errors on the minority class 9x more heavily, mirroring the 9:1 imbalance
class_weights = {0: 1, 1: 9}

svm_weighted = SVC(class_weight=class_weights, probability=True, random_state=42)
svm_weighted.fit(X_train, y_train)

y_pred_svm = svm_weighted.predict(X_test)
print("Cost-sensitive SVM F1-score:", f1_score(y_test, y_pred_svm))

Advanced Sampling Techniques

from imblearn.over_sampling import ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours

# ADASYN (Adaptive Synthetic Sampling)
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

# Borderline SMOTE
borderline_smote = BorderlineSMOTE(random_state=42)
X_borderline, y_borderline = borderline_smote.fit_resample(X_train, y_train)

# Edited Nearest Neighbours (undersampling)
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_train, y_train)

print("ADASYN:", Counter(y_adasyn))
print("Borderline SMOTE:", Counter(y_borderline))
print("ENN:", Counter(y_enn))

Real-World Example

# Simulate credit fraud detection
fraud_data = pd.DataFrame({
    'amount': np.random.exponential(100, 10000),
    'merchant_risk': np.random.uniform(0, 1, 10000),
    'time_since_last': np.random.exponential(24, 10000),
    'is_fraud': np.random.choice([0, 1], 10000, p=[0.995, 0.005])
})

X_fraud = fraud_data[['amount', 'merchant_risk', 'time_since_last']]
y_fraud = fraud_data['is_fraud']

print("Fraud detection dataset:")
print(Counter(y_fraud))

# Apply SMOTE (in a real project, split into train/test first and resample only the training set)
smote_fraud = SMOTE(random_state=42)
X_fraud_smote, y_fraud_smote = smote_fraud.fit_resample(X_fraud, y_fraud)

print("After SMOTE:")
print(Counter(y_fraud_smote))

Choosing the Right Technique

  • Small datasets: Use SMOTE or oversampling
  • Large datasets: Use undersampling or class weights
  • Noisy data: Use SMOTE + cleaning methods
  • Tree-based models: Use balanced versions or class weights
  • Always validate: Use stratified cross-validation (see the sketch below)
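
As a closing sketch, here is one way to run that validation. Because resampling lives inside the imblearn pipeline, SMOTE is re-fit on the training folds of each split, so validation folds never contain synthetic samples.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

cv_pipeline = ImbPipeline([
    ('sampler', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42))
])

# Stratification keeps the class ratio consistent across folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(cv_pipeline, X, y, cv=cv, scoring='f1')

print(f"Cross-validated F1: {scores.mean():.3f} +/- {scores.std():.3f}")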

Master Imbalanced Learning

Next, explore anomaly detection techniques, advanced ensemble methods, and deep learning approaches for imbalanced data.
