How to Use Cross-Validation for Model Evaluation

Evaluate model performance reliably with scikit-learn's cross-validation techniques to avoid overfitting and get honest performance estimates.

Beyond Single Train/Test Splits

A single train/test split can be misleading: the score depends heavily on which samples happen to land in the test set. Cross-validation evaluates the model on several different partitions and averages the results, giving a more robust estimate of real-world performance.
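
To make this concrete, here is a minimal sketch (using the same synthetic data as the examples below) showing how the score from a single split can shift just by changing the split's random seed:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
model = LogisticRegression(random_state=42)

# The same model scores differently depending on which rows end up in the test set
for seed in (0, 1, 2):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)
    print(f"seed={seed}: {model.fit(X_tr, y_tr).score(X_te, y_te):.3f}")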

Basic K-Fold Cross-Validation

from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import numpy as np

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Create model
model = LogisticRegression(random_state=42)

# 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)

print(f"CV Scores: {cv_scores.round(3)}")
print(f"Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Different CV Strategies

from sklearn.model_selection import StratifiedKFold, LeaveOneOut, ShuffleSplit

# Stratified K-Fold (maintains class distribution)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(model, X, y, cv=skf)

# Shuffle Split (random sampling)
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
shuffle_scores = cross_val_score(model, X, y, cv=ss)

# Leave-One-Out (for small datasets)
loo = LeaveOneOut()
loo_scores = cross_val_score(model, X[:50], y[:50], cv=loo)  # Small sample

print(f"Stratified: {stratified_scores.mean():.3f}")
print(f"Shuffle Split: {shuffle_scores.mean():.3f}")
print(f"Leave-One-Out: {loo_scores.mean():.3f}")

Time Series Cross-Validation

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge

# Synthetic time series data: 5 features and a continuous target
ts_X = np.random.randn(1000, 5)
ts_y = np.random.randn(1000)

# Time series split: each training set ends strictly before its test set
tscv = TimeSeriesSplit(n_splits=5)

# ts_y is continuous, so use a regressor here (the logistic regression
# model above would reject continuous targets)
ts_scores = cross_val_score(Ridge(), ts_X, ts_y, cv=tscv)

print(f"Time Series CV (R^2): {ts_scores.mean():.3f}")

# Visualize splits
for i, (train_idx, test_idx) in enumerate(tscv.split(ts_X)):
    print(f"Fold {i+1}: Train {len(train_idx)}, Test {len(test_idx)}")

Custom Scoring Metrics

from sklearn.metrics import make_scorer, precision_score, recall_score

# Multiple metrics
from sklearn.model_selection import cross_validate

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted')
}

# Cross-validate with multiple metrics
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)

for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.title()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

Nested Cross-Validation

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Nested CV for hyperparameter tuning + model evaluation
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1]}

# Inner loop: hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
clf = GridSearchCV(SVC(), param_grid, cv=inner_cv, scoring='accuracy')

# Outer loop: model evaluation
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(clf, X, y, cv=outer_cv)

print(f"Nested CV Score: {nested_scores.mean():.3f} (+/- {nested_scores.std() * 2:.3f})")

Cross-Validation with Pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=10)),
    ('classifier', LogisticRegression(random_state=42))
])

# Cross-validate pipeline
pipeline_scores = cross_val_score(pipeline, X, y, cv=5)
print(f"Pipeline CV Score: {pipeline_scores.mean():.3f}")

Learning Curves

from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Generate learning curve
train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)

# Calculate means and stds
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)

print("Training sizes:", train_sizes)
print("Validation scores:", val_mean.round(3))

Cross-Validation for Regression

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)

# Regression model
reg_model = LinearRegression()

# Custom RMSE metric with the (y_true, y_pred) signature that make_scorer expects
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# greater_is_better=False flips the sign, so cross_val_score returns negative RMSE
rmse_scores = cross_val_score(
    reg_model, X_reg, y_reg, cv=5,
    scoring=make_scorer(rmse, greater_is_better=False)
)

r2_scores = cross_val_score(reg_model, X_reg, y_reg, cv=5, scoring='r2')

print(f"RMSE: {-rmse_scores.mean():.3f}")
print(f"R²: {r2_scores.mean():.3f}")

Group-Based Cross-Validation

from sklearn.model_selection import GroupKFold
import numpy as np

# Data with groups (e.g., different patients)
groups = np.random.randint(0, 50, 1000)  # 50 different groups

# Group K-Fold ensures same group doesn't appear in train and test
group_kfold = GroupKFold(n_splits=5)
group_scores = cross_val_score(model, X, y, cv=group_kfold, groups=groups)

print(f"Group K-Fold CV: {group_scores.mean():.3f}")

Validation Curve

from sklearn.model_selection import validation_curve

# Validation curve for hyperparameter
param_range = [0.001, 0.01, 0.1, 1, 10, 100]
train_scores, val_scores = validation_curve(
    SVC(), X[:200], y[:200], param_name='C', param_range=param_range, cv=3
)

print("C values:", param_range)
print("Validation scores:", val_scores.mean(axis=1).round(3))

Best Practices

  • Use stratified K-fold for classification
  • Use a time series split for temporal data
  • Use nested CV when tuning hyperparameters and reporting performance
  • Use 5-10 folds for most datasets
  • Set consistent random states for reproducibility

Master Model Selection

Explore hyperparameter optimization, learn model comparison techniques, and discover ensemble methods.
