Table Of Contents
- Beyond Single Train/Test Splits
- Basic K-Fold Cross-Validation
- Different CV Strategies
- Time Series Cross-Validation
- Custom Scoring Metrics
- Nested Cross-Validation
- Cross-Validation with Pipelines
- Learning Curves
- Cross-Validation for Regression
- Group-Based Cross-Validation
- Validation Curve
- Best Practices
- Master Model Selection
Beyond Single Train/Test Splits
A single train/test split can be misleading: the score depends heavily on which samples happen to land in the test set. Cross-validation provides more robust performance estimates by training and testing on multiple data partitions and averaging the results.
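To see why, here is a quick sketch (on an illustrative make_classification dataset with hypothetical names X_demo and y_demo, not reused below) showing how much a single split's score moves when only the random seed changes:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Same data, same model - only the split changes
X_demo, y_demo = make_classification(n_samples=300, n_features=20, random_state=0)
for seed in [0, 1, 2]:
    X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=seed)
    score = LogisticRegression(max_iter=1000).fit(X_tr, y_tr).score(X_te, y_te)
    print(f"Seed {seed}: test accuracy = {score:.3f}")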
Basic K-Fold Cross-Validation
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import numpy as np
# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
# Create model
model = LogisticRegression(random_state=42)
# 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"CV Scores: {cv_scores.round(3)}")
print(f"Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
Different CV Strategies
from sklearn.model_selection import StratifiedKFold, LeaveOneOut, ShuffleSplit
# Stratified K-Fold (maintains class distribution)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_scores = cross_val_score(model, X, y, cv=skf)
# Shuffle Split (random sampling)
ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
shuffle_scores = cross_val_score(model, X, y, cv=ss)
# Leave-One-Out (for small datasets)
loo = LeaveOneOut()
loo_scores = cross_val_score(model, X[:50], y[:50], cv=loo) # Small sample
print(f"Stratified: {stratified_scores.mean():.3f}")
print(f"Shuffle Split: {shuffle_scores.mean():.3f}")
print(f"Leave-One-Out: {loo_scores.mean():.3f}")
Time Series Cross-Validation
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
# Time series data
dates = pd.date_range('2020-01-01', periods=1000, freq='D')
ts_X = np.random.randn(1000, 5)
ts_y = np.random.randint(0, 2, 1000)  # binary target so the LogisticRegression model above can be reused
# Time series split
tscv = TimeSeriesSplit(n_splits=5)
ts_scores = cross_val_score(model, ts_X, ts_y, cv=tscv)
print(f"Time Series CV: {ts_scores.mean():.3f}")
# Visualize splits
for i, (train_idx, test_idx) in enumerate(tscv.split(ts_X)):
print(f"Fold {i+1}: Train {len(train_idx)}, Test {len(test_idx)}")
Custom Scoring Metrics
from sklearn.metrics import make_scorer, precision_score, recall_score
# Multiple metrics
from sklearn.model_selection import cross_validate
# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted')
}
# Cross-validate with multiple metrics
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.title()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
Nested Cross-Validation
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Nested CV for hyperparameter tuning + model evaluation
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1]}
# Inner loop: hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
clf = GridSearchCV(SVC(), param_grid, cv=inner_cv, scoring='accuracy')
# Outer loop: model evaluation
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(clf, X, y, cv=outer_cv)
print(f"Nested CV Score: {nested_scores.mean():.3f} (+/- {nested_scores.std() * 2:.3f})")
Cross-Validation with Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=10)),
    ('classifier', LogisticRegression(random_state=42))
])
# Cross-validate pipeline
pipeline_scores = cross_val_score(pipeline, X, y, cv=5)
print(f"Pipeline CV Score: {pipeline_scores.mean():.3f}")
Learning Curves
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
# Generate learning curve
train_sizes, train_scores, val_scores = learning_curve(
    model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)
# Calculate means and stds
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = val_scores.mean(axis=1)
val_std = val_scores.std(axis=1)
print("Training sizes:", train_sizes)
print("Validation scores:", val_mean.round(3))
Cross-Validation for Regression
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
# Regression model
reg_model = LinearRegression()
# Custom scoring for regression: a metric function wrapped by make_scorer
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
# Cross-validation with custom scorer (negated because lower RMSE is better)
rmse_scores = cross_val_score(
    reg_model, X_reg, y_reg, cv=5,
    scoring=make_scorer(rmse, greater_is_better=False)
)
r2_scores = cross_val_score(reg_model, X_reg, y_reg, cv=5, scoring='r2')
print(f"RMSE: {-rmse_scores.mean():.3f}")
print(f"R²: {r2_scores.mean():.3f}")
Group-Based Cross-Validation
from sklearn.model_selection import GroupKFold
import numpy as np
# Data with groups (e.g., different patients)
groups = np.random.randint(0, 50, 1000) # 50 different groups
# Group K-Fold ensures same group doesn't appear in train and test
group_kfold = GroupKFold(n_splits=5)
group_scores = cross_val_score(model, X, y, cv=group_kfold, groups=groups)
print(f"Group K-Fold CV: {group_scores.mean():.3f}")
Validation Curve
from sklearn.model_selection import validation_curve
# Validation curve for hyperparameter
param_range = [0.001, 0.01, 0.1, 1, 10, 100]
train_scores, val_scores = validation_curve(
    SVC(), X[:200], y[:200], param_name='C', param_range=param_range, cv=3
)
print("C values:", param_range)
print("Validation scores:", val_scores.mean(axis=1).round(3))
Best Practices
- Use stratified K-fold for classification
- Use a time series split for temporal data
- Use nested CV when tuning hyperparameters
- Use 5-10 folds for most datasets
- Set consistent random states for reproducibility
Master Model Selection
Explore hyperparameter optimization, learn model comparison techniques, and discover ensemble methods.