Table Of Contents
- Finding the Sweet Spot
- Basic GridSearchCV Usage
- Random Forest Hyperparameter Tuning
- Multiple Scoring Metrics
- Pipeline Hyperparameter Tuning
- RandomizedSearchCV Alternative
- Nested Cross-Validation
- Regression Hyperparameter Tuning
- Advanced Parameter Grids
- Hyperparameter Tuning Best Practices
- Model Comparison with Tuning
- Optimization Tips
- Master Model Optimization
Finding the Sweet Spot
Default hyperparameters rarely give optimal performance. GridSearchCV automates the search for the best hyperparameter combinations through cross-validation.
Basic GridSearchCV Usage
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Generate sample data
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define parameter grid
param_grid = {
'C': [0.1, 1, 10, 100],
'gamma': [0.001, 0.01, 0.1, 1],
'kernel': ['rbf', 'linear']
}
# Create and fit GridSearchCV
svm = SVC(random_state=42)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
print(f"Test set score: {grid_search.score(X_test, y_test):.3f}")
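GridSearchCV also records every candidate it evaluated in cv_results_. A quick sketch (assuming pandas is installed) for ranking all combinations rather than only reading off the winner:
import pandas as pd
# Put the full search results into a DataFrame and sort by CV rank
cv_results = pd.DataFrame(grid_search.cv_results_)
summary = cv_results.sort_values('rank_test_score')
print(summary[['params', 'mean_test_score', 'std_test_score']].head())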
Random Forest Hyperparameter Tuning
from sklearn.ensemble import RandomForestClassifier
# Random Forest parameter grid
rf_param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)
print(f"RF Best parameters: {rf_grid.best_params_}")
print(f"RF Best CV score: {rf_grid.best_score_:.3f}")
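The refit model is exposed as best_estimator_, so the tuned Random Forest can be inspected directly. A short sketch listing its most important features (indices only, since the synthetic data is unnamed):
import numpy as np
# Rank features by importance in the best Random Forest from the search
best_rf = rf_grid.best_estimator_
top_idx = np.argsort(best_rf.feature_importances_)[::-1][:5]
print(f"Top feature indices: {top_idx}")
print(f"Importances: {best_rf.feature_importances_[top_idx].round(3)}")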
Multiple Scoring Metrics
from sklearn.metrics import make_scorer, precision_score, recall_score
# Define multiple scoring metrics
scoring = {
'accuracy': 'accuracy',
'precision': make_scorer(precision_score, average='weighted'),
'recall': make_scorer(recall_score, average='weighted'),
'f1': 'f1_weighted'
}
# Grid search with multiple metrics
multi_grid = GridSearchCV(
svm, param_grid, cv=5, scoring=scoring,
refit='f1', n_jobs=-1 # Refit on F1 score
)
multi_grid.fit(X_train, y_train)
print(f"Best params (F1): {multi_grid.best_params_}")
print(f"Best F1 score: {multi_grid.best_score_:.3f}")
# Access all scores
results = multi_grid.cv_results_
print(f"Mean test accuracy: {results['mean_test_accuracy'][:3].round(3)}")
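Since refit='f1' selects the winning candidate by weighted F1, that same candidate's scores on the other metrics can be read from cv_results_ at best_index_. For example:
# Scores of the refit candidate on every metric in the scoring dict
best_idx = multi_grid.best_index_
print(f"Accuracy at best F1 candidate: {results['mean_test_accuracy'][best_idx]:.3f}")
print(f"Precision at best F1 candidate: {results['mean_test_precision'][best_idx]:.3f}")
print(f"Recall at best F1 candidate: {results['mean_test_recall'][best_idx]:.3f}")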
Pipeline Hyperparameter Tuning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
# Create pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('selector', SelectKBest(f_classif)),
('classifier', SVC(random_state=42))
])
# Pipeline parameter grid (use step__parameter format)
pipeline_params = {
'selector__k': [5, 10, 15],
'classifier__C': [0.1, 1, 10],
'classifier__gamma': [0.01, 0.1, 1],
'classifier__kernel': ['rbf', 'linear']
}
pipeline_grid = GridSearchCV(pipeline, pipeline_params, cv=3, n_jobs=-1)
pipeline_grid.fit(X_train, y_train)
print(f"Pipeline best params: {pipeline_grid.best_params_}")
print(f"Pipeline best score: {pipeline_grid.best_score_:.3f}")
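Because the winning pipeline is refit on the full training set, its individual steps can be inspected through best_estimator_.named_steps. A short sketch using the step names defined above:
# Which features did the best pipeline's SelectKBest step keep?
best_pipeline = pipeline_grid.best_estimator_
kept = best_pipeline.named_steps['selector'].get_support(indices=True)
print(f"Selected feature indices: {kept}")
print(f"Best SVC kernel: {best_pipeline.named_steps['classifier'].kernel}")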
RandomizedSearchCV Alternative
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
# Random parameter distributions
random_param_dist = {
    'C': uniform(0.1, 100),      # continuous uniform over [0.1, 100.1] (loc, loc + scale)
    'gamma': uniform(0.001, 1),  # continuous uniform over [0.001, 1.001]
    'kernel': ['rbf', 'linear']
}
# Randomized search
random_search = RandomizedSearchCV(
svm, random_param_dist, n_iter=20, cv=5,
scoring='accuracy', random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)
print(f"Random search best params: {random_search.best_params_}")
print(f"Random search best score: {random_search.best_score_:.3f}")
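For parameters such as C and gamma that span several orders of magnitude, sampling on a log scale usually covers the range more evenly than a linear uniform distribution. A sketch, assuming your SciPy version provides scipy.stats.loguniform:
from scipy.stats import loguniform
# Sample C and gamma log-uniformly instead of uniformly
log_param_dist = {
    'C': loguniform(1e-2, 1e2),
    'gamma': loguniform(1e-4, 1e0),
    'kernel': ['rbf', 'linear']
}
log_search = RandomizedSearchCV(
    svm, log_param_dist, n_iter=20, cv=5,
    scoring='accuracy', random_state=42, n_jobs=-1
)
log_search.fit(X_train, y_train)
print(f"Log-scale best params: {log_search.best_params_}")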
Nested Cross-Validation
from sklearn.model_selection import cross_val_score
# Nested CV for unbiased performance estimate
def nested_cv_score(estimator, param_grid, X, y, outer_cv=5, inner_cv=3):
    """Perform nested cross-validation"""
    # Inner loop: hyperparameter tuning
    grid_search = GridSearchCV(estimator, param_grid, cv=inner_cv, scoring='accuracy')
    # Outer loop: performance estimation
    nested_scores = cross_val_score(grid_search, X, y, cv=outer_cv)
    return nested_scores
# Apply nested CV
nested_scores = nested_cv_score(svm, param_grid, X_train, y_train)
print(f"Nested CV scores: {nested_scores.round(3)}")
print(f"Nested CV mean: {nested_scores.mean():.3f} (+/- {nested_scores.std() * 2:.3f})")
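The nested estimate is usually a little lower than the non-nested best_score_ from the basic grid search above; the gap gives a rough sense of how optimistic the non-nested number is:
# Compare the nested estimate with the non-nested score from earlier
print(f"Non-nested best CV score: {grid_search.best_score_:.3f}")
print(f"Nested CV mean score: {nested_scores.mean():.3f}")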
Regression Hyperparameter Tuning
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
X_reg, y_reg, test_size=0.2, random_state=42
)
# Regression parameter grid
reg_param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 10],
'min_samples_split': [2, 5]
}
# Grid search for regression
rf_reg = RandomForestRegressor(random_state=42)
reg_grid = GridSearchCV(
rf_reg, reg_param_grid, cv=5,
scoring='neg_mean_squared_error', n_jobs=-1
)
reg_grid.fit(X_reg_train, y_reg_train)
print(f"Regression best params: {reg_grid.best_params_}")
print(f"Best CV MSE: {-reg_grid.best_score_:.3f}")
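As with classification, the cross-validated error should be confirmed on the held-out test set. A short sketch using the mean_squared_error import above:
import numpy as np
# Evaluate the tuned regressor on data it never saw during tuning
y_reg_pred = reg_grid.predict(X_reg_test)
test_mse = mean_squared_error(y_reg_test, y_reg_pred)
print(f"Test MSE: {test_mse:.3f}")
print(f"Test RMSE: {np.sqrt(test_mse):.3f}")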
Advanced Parameter Grids
# Different parameter combinations
advanced_param_grid = [
# RBF kernel parameters
{
'kernel': ['rbf'],
'C': [0.1, 1, 10],
'gamma': [0.01, 0.1, 1]
},
# Linear kernel parameters
{
'kernel': ['linear'],
'C': [0.1, 1, 10]
},
# Polynomial kernel parameters
{
'kernel': ['poly'],
'C': [0.1, 1, 10],
'degree': [2, 3, 4],
'gamma': [0.01, 0.1]
}
]
advanced_grid = GridSearchCV(svm, advanced_param_grid, cv=3, n_jobs=-1)
advanced_grid.fit(X_train, y_train)
print(f"Advanced grid best params: {advanced_grid.best_params_}")
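A list of dictionaries like this only evaluates valid combinations; degree, for example, is never paired with the linear kernel. ParameterGrid can confirm how many candidates the search will actually fit:
from sklearn.model_selection import ParameterGrid
# Count candidate combinations across all three sub-grids
print(f"Total parameter combinations: {len(ParameterGrid(advanced_param_grid))}")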
Hyperparameter Tuning Best Practices
import time
def efficient_grid_search(X, y):
    """Demonstrate efficient grid search practices"""
    # Start with coarse grid
    coarse_grid = {
        'C': [0.01, 1, 100],
        'gamma': [0.001, 0.1, 10]
    }
    # Coarse search
    start_time = time.time()
    coarse_search = GridSearchCV(svm, coarse_grid, cv=3, n_jobs=-1)
    coarse_search.fit(X, y)
    coarse_time = time.time() - start_time
    # Fine-tune around best parameters
    best_C = coarse_search.best_params_['C']
    best_gamma = coarse_search.best_params_['gamma']
    fine_grid = {
        'C': [best_C * 0.1, best_C, best_C * 10],
        'gamma': [best_gamma * 0.1, best_gamma, best_gamma * 10]
    }
    # Fine search
    start_time = time.time()
    fine_search = GridSearchCV(svm, fine_grid, cv=5, n_jobs=-1)
    fine_search.fit(X, y)
    fine_time = time.time() - start_time
    print(f"Coarse search time: {coarse_time:.2f}s")
    print(f"Fine search time: {fine_time:.2f}s")
    print(f"Final best params: {fine_search.best_params_}")
    return fine_search
# Apply efficient search
efficient_model = efficient_grid_search(X_train, y_train)
Model Comparison with Tuning
from sklearn.linear_model import LogisticRegression
# Compare multiple models with tuning
models = {
'SVM': (SVC(random_state=42), {
'C': [0.1, 1, 10],
'gamma': [0.01, 0.1, 1],
'kernel': ['rbf', 'linear']
}),
'Random Forest': (RandomForestClassifier(random_state=42), {
'n_estimators': [50, 100],
'max_depth': [3, 5, None],
'min_samples_split': [2, 5]
}),
'Logistic Regression': (LogisticRegression(random_state=42, max_iter=1000), {
'C': [0.1, 1, 10],
'penalty': ['l1', 'l2'],
'solver': ['liblinear']
})
}
# Tune and compare all models
results = {}
for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    results[name] = {
        'best_score': grid.best_score_,
        'best_params': grid.best_params_,
        'test_score': grid.score(X_test, y_test)
    }
# Display results
for name, result in results.items():
    print(f"\n{name}:")
    print(f" Best CV score: {result['best_score']:.3f}")
    print(f" Test score: {result['test_score']:.3f}")
    print(f" Best params: {result['best_params']}")
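With every model tuned on the same folds, picking the overall winner from the results dictionary is a one-liner:
# Select the model with the highest held-out test score
best_model_name = max(results, key=lambda name: results[name]['test_score'])
print(f"\nBest model overall: {best_model_name} "
      f"(test score: {results[best_model_name]['test_score']:.3f})")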
Optimization Tips
- Start with coarse grids, then refine
- Use RandomizedSearchCV for large parameter spaces
- Limit CV folds for initial exploration (3-5 folds)
- Use n_jobs=-1 for parallel processing
- Consider early stopping for iterative algorithms (see the sketch below)
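A minimal sketch of the early-stopping tip, assuming scikit-learn's GradientBoostingClassifier (its n_iter_no_change and validation_fraction parameters stop adding trees once an internal validation score stops improving):
from sklearn.ensemble import GradientBoostingClassifier
# Set a generous upper bound on boosting rounds and let early stopping trim it
gb = GradientBoostingClassifier(
    n_estimators=500,
    n_iter_no_change=5,       # stop after 5 rounds without improvement
    validation_fraction=0.1,  # internal validation split for early stopping
    random_state=42
)
gb.fit(X_train, y_train)
print(f"Boosting rounds actually fit: {gb.n_estimators_}")
print(f"Test accuracy: {gb.score(X_test, y_test):.3f}")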
Master Model Optimization
Explore automated hyperparameter optimization, learn Bayesian optimization techniques, and discover advanced model selection strategies.