How to Tune Hyperparameters with GridSearchCV

Optimize machine learning model performance by systematically searching hyperparameter combinations with scikit-learn's GridSearchCV.

Finding the Sweet Spot

Default hyperparameters rarely yield the best performance for a given dataset. GridSearchCV automates the search: it exhaustively evaluates every combination in a parameter grid with cross-validation and reports the best one.

Basic GridSearchCV Usage

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Create and fit GridSearchCV
svm = SVC(random_state=42)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
print(f"Test set score: {grid_search.score(X_test, y_test):.3f}")

Random Forest Hyperparameter Tuning

from sklearn.ensemble import RandomForestClassifier

# Random Forest parameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
rf_grid.fit(X_train, y_train)

print(f"RF Best parameters: {rf_grid.best_params_}")
print(f"RF Best CV score: {rf_grid.best_score_:.3f}")

Multiple Scoring Metrics

from sklearn.metrics import make_scorer, precision_score, recall_score

# Define multiple scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': 'f1_weighted'
}

# Grid search with multiple metrics
multi_grid = GridSearchCV(
    svm, param_grid, cv=5, scoring=scoring, 
    refit='f1', n_jobs=-1  # Refit on F1 score
)
multi_grid.fit(X_train, y_train)

print(f"Best params (F1): {multi_grid.best_params_}")
print(f"Best F1 score: {multi_grid.best_score_:.3f}")

# Access all scores
results = multi_grid.cv_results_
print(f"Mean test accuracy: {results['mean_test_accuracy'][:3].round(3)}")

Pipeline Hyperparameter Tuning

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_classif)),
    ('classifier', SVC(random_state=42))
])

# Pipeline parameter grid (use step__parameter format)
pipeline_params = {
    'selector__k': [5, 10, 15],
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': [0.01, 0.1, 1],
    'classifier__kernel': ['rbf', 'linear']
}

pipeline_grid = GridSearchCV(pipeline, pipeline_params, cv=3, n_jobs=-1)
pipeline_grid.fit(X_train, y_train)

print(f"Pipeline best params: {pipeline_grid.best_params_}")
print(f"Pipeline best score: {pipeline_grid.best_score_:.3f}")

RandomizedSearchCV Alternative

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Random parameter distributions
random_param_dist = {
    'C': uniform(0.1, 100),  # Continuous distribution
    'gamma': uniform(0.001, 1),
    'kernel': ['rbf', 'linear']
}

# Randomized search
random_search = RandomizedSearchCV(
    svm, random_param_dist, n_iter=20, cv=5, 
    scoring='accuracy', random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)

print(f"Random search best params: {random_search.best_params_}")
print(f"Random search best score: {random_search.best_score_:.3f}")

Nested Cross-Validation

from sklearn.model_selection import cross_val_score

# Nested CV for unbiased performance estimate
def nested_cv_score(estimator, param_grid, X, y, outer_cv=5, inner_cv=3):
    """Perform nested cross-validation"""
    
    # Inner loop: hyperparameter tuning
    grid_search = GridSearchCV(estimator, param_grid, cv=inner_cv, scoring='accuracy')
    
    # Outer loop: performance estimation
    nested_scores = cross_val_score(grid_search, X, y, cv=outer_cv)
    
    return nested_scores

# Apply nested CV
nested_scores = nested_cv_score(svm, param_grid, X_train, y_train)
print(f"Nested CV scores: {nested_scores.round(3)}")
print(f"Nested CV mean: {nested_scores.mean():.3f} (+/- {nested_scores.std() * 2:.3f})")

Regression Hyperparameter Tuning

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Regression parameter grid
reg_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5]
}

# Grid search for regression
rf_reg = RandomForestRegressor(random_state=42)
reg_grid = GridSearchCV(
    rf_reg, reg_param_grid, cv=5, 
    scoring='neg_mean_squared_error', n_jobs=-1
)
reg_grid.fit(X_reg_train, y_reg_train)

print(f"Regression best params: {reg_grid.best_params_}")
print(f"Best CV MSE: {-reg_grid.best_score_:.3f}")

Advanced Parameter Grids

# Different parameter combinations
advanced_param_grid = [
    # RBF kernel parameters
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1]
    },
    # Linear kernel parameters
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10]
    },
    # Polynomial kernel parameters
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10],
        'degree': [2, 3, 4],
        'gamma': [0.01, 0.1]
    }
]

advanced_grid = GridSearchCV(svm, advanced_param_grid, cv=3, n_jobs=-1)
advanced_grid.fit(X_train, y_train)

print(f"Advanced grid best params: {advanced_grid.best_params_}")

Hyperparameter Tuning Best Practices

import time

def efficient_grid_search(X, y):
    """Demonstrate efficient grid search practices"""
    
    # Start with coarse grid
    coarse_grid = {
        'C': [0.01, 1, 100],
        'gamma': [0.001, 0.1, 10]
    }
    
    # Coarse search
    start_time = time.time()
    coarse_search = GridSearchCV(svm, coarse_grid, cv=3, n_jobs=-1)
    coarse_search.fit(X, y)
    coarse_time = time.time() - start_time
    
    # Fine-tune around best parameters
    best_C = coarse_search.best_params_['C']
    best_gamma = coarse_search.best_params_['gamma']
    
    fine_grid = {
        'C': [best_C * 0.1, best_C, best_C * 10],
        'gamma': [best_gamma * 0.1, best_gamma, best_gamma * 10]
    }
    
    # Fine search
    start_time = time.time()
    fine_search = GridSearchCV(svm, fine_grid, cv=5, n_jobs=-1)
    fine_search.fit(X, y)
    fine_time = time.time() - start_time
    
    print(f"Coarse search time: {coarse_time:.2f}s")
    print(f"Fine search time: {fine_time:.2f}s")
    print(f"Final best params: {fine_search.best_params_}")
    
    return fine_search

# Apply efficient search
efficient_model = efficient_grid_search(X_train, y_train)

Model Comparison with Tuning

from sklearn.linear_model import LogisticRegression

# Compare multiple models with tuning
models = {
    'SVM': (SVC(random_state=42), {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1],
        'kernel': ['rbf', 'linear']
    }),
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, None],
        'min_samples_split': [2, 5]
    }),
    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=1000), {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    })
}

# Tune and compare all models
results = {}
for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    results[name] = {
        'best_score': grid.best_score_,
        'best_params': grid.best_params_,
        'test_score': grid.score(X_test, y_test)
    }

# Display results
for name, result in results.items():
    print(f"\n{name}:")
    print(f"  Best CV score: {result['best_score']:.3f}")
    print(f"  Test score: {result['test_score']:.3f}")
    print(f"  Best params: {result['best_params']}")

Optimization Tips

  • Start with coarse grids, then refine
  • Use RandomizedSearchCV for large parameter spaces (or successive halving, sketched after this list)
  • Limit CV folds for initial exploration (3-5 folds)
  • Use n_jobs=-1 for parallel processing
  • Consider early stopping for iterative algorithms
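
One way to act on the first two tips together is scikit-learn's successive-halving search, which trains all candidates on a small budget and repeatedly discards the weakest ones. A minimal sketch; note that HalvingGridSearchCV still sits behind an experimental enable flag (scikit-learn >= 0.24):

# HalvingGridSearchCV requires the experimental enable import
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV

halving_search = HalvingGridSearchCV(
    svm, param_grid, cv=3, factor=3, scoring='accuracy',
    random_state=42, n_jobs=-1
)
halving_search.fit(X_train, y_train)
print(f"Halving search best params: {halving_search.best_params_}")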

Master Model Optimization

Explore automated hyperparameter optimization, learn Bayesian optimization techniques, and discover advanced model selection strategies.
