
How to Select Features with SelectKBest

Improve model performance and reduce overfitting by selecting the most informative features using scikit-learn's SelectKBest.

Less Can Be More

Too many features can slow training, add noise, and lead to overfitting. SelectKBest scores every feature against the target and keeps only the k highest-scoring ones, often improving accuracy while reducing model complexity.

Basic SelectKBest Usage

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import make_classification
import numpy as np

# Generate data where only 5 of the 20 features are informative (the rest are redundant)
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=5, 
    n_redundant=15, random_state=42
)

print(f"Original features: {X.shape[1]}")

# Select top 5 features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

print(f"Selected features: {X_selected.shape[1]}")
print(f"Selected feature indices: {selector.get_support(indices=True)}")
print(f"Feature scores: {selector.scores_.round(2)}")

Different Scoring Functions

from sklearn.feature_selection import chi2, mutual_info_classif, f_regression

# For classification
scoring_functions = {
    'f_classif': f_classif,
    'chi2': chi2,  # Requires non-negative features
    'mutual_info': mutual_info_classif
}

# Make features non-negative for chi2
X_pos = np.abs(X)

for name, score_func in scoring_functions.items():
    # chi2 needs the non-negative copy of the data
    X_input = X_pos if name == 'chi2' else X

    selector = SelectKBest(score_func=score_func, k=5)
    selector.fit(X_input, y)
    selected_features = selector.get_support(indices=True)
    print(f"{name}: {selected_features}")

SelectKBest for Regression

from sklearn.datasets import make_regression

# Regression data
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=15, n_informative=5, random_state=42
)

# Use f_regression for regression tasks
selector_reg = SelectKBest(score_func=f_regression, k=5)
X_reg_selected = selector_reg.fit_transform(X_reg, y_reg)

print(f"Regression - Original: {X_reg.shape[1]}, Selected: {X_reg_selected.shape[1]}")
print(f"Top features: {selector_reg.get_support(indices=True)}")

SelectPercentile Alternative

from sklearn.feature_selection import SelectPercentile

# Select top 25% of features
selector_pct = SelectPercentile(score_func=f_classif, percentile=25)
X_pct_selected = selector_pct.fit_transform(X, y)

print(f"SelectPercentile (25%): {X_pct_selected.shape[1]} features")
print(f"Selected indices: {selector_pct.get_support(indices=True)}")

Pipeline Integration

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Pipeline with feature selection
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('classifier', LogisticRegression(random_state=42))
])

# Compare with and without feature selection
model_all = LogisticRegression(random_state=42)
scores_all = cross_val_score(model_all, X, y, cv=5)
scores_selected = cross_val_score(pipeline, X, y, cv=5)

print(f"All features: {scores_all.mean():.3f}")
print(f"Selected features: {scores_selected.mean():.3f}")

Finding Optimal K

from sklearn.model_selection import GridSearchCV

# Grid search for optimal k
param_grid = {'selector__k': [3, 5, 7, 10, 15]}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print(f"Best k: {grid_search.best_params_['selector__k']}")
print(f"Best score: {grid_search.best_score_:.3f}")

Custom Scoring Function

from sklearn.metrics import mutual_info_score

def custom_score_func(X, y):
    """Score each feature by its mutual information with the target."""
    scores = []
    for i in range(X.shape[1]):
        # mutual_info_score expects discrete inputs, so bin each
        # continuous feature before computing mutual information
        binned = np.digitize(X[:, i], np.histogram_bin_edges(X[:, i], bins=10))
        scores.append(mutual_info_score(binned, y))
    return np.array(scores)

# Use custom scoring
custom_selector = SelectKBest(score_func=custom_score_func, k=5)
X_custom = custom_selector.fit_transform(X, y)

print(f"Custom scoring selected: {custom_selector.get_support(indices=True)}")

Real Dataset Example

import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load real dataset
data = load_breast_cancer()
X_real = data.data
y_real = data.target
feature_names = data.feature_names

# Select best features
selector_real = SelectKBest(f_classif, k=10)
X_real_selected = selector_real.fit_transform(X_real, y_real)

# Get selected feature names
selected_indices = selector_real.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_indices]

print("Top 10 features:")
for rank, (name, score) in enumerate(zip(selected_features, selector_real.scores_[selected_indices]), start=1):
    print(f"{rank}. {name}: {score:.2f}")

Variance Threshold

from sklearn.feature_selection import VarianceThreshold

# Remove low-variance features
variance_selector = VarianceThreshold(threshold=0.1)
X_variance = variance_selector.fit_transform(X)

print(f"After variance threshold: {X_variance.shape[1]} features")
print(f"Removed {X.shape[1] - X_variance.shape[1]} low-variance features")

Recursive Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# RFE with SVM
estimator = SVC(kernel="linear", C=1)
rfe = RFE(estimator, n_features_to_select=5, step=1)
X_rfe = rfe.fit_transform(X, y)

print(f"RFE selected features: {rfe.get_support(indices=True)}")
print(f"Feature ranking: {rfe.ranking_[:10]}")  # Show first 10

Feature Importance Visualization

import matplotlib.pyplot as plt

# Get feature scores
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)

# Sort features by score
feature_scores = selector.scores_
sorted_indices = np.argsort(feature_scores)[::-1]

print("Feature importance ranking:")
for i in range(10):
    idx = sorted_indices[i]
    print(f"Feature {idx}: {feature_scores[idx]:.2f}")

Best Practices

  • Use domain knowledge to guide feature selection
  • Try different k values with cross-validation
  • Combine multiple methods (variance threshold + SelectKBest); see the sketch after this list
  • Consider computational cost vs. performance gain
  • Always validate on unseen data
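
Two of these points, combining methods and validating on unseen data, can be illustrated together. A minimal sketch, assuming the X and y generated earlier, that chains a variance filter and SelectKBest inside a pipeline and scores it on a held-out split:

from sklearn.model_selection import train_test_split

# Hold out a test set so feature selection never sees the evaluation data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

combined = Pipeline([
    ('variance', VarianceThreshold(threshold=0.1)),  # drop near-constant features first
    ('kbest', SelectKBest(f_classif, k=5)),          # then keep the best of what remains
    ('model', LogisticRegression(random_state=42))
])

combined.fit(X_train, y_train)
print(f"Held-out accuracy: {combined.score(X_test, y_test):.3f}")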

Master Feature Engineering

Explore advanced feature selection methods, learn dimensionality reduction techniques, and discover automated feature engineering.
