
How to Select Features with SelectKBest

Improve model performance and reduce overfitting by selecting the most informative features using scikit-learn's SelectKBest.

Less Can Be More

Too many features can slow training, add noise, and lead to overfitting. SelectKBest scores every feature against the target and keeps only the k highest-scoring ones, often improving accuracy while reducing model complexity.

Basic SelectKBest Usage

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import make_classification
import numpy as np

# Generate data where only 5 of the 20 features are informative (the rest are redundant)
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=5, 
    n_redundant=15, random_state=42
)

print(f"Original features: {X.shape[1]}")

# Select top 5 features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

print(f"Selected features: {X_selected.shape[1]}")
print(f"Selected feature indices: {selector.get_support(indices=True)}")
print(f"Feature scores: {selector.scores_.round(2)}")

Different Scoring Functions

from sklearn.feature_selection import chi2, mutual_info_classif, f_regression

# For classification
scoring_functions = {
    'f_classif': f_classif,
    'chi2': chi2,  # Requires non-negative features
    'mutual_info': mutual_info_classif
}

# Make features non-negative for chi2
X_pos = np.abs(X)

for name, score_func in scoring_functions.items():
    # chi2 needs the non-negative copy of the data
    X_input = X_pos if name == 'chi2' else X

    selector = SelectKBest(score_func=score_func, k=5)
    selector.fit(X_input, y)
    selected_features = selector.get_support(indices=True)
    print(f"{name}: {selected_features}")

SelectKBest for Regression

from sklearn.datasets import make_regression

# Regression data
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=15, n_informative=5, random_state=42
)

# Use f_regression for regression tasks
selector_reg = SelectKBest(score_func=f_regression, k=5)
X_reg_selected = selector_reg.fit_transform(X_reg, y_reg)

print(f"Regression - Original: {X_reg.shape[1]}, Selected: {X_reg_selected.shape[1]}")
print(f"Top features: {selector_reg.get_support(indices=True)}")

SelectPercentile Alternative

from sklearn.feature_selection import SelectPercentile

# Select top 25% of features
selector_pct = SelectPercentile(score_func=f_classif, percentile=25)
X_pct_selected = selector_pct.fit_transform(X, y)

print(f"SelectPercentile (25%): {X_pct_selected.shape[1]} features")
print(f"Selected indices: {selector_pct.get_support(indices=True)}")

Pipeline Integration

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Pipeline with feature selection
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('classifier', LogisticRegression(random_state=42))
])

# Compare with and without feature selection
model_all = LogisticRegression(random_state=42)
scores_all = cross_val_score(model_all, X, y, cv=5)
scores_selected = cross_val_score(pipeline, X, y, cv=5)

print(f"All features: {scores_all.mean():.3f}")
print(f"Selected features: {scores_selected.mean():.3f}")

Finding Optimal K

from sklearn.model_selection import GridSearchCV

# Grid search for optimal k
param_grid = {'selector__k': [3, 5, 7, 10, 15]}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print(f"Best k: {grid_search.best_params_['selector__k']}")
print(f"Best score: {grid_search.best_score_:.3f}")

Custom Scoring Function

from sklearn.metrics import mutual_info_score

def custom_score_func(X, y):
    """Score each feature by its mutual information with the target."""
    scores = []
    for i in range(X.shape[1]):
        # mutual_info_score expects discrete inputs, so bin each
        # continuous feature before computing mutual information
        binned = np.digitize(X[:, i], np.histogram_bin_edges(X[:, i], bins=10))
        scores.append(mutual_info_score(binned, y))
    return np.array(scores)

# Use custom scoring
custom_selector = SelectKBest(score_func=custom_score_func, k=5)
X_custom = custom_selector.fit_transform(X, y)

print(f"Custom scoring selected: {custom_selector.get_support(indices=True)}")

Real Dataset Example

import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load real dataset
data = load_breast_cancer()
X_real = data.data
y_real = data.target
feature_names = data.feature_names

# Select best features
selector_real = SelectKBest(f_classif, k=10)
X_real_selected = selector_real.fit_transform(X_real, y_real)

# Get selected feature names
selected_indices = selector_real.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_indices]

print("Top 10 features:")
for rank, (name, score) in enumerate(zip(selected_features, selector_real.scores_[selected_indices]), start=1):
    print(f"{rank}. {name}: {score:.2f}")

Variance Threshold

from sklearn.feature_selection import VarianceThreshold

# Remove low-variance features
variance_selector = VarianceThreshold(threshold=0.1)
X_variance = variance_selector.fit_transform(X)

print(f"After variance threshold: {X_variance.shape[1]} features")
print(f"Removed {X.shape[1] - X_variance.shape[1]} low-variance features")

Recursive Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# RFE with SVM
estimator = SVC(kernel="linear", C=1)
rfe = RFE(estimator, n_features_to_select=5, step=1)
X_rfe = rfe.fit_transform(X, y)

print(f"RFE selected features: {rfe.get_support(indices=True)}")
print(f"Feature ranking: {rfe.ranking_[:10]}")  # Show first 10

Feature Importance Visualization

import matplotlib.pyplot as plt

# Get feature scores
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)

# Sort features by score
feature_scores = selector.scores_
sorted_indices = np.argsort(feature_scores)[::-1]

print("Feature importance ranking:")
for i in range(10):
    idx = sorted_indices[i]
    print(f"Feature {idx}: {feature_scores[idx]:.2f}")

Best Practices

  • Use domain knowledge to guide feature selection
  • Try different k values with cross-validation
  • Combine multiple methods (variance threshold + SelectKBest); see the sketch after this list
  • Consider computational cost vs. performance gain
  • Always validate on unseen data
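
Two of these points, combining methods and validating on unseen data, can be illustrated together. A minimal sketch, assuming the X and y generated earlier, that chains a variance filter and SelectKBest inside a pipeline and scores it on a held-out split:

from sklearn.model_selection import train_test_split

# Hold out a test set so feature selection never sees the evaluation data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

combined = Pipeline([
    ('variance', VarianceThreshold(threshold=0.1)),  # drop near-constant features first
    ('kbest', SelectKBest(f_classif, k=5)),          # then keep the best of what remains
    ('model', LogisticRegression(random_state=42))
])

combined.fit(X_train, y_train)
print(f"Held-out accuracy: {combined.score(X_test, y_test):.3f}")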

Master Feature Engineering

Explore advanced feature selection methods, learn dimensionality reduction techniques, and discover automated feature engineering.
