Table Of Contents
- Less Can Be More
- Basic SelectKBest Usage
- Different Scoring Functions
- SelectKBest for Regression
- SelectPercentile Alternative
- Pipeline Integration
- Finding Optimal K
- Custom Scoring Function
- Real Dataset Example
- Variance Threshold
- Recursive Feature Elimination
- Feature Importance Visualization
- Best Practices
- Master Feature Engineering
Less Can Be More
Too many features can hurt model performance: irrelevant and redundant inputs add noise, slow down training, and invite overfitting. SelectKBest scores every feature against the target and keeps only the top k, often improving accuracy while reducing complexity.
Basic SelectKBest Usage
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import make_classification
import numpy as np
# Generate data with redundant, correlated features
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=5,
    n_redundant=15, random_state=42
)
print(f"Original features: {X.shape[1]}")
# Select top 5 features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)
print(f"Selected features: {X_selected.shape[1]}")
print(f"Selected feature indices: {selector.get_support(indices=True)}")
print(f"Feature scores: {selector.scores_.round(2)}")
Different Scoring Functions
from sklearn.feature_selection import chi2, mutual_info_classif, f_regression
# For classification
scoring_functions = {
    'f_classif': f_classif,
    'chi2': chi2,  # Requires non-negative features
    'mutual_info': mutual_info_classif
}
# Make features non-negative for chi2
X_pos = np.abs(X)
for name, score_func in scoring_functions.items():
    # chi2 only accepts non-negative inputs, so feed it the transformed copy
    X_input = X_pos if name == 'chi2' else X
    selector = SelectKBest(score_func=score_func, k=5)
    selector.fit(X_input, y)
    selected_features = selector.get_support(indices=True)
    print(f"{name}: {selected_features}")
SelectKBest for Regression
from sklearn.datasets import make_regression
# Regression data
X_reg, y_reg = make_regression(
    n_samples=1000, n_features=15, n_informative=5, random_state=42
)
# Use f_regression for regression tasks
selector_reg = SelectKBest(score_func=f_regression, k=5)
X_reg_selected = selector_reg.fit_transform(X_reg, y_reg)
print(f"Regression - Original: {X_reg.shape[1]}, Selected: {X_reg_selected.shape[1]}")
print(f"Top features: {selector_reg.get_support(indices=True)}")
SelectPercentile Alternative
from sklearn.feature_selection import SelectPercentile
# Select top 25% of features
selector_pct = SelectPercentile(score_func=f_classif, percentile=25)
X_pct_selected = selector_pct.fit_transform(X, y)
print(f"SelectPercentile (25%): {X_pct_selected.shape[1]} features")
print(f"Selected indices: {selector_pct.get_support(indices=True)}")
Pipeline Integration
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Pipeline with feature selection
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('classifier', LogisticRegression(random_state=42))
])
# Compare with and without feature selection
model_all = LogisticRegression(random_state=42)
scores_all = cross_val_score(model_all, X, y, cv=5)
scores_selected = cross_val_score(pipeline, X, y, cv=5)
print(f"All features: {scores_all.mean():.3f}")
print(f"Selected features: {scores_selected.mean():.3f}")
Finding Optimal K
from sklearn.model_selection import GridSearchCV
# Grid search for optimal k
param_grid = {'selector__k': [3, 5, 7, 10, 15]}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)
print(f"Best k: {grid_search.best_params_['selector__k']}")
print(f"Best score: {grid_search.best_score_:.3f}")
Custom Scoring Function
from sklearn.metrics import mutual_info_score
def custom_score_func(X, y):
    """Score each feature by its mutual information with the target."""
    scores = []
    for i in range(X.shape[1]):
        # mutual_info_score expects discrete values, so bin each continuous feature first
        binned = np.digitize(X[:, i], np.histogram_bin_edges(X[:, i], bins=10))
        scores.append(mutual_info_score(binned, y))
    return np.array(scores)
# Use custom scoring
custom_selector = SelectKBest(score_func=custom_score_func, k=5)
X_custom = custom_selector.fit_transform(X, y)
print(f"Custom scoring selected: {custom_selector.get_support(indices=True)}")
Real Dataset Example
import pandas as pd
from sklearn.datasets import load_breast_cancer
# Load real dataset
data = load_breast_cancer()
X_real = data.data
y_real = data.target
feature_names = data.feature_names
# Select best features
selector_real = SelectKBest(f_classif, k=10)
X_real_selected = selector_real.fit_transform(X_real, y_real)
# Get selected feature names
selected_indices = selector_real.get_support(indices=True)
selected_features = [feature_names[i] for i in selected_indices]
print("Top 10 features:")
for i, (idx, score) in enumerate(zip(selected_indices, selector_real.scores_[selected_indices])):
    print(f"{i+1}. {feature_names[idx]}: {score:.2f}")
Variance Threshold
from sklearn.feature_selection import VarianceThreshold
# Remove low-variance features
variance_selector = VarianceThreshold(threshold=0.1)
X_variance = variance_selector.fit_transform(X)
print(f"After variance threshold: {X_variance.shape[1]} features")
print(f"Removed {X.shape[1] - X_variance.shape[1]} low-variance features")
Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
# RFE with SVM
estimator = SVC(kernel="linear", C=1)
rfe = RFE(estimator, n_features_to_select=5, step=1)
X_rfe = rfe.fit_transform(X, y)
print(f"RFE selected features: {rfe.get_support(indices=True)}")
print(f"Feature ranking: {rfe.ranking_[:10]}") # Show first 10
Feature Importance Visualization
import matplotlib.pyplot as plt
# Get feature scores
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
# Sort features by score
feature_scores = selector.scores_
sorted_indices = np.argsort(feature_scores)[::-1]
print("Feature importance ranking:")
for i in range(10):
    idx = sorted_indices[i]
    print(f"Feature {idx}: {feature_scores[idx]:.2f}")
Best Practices
- Use domain knowledge to guide feature selection
- Try different k values with cross-validation
- Combine multiple methods (variance threshold + SelectKBest)
- Consider computational cost vs. performance gain
- Always validate on unseen data
Master Feature Engineering
Explore advanced feature selection methods, learn dimensionality reduction techniques, and discover automated feature engineering.