Table Of Contents
- Beyond Accuracy
- Classification Metrics
- Confusion Matrix Analysis
- Multi-class Classification Metrics
- ROC and Precision-Recall Curves
- Regression Metrics
- Custom Metrics
- Model Comparison Framework
- Cross-Validation Metrics
- Business-Specific Metrics
- Metric Selection Guidelines
- Metrics Best Practices
- Master Model Evaluation
Beyond Accuracy
Accuracy alone doesn't tell the full story. Master comprehensive evaluation metrics to truly understand your model's strengths and weaknesses.
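To see why, consider a heavily imbalanced dataset: a model that always predicts the majority class scores high accuracy while catching none of the positives. A minimal sketch (the 95/5 class split and DummyClassifier baseline are illustrative choices, not part of the examples that follow):
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score
# Illustrative dataset with roughly 95% negatives and 5% positives
X_imb, y_imb = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=42)
# A "model" that always predicts the majority class
baseline = DummyClassifier(strategy="most_frequent").fit(X_imb, y_imb)
y_base = baseline.predict(X_imb)
print(f"Accuracy: {accuracy_score(y_imb, y_base):.3f}")  # ~0.95, looks impressive
print(f"Recall: {recall_score(y_imb, y_base):.3f}")  # 0.0, misses every positive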
Classification Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
# Generate sample data
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.3f}")
Confusion Matrix Analysis
import matplotlib.pyplot as plt
import seaborn as sns
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
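# Optional: visualize the confusion matrix as a heatmap using the matplotlib/seaborn
# imports above (a minimal sketch; the styling choices are arbitrary)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()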
# Calculate metrics from confusion matrix
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")
# Derived metrics
sensitivity = tp / (tp + fn) # Recall
specificity = tn / (tn + fp)
precision = tp / (tp + fp)
print(f"\nSensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {precision:.3f}")
Multi-class Classification Metrics
from sklearn.datasets import make_classification
# Multi-class data
X_multi, y_multi = make_classification(
    n_samples=1000, n_classes=3, n_informative=10, random_state=42
)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)
# Train multi-class model
multi_model = LogisticRegression(random_state=42, max_iter=1000)
multi_model.fit(X_train_multi, y_train_multi)
y_pred_multi = multi_model.predict(X_test_multi)
# Multi-class metrics with different averaging
print("Multi-class Metrics:")
print(f"Accuracy: {accuracy_score(y_test_multi, y_pred_multi):.3f}")
print(f"Precision (macro): {precision_score(y_test_multi, y_pred_multi, average='macro'):.3f}")
print(f"Precision (weighted): {precision_score(y_test_multi, y_pred_multi, average='weighted'):.3f}")
print(f"F1 (macro): {f1_score(y_test_multi, y_pred_multi, average='macro'):.3f}")
# Classification report
print("\nClassification Report:")
print(classification_report(y_test_multi, y_pred_multi))
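ROC-AUC also extends to the multi-class case by averaging one-vs-rest scores over the predicted class probabilities; a minimal sketch using the model above:
# One-vs-rest ROC-AUC from the full probability matrix
y_prob_multi = multi_model.predict_proba(X_test_multi)
print(f"ROC-AUC (OvR, macro): {roc_auc_score(y_test_multi, y_prob_multi, multi_class='ovr', average='macro'):.3f}")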
ROC and Precision-Recall Curves
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC: {roc_auc:.3f}")
# Precision-Recall Curve
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_prob)
avg_precision = average_precision_score(y_test, y_prob)
print(f"Average Precision: {avg_precision:.3f}")
# Find optimal threshold
# Drop the final PR point (precision=1, recall=0) so indices align with pr_thresholds,
# and add a small epsilon to avoid division by zero
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-10)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = pr_thresholds[best_threshold_idx]
print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1 score: {f1_scores[best_threshold_idx]:.3f}")
Regression Metrics
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Train regression model
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)
# Regression metrics
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
# Note: MAPE can blow up when true targets are near zero, as with make_regression data
mape = mean_absolute_percentage_error(y_test_reg, y_pred_reg)
print("Regression Metrics:")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"MAPE: {mape:.3f}")
Custom Metrics
from sklearn.metrics import make_scorer
def custom_accuracy_score(y_true, y_pred):
    """Custom metric: penalize false positives more"""
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    # Custom scoring: penalize FP twice as much as FN
    penalty = fn + 2 * fp
    total = len(y_true)
    return 1 - (penalty / total)
# Use custom metric
custom_score = custom_accuracy_score(y_test, y_pred)
print(f"Custom score: {custom_score:.3f}")
# Create scorer for cross-validation
custom_scorer = make_scorer(custom_accuracy_score)
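The scorer can then be passed to any scikit-learn utility that accepts a scoring argument; a minimal sketch with cross_val_score on the training data from above:
from sklearn.model_selection import cross_val_score
# Evaluate the custom metric across 5 folds
custom_cv_scores = cross_val_score(
    LogisticRegression(random_state=42), X_train, y_train,
    cv=5, scoring=custom_scorer
)
print(f"Custom metric (CV): {custom_cv_scores.mean():.3f} (+/- {custom_cv_scores.std() * 2:.3f})")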
Model Comparison Framework
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Comprehensive model evaluation"""
    # Fit model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Get probabilities if available
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_prob)
    else:
        auc_score = None
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'AUC': auc_score
    }
    return metrics
# Compare multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}
results = []
for name, model in models.items():
    result = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)
# Display results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.round(3))
Cross-Validation Metrics
from sklearn.model_selection import cross_validate
# Multiple metrics with cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}
cv_results = cross_validate(
    LogisticRegression(random_state=42),
    X_train, y_train,
    cv=5,
    scoring=scoring
)
print("Cross-Validation Results:")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.title()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
Business-Specific Metrics
def calculate_business_metrics(y_true, y_pred, cost_fp=10, cost_fn=50, revenue_tp=100):
    """Calculate business-specific metrics"""
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    # Business costs and revenues
    total_cost = fp * cost_fp + fn * cost_fn
    total_revenue = tp * revenue_tp
    net_profit = total_revenue - total_cost
    # Metrics
    cost_per_prediction = total_cost / len(y_true)
    profit_per_customer = net_profit / tp if tp > 0 else 0
    return {
        'Total Cost': total_cost,
        'Total Revenue': total_revenue,
        'Net Profit': net_profit,
        'Cost per Prediction': cost_per_prediction,
        'Profit per TP': profit_per_customer
    }
# Business evaluation
business_metrics = calculate_business_metrics(y_test, y_pred)
print("\nBusiness Metrics:")
for metric, value in business_metrics.items():
    print(f"{metric}: ${value:.2f}")
Metric Selection Guidelines
def recommend_metrics(problem_type, class_distribution=None):
    """Recommend appropriate metrics based on problem characteristics"""
    recommendations = {
        'balanced_classification': [
            'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc'
        ],
        'imbalanced_classification': [
            'precision', 'recall', 'f1_score', 'precision_recall_auc',
            'balanced_accuracy'
        ],
        'multi_class': [
            'accuracy', 'macro_f1', 'weighted_f1', 'confusion_matrix'
        ],
        'regression': [
            'mse', 'rmse', 'mae', 'r2_score', 'mape'
        ],
        'ranking': [
            'ndcg', 'map', 'precision_at_k'
        ]
    }
    return recommendations.get(problem_type, ['accuracy'])
# Get recommendations
imbalanced_metrics = recommend_metrics('imbalanced_classification')
print("Recommended metrics for imbalanced data:")
print(imbalanced_metrics)
Metrics Best Practices
- Classification: Prefer precision/recall (and PR-AUC) over accuracy for imbalanced data
- Regression: RMSE when large errors should be penalized heavily, MAE for robustness to outliers (see the short sketch after this list)
- Multi-class: Macro averaging when every class matters equally, weighted averaging to reflect class frequencies
- Business: Define custom metrics aligned with business costs and goals
- Always: Use cross-validation, not a single split, for reliable estimates
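As a quick illustration of the RMSE-versus-MAE point above, a toy example with one large residual (the numbers are invented for illustration):
import numpy as np
errors_clean = np.array([1.0, -1.0, 2.0, -2.0, 1.5])
errors_outlier = np.array([1.0, -1.0, 2.0, -2.0, 50.0])
for name, e in [('clean', errors_clean), ('with outlier', errors_outlier)]:
    rmse = np.sqrt(np.mean(e ** 2))
    mae = np.mean(np.abs(e))
    print(f"{name}: RMSE = {rmse:.2f}, MAE = {mae:.2f}")
# The single outlier inflates RMSE far more than MAE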
Master Model Evaluation
Explore statistical significance testing, learn A/B testing for models, and discover production monitoring metrics.