How to Evaluate Model Performance with Metrics

Measure machine learning model effectiveness using scikit-learn's comprehensive evaluation metrics for classification, regression, and clustering.

Beyond Accuracy

Accuracy alone doesn't tell the full story. Master comprehensive evaluation metrics to truly understand your model's strengths and weaknesses.
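
To see how accuracy can mislead, consider an imbalanced problem: a classifier that always predicts the majority class can look impressive on accuracy while never catching a single positive case. A minimal sketch (the 95/5 class split and the DummyClassifier baseline are illustrative choices, not part of the examples below):

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score, balanced_accuracy_score

# Illustrative imbalanced dataset: roughly 95% negatives, 5% positives
X_imb, y_imb = make_classification(
    n_samples=1000, n_classes=2, weights=[0.95, 0.05], random_state=42
)

# Baseline that always predicts the majority class
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_imb, y_imb)
y_baseline = baseline.predict(X_imb)

print(f"Accuracy: {accuracy_score(y_imb, y_baseline):.3f}")                    # looks great
print(f"Recall: {recall_score(y_imb, y_baseline):.3f}")                        # misses every positive
print(f"Balanced accuracy: {balanced_accuracy_score(y_imb, y_baseline):.3f}")  # no better than chance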

Classification Metrics

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

# Generate sample data
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.3f}")

Confusion Matrix Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
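
# Visualize the matrix as a heatmap (this puts the seaborn/matplotlib imports above
# to use; the styling choices here are illustrative, not part of the original example)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()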

# Calculate metrics from confusion matrix
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

# Derived metrics
sensitivity = tp / (tp + fn)  # Recall
specificity = tn / (tn + fp)
precision = tp / (tp + fp)

print(f"\nSensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {precision:.3f}")

Multi-class Classification Metrics

from sklearn.datasets import make_classification

# Multi-class data
X_multi, y_multi = make_classification(
    n_samples=1000, n_classes=3, n_informative=10, random_state=42
)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Train multi-class model
multi_model = LogisticRegression(random_state=42, max_iter=1000)
multi_model.fit(X_train_multi, y_train_multi)
y_pred_multi = multi_model.predict(X_test_multi)

# Multi-class metrics with different averaging
print("Multi-class Metrics:")
print(f"Accuracy: {accuracy_score(y_test_multi, y_pred_multi):.3f}")
print(f"Precision (macro): {precision_score(y_test_multi, y_pred_multi, average='macro'):.3f}")
print(f"Precision (weighted): {precision_score(y_test_multi, y_pred_multi, average='weighted'):.3f}")
print(f"F1 (macro): {f1_score(y_test_multi, y_pred_multi, average='macro'):.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_multi, y_pred_multi))
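
ROC-AUC also generalizes beyond two classes when given the full probability matrix; a short sketch using the one-vs-rest strategy that roc_auc_score supports through its multi_class parameter:

# Multi-class ROC-AUC, averaging one-vs-rest AUCs across classes
y_prob_multi = multi_model.predict_proba(X_test_multi)
auc_ovr = roc_auc_score(y_test_multi, y_prob_multi, multi_class='ovr', average='macro')
print(f"ROC-AUC (OvR, macro): {auc_ovr:.3f}")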

ROC and Precision-Recall Curves

from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"ROC AUC: {roc_auc:.3f}")

# Precision-Recall Curve
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_prob)
avg_precision = average_precision_score(y_test, y_prob)

print(f"Average Precision: {avg_precision:.3f}")

# Find the threshold that maximizes F1
# (drop the final precision/recall point, which has no corresponding threshold)
with np.errstate(divide='ignore', invalid='ignore'):
    f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
f1_scores = np.nan_to_num(f1_scores)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = pr_thresholds[best_threshold_idx]

print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1 score: {f1_scores[best_threshold_idx]:.3f}")

Regression Metrics

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train regression model
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)

# Regression metrics
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
# Note: MAPE is unstable when targets are near zero, as with make_regression's zero-centered targets
mape = mean_absolute_percentage_error(y_test_reg, y_pred_reg)

print("Regression Metrics:")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"MAPE: {mape:.3f}")

Custom Metrics

from sklearn.metrics import make_scorer

def custom_accuracy_score(y_true, y_pred):
    """Custom metric: penalize false positives more"""
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Custom scoring: penalize FP twice as much as FN
    penalty = fn + 2 * fp
    total = len(y_true)
    
    return 1 - (penalty / total)

# Use custom metric
custom_score = custom_accuracy_score(y_test, y_pred)
print(f"Custom score: {custom_score:.3f}")

# Create scorer for cross-validation
custom_scorer = make_scorer(custom_accuracy_score)
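
The scorer created above can be passed anywhere scikit-learn accepts a scoring argument; a brief usage sketch with cross_val_score:

from sklearn.model_selection import cross_val_score

# Cross-validate using the custom scorer
custom_cv_scores = cross_val_score(
    LogisticRegression(random_state=42), X_train, y_train,
    cv=5, scoring=custom_scorer
)
print(f"Custom score (CV): {custom_cv_scores.mean():.3f} (+/- {custom_cv_scores.std() * 2:.3f})")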

Model Comparison Framework

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Comprehensive model evaluation"""
    
    # Fit model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Get probabilities if available
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_prob)
    else:
        auc_score = None
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'AUC': auc_score
    }
    
    return metrics

# Compare multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

results = []
for name, model in models.items():
    result = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)

# Display results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.round(3))

Cross-Validation Metrics

from sklearn.model_selection import cross_validate

# Multiple metrics with cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv_results = cross_validate(
    LogisticRegression(random_state=42), 
    X_train, y_train, 
    cv=5, 
    scoring=scoring
)

print("Cross-Validation Results:")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.title()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

Business-Specific Metrics

def calculate_business_metrics(y_true, y_pred, cost_fp=10, cost_fn=50, revenue_tp=100):
    """Calculate business-specific metrics"""
    
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Business costs and revenues
    total_cost = fp * cost_fp + fn * cost_fn
    total_revenue = tp * revenue_tp
    net_profit = total_revenue - total_cost
    
    # Metrics
    cost_per_prediction = total_cost / len(y_true)
    profit_per_customer = net_profit / tp if tp > 0 else 0
    
    return {
        'Total Cost': total_cost,
        'Total Revenue': total_revenue,
        'Net Profit': net_profit,
        'Cost per Prediction': cost_per_prediction,
        'Profit per TP': profit_per_customer
    }

# Business evaluation
business_metrics = calculate_business_metrics(y_test, y_pred)
print("\nBusiness Metrics:")
for metric, value in business_metrics.items():
    print(f"{metric}: ${value:.2f}")

Metric Selection Guidelines

def recommend_metrics(problem_type):
    """Recommend appropriate metrics based on problem characteristics"""
    
    recommendations = {
        'balanced_classification': [
            'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc'
        ],
        'imbalanced_classification': [
            'precision', 'recall', 'f1_score', 'precision_recall_auc', 
            'balanced_accuracy'
        ],
        'multi_class': [
            'accuracy', 'macro_f1', 'weighted_f1', 'confusion_matrix'
        ],
        'regression': [
            'mse', 'rmse', 'mae', 'r2_score', 'mape'
        ],
        'ranking': [
            'ndcg', 'map', 'precision_at_k'
        ]
    }
    
    return recommendations.get(problem_type, ['accuracy'])

# Get recommendations
imbalanced_metrics = recommend_metrics('imbalanced_classification')
print("Recommended metrics for imbalanced data:")
print(imbalanced_metrics)
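
Keep in mind these are descriptive names, not scikit-learn scoring strings; when wiring a recommendation into cross-validation, map it to the corresponding scorer identifier first. An illustrative (assumed, not exhaustive) mapping:

# Illustrative mapping from descriptive names to sklearn scoring strings
sklearn_scoring_names = {
    'balanced_accuracy': 'balanced_accuracy',
    'precision_recall_auc': 'average_precision',
    'macro_f1': 'f1_macro',
    'weighted_f1': 'f1_weighted',
    'rmse': 'neg_root_mean_squared_error',
    'mape': 'neg_mean_absolute_percentage_error',
}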

Metrics Best Practices

  • Classification: Use precision/recall for imbalanced data
  • Regression: RMSE for outlier sensitivity, MAE for robustness
  • Multi-class: Macro averaging for equal class importance
  • Business: Define custom metrics aligned with business goals
  • Always: Use cross-validation for reliable estimates

Master Model Evaluation

Explore statistical significance testing, learn A/B testing for models, and discover production monitoring metrics.
