How to Evaluate Model Performance with Metrics

Measure machine learning model effectiveness using scikit-learn's comprehensive evaluation metrics for classification, regression, and clustering.

Beyond Accuracy

Accuracy alone doesn't tell the full story. Master comprehensive evaluation metrics to truly understand your model's strengths and weaknesses.
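
To see how accuracy can mislead, consider an imbalanced problem: a classifier that always predicts the majority class can look impressive on accuracy while never catching a single positive case. A minimal sketch (the 95/5 class split and the DummyClassifier baseline are illustrative choices, not part of the examples below):

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, recall_score, balanced_accuracy_score

# Illustrative imbalanced dataset: roughly 95% negatives, 5% positives
X_imb, y_imb = make_classification(
    n_samples=1000, n_classes=2, weights=[0.95, 0.05], random_state=42
)

# Baseline that always predicts the majority class
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_imb, y_imb)
y_baseline = baseline.predict(X_imb)

print(f"Accuracy: {accuracy_score(y_imb, y_baseline):.3f}")                    # looks great
print(f"Recall: {recall_score(y_imb, y_baseline):.3f}")                        # misses every positive
print(f"Balanced accuracy: {balanced_accuracy_score(y_imb, y_baseline):.3f}")  # no better than chance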

Classification Metrics

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

# Generate sample data
X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Basic metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.3f}")

Confusion Matrix Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
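
# Visualize the matrix as a heatmap (this puts the seaborn/matplotlib imports above
# to use; the styling choices here are illustrative, not part of the original example)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()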

# Calculate metrics from confusion matrix
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

# Derived metrics
sensitivity = tp / (tp + fn)  # Recall
specificity = tn / (tn + fp)
precision = tp / (tp + fp)

print(f"\nSensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {precision:.3f}")

Multi-class Classification Metrics

from sklearn.datasets import make_classification

# Multi-class data
X_multi, y_multi = make_classification(
    n_samples=1000, n_classes=3, n_informative=10, random_state=42
)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Train multi-class model
multi_model = LogisticRegression(random_state=42, max_iter=1000)
multi_model.fit(X_train_multi, y_train_multi)
y_pred_multi = multi_model.predict(X_test_multi)

# Multi-class metrics with different averaging
print("Multi-class Metrics:")
print(f"Accuracy: {accuracy_score(y_test_multi, y_pred_multi):.3f}")
print(f"Precision (macro): {precision_score(y_test_multi, y_pred_multi, average='macro'):.3f}")
print(f"Precision (weighted): {precision_score(y_test_multi, y_pred_multi, average='weighted'):.3f}")
print(f"F1 (macro): {f1_score(y_test_multi, y_pred_multi, average='macro'):.3f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test_multi, y_pred_multi))
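
ROC-AUC also generalizes beyond two classes when given the full probability matrix; a short sketch using the one-vs-rest strategy that roc_auc_score supports through its multi_class parameter:

# Multi-class ROC-AUC, averaging one-vs-rest AUCs across classes
y_prob_multi = multi_model.predict_proba(X_test_multi)
auc_ovr = roc_auc_score(y_test_multi, y_prob_multi, multi_class='ovr', average='macro')
print(f"ROC-AUC (OvR, macro): {auc_ovr:.3f}")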

ROC and Precision-Recall Curves

from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"ROC AUC: {roc_auc:.3f}")

# Precision-Recall Curve
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_prob)
avg_precision = average_precision_score(y_test, y_prob)

print(f"Average Precision: {avg_precision:.3f}")

# Find the threshold that maximizes F1
# (drop the final precision/recall point, which has no corresponding threshold)
with np.errstate(divide='ignore', invalid='ignore'):
    f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1])
f1_scores = np.nan_to_num(f1_scores)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = pr_thresholds[best_threshold_idx]

print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1 score: {f1_scores[best_threshold_idx]:.3f}")

Regression Metrics

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Regression data
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train regression model
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)

# Regression metrics
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
# Note: MAPE is unstable when targets are near zero, as with make_regression's zero-centered targets
mape = mean_absolute_percentage_error(y_test_reg, y_pred_reg)

print("Regression Metrics:")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"MAPE: {mape:.3f}")

Custom Metrics

from sklearn.metrics import make_scorer

def custom_accuracy_score(y_true, y_pred):
    """Custom metric: penalize false positives more"""
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Custom scoring: penalize FP twice as much as FN
    penalty = fn + 2 * fp
    total = len(y_true)
    
    return 1 - (penalty / total)

# Use custom metric
custom_score = custom_accuracy_score(y_test, y_pred)
print(f"Custom score: {custom_score:.3f}")

# Create scorer for cross-validation
custom_scorer = make_scorer(custom_accuracy_score)
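
The scorer created above can be passed anywhere scikit-learn accepts a scoring argument; a brief usage sketch with cross_val_score:

from sklearn.model_selection import cross_val_score

# Cross-validate using the custom scorer
custom_cv_scores = cross_val_score(
    LogisticRegression(random_state=42), X_train, y_train,
    cv=5, scoring=custom_scorer
)
print(f"Custom score (CV): {custom_cv_scores.mean():.3f} (+/- {custom_cv_scores.std() * 2:.3f})")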

Model Comparison Framework

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Comprehensive model evaluation"""
    
    # Fit model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Get probabilities if available
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_prob)
    else:
        auc_score = None
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'AUC': auc_score
    }
    
    return metrics

# Compare multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

results = []
for name, model in models.items():
    result = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)

# Display results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.round(3))

Cross-Validation Metrics

from sklearn.model_selection import cross_validate

# Multiple metrics with cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

cv_results = cross_validate(
    LogisticRegression(random_state=42), 
    X_train, y_train, 
    cv=5, 
    scoring=scoring
)

print("Cross-Validation Results:")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.title()}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

Business-Specific Metrics

def calculate_business_metrics(y_true, y_pred, cost_fp=10, cost_fn=50, revenue_tp=100):
    """Calculate business-specific metrics"""
    
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Business costs and revenues
    total_cost = fp * cost_fp + fn * cost_fn
    total_revenue = tp * revenue_tp
    net_profit = total_revenue - total_cost
    
    # Metrics
    cost_per_prediction = total_cost / len(y_true)
    profit_per_customer = net_profit / tp if tp > 0 else 0
    
    return {
        'Total Cost': total_cost,
        'Total Revenue': total_revenue,
        'Net Profit': net_profit,
        'Cost per Prediction': cost_per_prediction,
        'Profit per TP': profit_per_customer
    }

# Business evaluation
business_metrics = calculate_business_metrics(y_test, y_pred)
print("\nBusiness Metrics:")
for metric, value in business_metrics.items():
    print(f"{metric}: ${value:.2f}")

Metric Selection Guidelines

def recommend_metrics(problem_type):
    """Recommend appropriate metrics based on problem characteristics"""
    
    recommendations = {
        'balanced_classification': [
            'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc'
        ],
        'imbalanced_classification': [
            'precision', 'recall', 'f1_score', 'precision_recall_auc', 
            'balanced_accuracy'
        ],
        'multi_class': [
            'accuracy', 'macro_f1', 'weighted_f1', 'confusion_matrix'
        ],
        'regression': [
            'mse', 'rmse', 'mae', 'r2_score', 'mape'
        ],
        'ranking': [
            'ndcg', 'map', 'precision_at_k'
        ]
    }
    
    return recommendations.get(problem_type, ['accuracy'])

# Get recommendations
imbalanced_metrics = recommend_metrics('imbalanced_classification')
print("Recommended metrics for imbalanced data:")
print(imbalanced_metrics)
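
Keep in mind these are descriptive names, not scikit-learn scoring strings; when wiring a recommendation into cross-validation, map it to the corresponding scorer identifier first. An illustrative (assumed, not exhaustive) mapping:

# Illustrative mapping from descriptive names to sklearn scoring strings
sklearn_scoring_names = {
    'balanced_accuracy': 'balanced_accuracy',
    'precision_recall_auc': 'average_precision',
    'macro_f1': 'f1_macro',
    'weighted_f1': 'f1_weighted',
    'rmse': 'neg_root_mean_squared_error',
    'mape': 'neg_mean_absolute_percentage_error',
}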

Metrics Best Practices

  • Classification: Use precision/recall for imbalanced data
  • Regression: RMSE for outlier sensitivity, MAE for robustness
  • Multi-class: Macro averaging for equal class importance
  • Business: Define custom metrics aligned with business goals
  • Always: Use cross-validation for reliable estimates

Master Model Evaluation

Explore statistical significance testing, learn A/B testing for models, and discover production monitoring metrics.
