How to Handle Missing Data with fillna() and dropna()

Transform messy datasets into analysis-ready data with pandas' fillna() and dropna(), the heart of your data-cleaning toolkit.

Missing Data, Not Missing Insights

Real-world data has gaps. Pandas' fillna() and dropna() give you two complementary strategies for closing them: fill missing values with sensible substitutes, or drop the rows and columns that can't be salvaged.
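
As a quick preview, here is the one-line version of each strategy on a toy Series (a minimal sketch; the rest of the article works through a richer DataFrame):

import pandas as pd
import numpy as np

s = pd.Series([1.0, np.nan, 3.0])
print(s.fillna(0))   # fill the gap with a constant value
print(s.dropna())    # or drop the missing entry entirely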

Understanding Missing Data Types

import pandas as pd
import numpy as np

# Create sample data with missing values
data = {
    'name': ['Alice', 'Bob', None, 'David', 'Eve'],
    'age': [25, np.nan, 35, 28, np.nan],
    'salary': [50000, 60000, np.nan, 55000, 70000],
    'department': ['IT', 'HR', 'IT', None, 'Finance']
}
df = pd.DataFrame(data)

# Identify missing data
print("Missing data overview:")
print(df.isnull().sum())        # Count missing values per column
print(f"\nTotal missing: {df.isnull().sum().sum()}")
print(f"Missing percentage: {df.isnull().sum().sum() / df.size * 100:.1f}%")

# Visualize missing patterns
print("\nMissing data pattern:")
print(df.isnull())  # True = missing, False = not missing
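
It often helps to look at the affected rows themselves, not just the counts. A short follow-up sketch on the same df:

# Show only the rows that contain at least one missing value
print(df[df.isnull().any(axis=1)])

# Per-column missing percentage, worst offenders first
print((df.isnull().mean() * 100).sort_values(ascending=False))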

Strategic Data Dropping with dropna()

# Drop rows with ANY missing values
df_drop_any = df.dropna()
print(f"Original rows: {len(df)}, After dropna(): {len(df_drop_any)}")

# Drop rows with ALL missing values
df_all_missing = pd.DataFrame({
    'A': [1, np.nan, 3],
    'B': [np.nan, np.nan, 6],
    'C': [7, np.nan, 9]
})
df_drop_all = df_all_missing.dropna(how='all')  # Keeps rows with some data
print(df_drop_all)

# Drop based on specific columns
df_drop_subset = df.dropna(subset=['age', 'salary'])  # Must have both
print(f"After dropping missing age/salary: {len(df_drop_subset)} rows")

# Threshold-based dropping
df_threshold = df.dropna(thresh=3)  # Keep rows with at least 3 non-null values
print(f"After threshold=3: {len(df_threshold)} rows")

# Drop columns with missing data
df_drop_cols = df.dropna(axis=1)  # axis=1 for columns
print(f"Columns after dropping: {list(df_drop_cols.columns)}")

Smart Data Filling with fillna()

# Fill with constant values
df_filled_zero = df.fillna(0)  # Replace all NaN with 0
df_filled_dict = df.fillna({
    'age': df['age'].mean(),           # Fill age with mean
    'salary': df['salary'].median(),   # Fill salary with median  
    'name': 'Unknown',                 # Fill name with string
    'department': 'General'            # Fill department with default
})
print(df_filled_dict)
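
One side effect worth knowing: NaN is a float, so any numeric column that contained missing values is stored as float64. If you want integer dtypes back after filling, convert explicitly (a small sketch; the round() step is a deliberate choice for the mean-filled age column):

# NaN forces numeric columns to float64; restore integers after filling
df_filled_dict['age'] = df_filled_dict['age'].round().astype('int64')
print(df_filled_dict.dtypes)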

# Forward and backward filling
time_series = pd.DataFrame({
    'date': pd.date_range('2025-01-01', periods=7),
    'value': [10, np.nan, np.nan, 20, np.nan, 30, np.nan]
})

# Forward fill (propagate the last valid value)
# Note: fillna(method='ffill') is deprecated in modern pandas; use .ffill()
time_series['ffill'] = time_series['value'].ffill()

# Backward fill (use the next valid value)
time_series['bfill'] = time_series['value'].bfill()

# Interpolation (smart filling)
time_series['interpolated'] = time_series['value'].interpolate()
print(time_series)
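
Both fill directions accept a limit argument that caps how many consecutive gaps get filled, so long runs of missing data aren't silently papered over. A short sketch on the same frame (the column name is illustrative):

# Fill at most one consecutive gap; longer runs of NaN stay missing
time_series['ffill_limited'] = time_series['value'].ffill(limit=1)
print(time_series[['value', 'ffill_limited']])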

Advanced Missing Data Strategies

# Group-based filling
sales_data = pd.DataFrame({
    'region': ['North', 'North', 'South', 'South', 'East', 'East'],
    'product': ['A', 'B', 'A', 'B', 'A', 'B'],
    'sales': [100, np.nan, 150, 200, np.nan, 180],
    'costs': [50, 45, np.nan, 80, 60, np.nan]
})

# Fill missing sales with regional average
sales_filled = sales_data.copy()
sales_filled['sales'] = sales_filled.groupby('region')['sales'].transform(
    lambda x: x.fillna(x.mean())
)

# Fill missing costs with product average
sales_filled['costs'] = sales_filled.groupby('product')['costs'].transform(
    lambda x: x.fillna(x.mean())
)
print(sales_filled)
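
One caveat: if a group is entirely missing, its mean is itself NaN and the transform leaves the gap untouched. A common fallback is to chain a global-mean fill afterwards:

# Fall back to the overall mean for any group that was entirely missing
sales_filled['sales'] = sales_filled['sales'].fillna(sales_data['sales'].mean())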

# Multiple imputation strategies
def smart_impute(df, column, strategy='mean'):
    """Impute a column based on its dtype and the chosen strategy"""
    if pd.api.types.is_numeric_dtype(df[column]):
        if strategy == 'median':
            return df[column].fillna(df[column].median())
        elif strategy == 'mode':
            return df[column].fillna(df[column].mode().iloc[0])
        return df[column].fillna(df[column].mean())  # default: mean
    # Categorical data: fall back to the most frequent value
    return df[column].fillna(df[column].mode().iloc[0])

# Apply smart imputation
df_smart = df.copy()
df_smart['age'] = smart_impute(df_smart, 'age', 'median')
df_smart['salary'] = smart_impute(df_smart, 'salary', 'mean')
df_smart['department'] = smart_impute(df_smart, 'department', 'mode')
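
To apply the helper across a whole frame instead of column by column, a small loop works; here numeric columns get the median and everything else the mode (a sketch building on smart_impute above):

df_smart_all = df.copy()
for col in df_smart_all.columns:
    if df_smart_all[col].isnull().any():
        strategy = 'median' if pd.api.types.is_numeric_dtype(df_smart_all[col]) else 'mode'
        df_smart_all[col] = smart_impute(df_smart_all, col, strategy)
print(df_smart_all)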

Handling Missing Data in Time Series

# Create time series with gaps
dates = pd.date_range('2025-01-01', periods=30, freq='D')
values = np.random.randn(30)
values[[5, 10, 15, 20, 25]] = np.nan  # Introduce missing values

ts_df = pd.DataFrame({'date': dates, 'value': values})
ts_df.set_index('date', inplace=True)

# Time-aware interpolation methods
ts_df['linear'] = ts_df['value'].interpolate(method='linear')
ts_df['time'] = ts_df['value'].interpolate(method='time')  # Time-weighted
ts_df['polynomial'] = ts_df['value'].interpolate(method='polynomial', order=2)  # requires SciPy

# Fallback chain: forward fill, then backward fill for any leading gaps
ts_df['ffill_bfill'] = ts_df['value'].ffill().bfill()
print(ts_df.head(10))
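
interpolate() also accepts a limit_area argument that restricts where filling happens; 'inside' touches only NaNs surrounded by valid observations and leaves leading or trailing gaps alone (the column name below is illustrative):

# Only fill NaNs that sit between valid observations; leave edge gaps alone
ts_df['interior_only'] = ts_df['value'].interpolate(method='linear', limit_area='inside')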

Custom Missing Data Handling

def analyze_missing_patterns(df):
    """Comprehensive missing data analysis"""
    analysis = {}
    
    # Overall statistics
    analysis['total_missing'] = df.isnull().sum().sum()
    analysis['missing_percent'] = (df.isnull().sum().sum() / df.size) * 100
    
    # Per column analysis
    analysis['column_missing'] = df.isnull().sum().to_dict()
    analysis['column_missing_percent'] = (df.isnull().sum() / len(df) * 100).to_dict()
    
    # Missing combinations
    missing_combinations = df.isnull().value_counts()
    analysis['missing_patterns'] = missing_combinations.to_dict()
    
    return analysis

def handle_missing_data(df):
    """Automatically drop or fill each column based on how much of it is missing"""
    df_clean = df.copy()
    
    for column in df_clean.columns:
        missing_pct = df_clean[column].isnull().sum() / len(df_clean) * 100
        
        if missing_pct > 50:  # Too much missing data
            print(f"Dropping {column}: {missing_pct:.1f}% missing")
            df_clean = df_clean.drop(columns=[column])
            
        elif missing_pct > 0:  # Some missing data
            if pd.api.types.is_numeric_dtype(df_clean[column]):
                # Numeric: use median
                fill_value = df_clean[column].median()
                df_clean[column] = df_clean[column].fillna(fill_value)
                print(f"Filled {column} with median: {fill_value}")
            else:
                # Categorical: use mode
                fill_value = df_clean[column].mode().iloc[0] if not df_clean[column].mode().empty else 'Unknown'
                df_clean[column] = df_clean[column].fillna(fill_value)
                print(f"Filled {column} with mode: {fill_value}")
    
    return df_clean

# Usage
analysis = analyze_missing_patterns(df)
print("Missing data analysis:", analysis)

df_auto_cleaned = handle_missing_data(df)
print(f"\nCleaned data shape: {df_auto_cleaned.shape}")

Validation After Cleaning

def validate_cleaning(original_df, cleaned_df):
    """Validate data cleaning results"""
    print("Data Cleaning Validation Report")
    print("=" * 40)
    
    # Shape comparison
    print(f"Original shape: {original_df.shape}")
    print(f"Cleaned shape: {cleaned_df.shape}")
    print(f"Rows removed: {original_df.shape[0] - cleaned_df.shape[0]}")
    print(f"Columns removed: {original_df.shape[1] - cleaned_df.shape[1]}")
    
    # Missing data comparison
    orig_missing = original_df.isnull().sum().sum()
    clean_missing = cleaned_df.isnull().sum().sum()
    print(f"\nMissing values - Original: {orig_missing}, Cleaned: {clean_missing}")
    
    # Data integrity checks
    if cleaned_df.isnull().sum().sum() == 0:
        print("✅ No missing values remaining")
    else:
        print("⚠️  Some missing values still present")
    
    return {
        'rows_kept': cleaned_df.shape[0] / original_df.shape[0],
        'data_completeness': 1 - (clean_missing / cleaned_df.size)
    }

# Validate our cleaning
validation = validate_cleaning(df, df_auto_cleaned)
print(f"\nData retention: {validation['rows_kept']:.2%}")
print(f"Data completeness: {validation['data_completeness']:.2%}")

Best Practices

  • Analyze before acting - understand your missing data patterns
  • Consider domain knowledge - some missing values are meaningful
  • Document your decisions - track what imputation methods you used
  • Validate results - ensure cleaning doesn't introduce bias
  • Keep originals - always work on copies of your data (see the sketch after this list)
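
A minimal sketch that combines the last three points, keeping the original frame, logging each decision, and checking for bias afterwards (the cleaning_log structure is an illustrative convention, not a pandas feature):

# Keep the original untouched; clean a copy and log every decision
df_clean = df.copy()
cleaning_log = []

fill_value = df_clean['age'].median()
df_clean['age'] = df_clean['age'].fillna(fill_value)
cleaning_log.append({'column': 'age', 'method': 'median', 'value': fill_value})

# Compare distributions before and after to spot introduced bias
print(df['age'].describe())
print(df_clean['age'].describe())
print(cleaning_log)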

Master Data Quality

Explore advanced data validation techniques, learn statistical imputation methods, and discover automated data quality pipelines.
