Table of Contents
- Missing Data, Not Missing Insights
- Understanding Missing Data Types
- Strategic Data Dropping with dropna()
- Smart Data Filling with fillna()
- Advanced Missing Data Strategies
- Handling Missing Data in Time Series
- Custom Missing Data Handling
- Validation After Cleaning
- Best Practices
- Master Data Quality
Missing Data, Not Missing Insights
Real-world data has gaps. Pandas' dropna() and fillna() methods give you precise control over removing or filling those gaps, turning incomplete datasets into ones you can actually analyze.
Understanding Missing Data Types
import pandas as pd
import numpy as np
# Create sample data with missing values
data = {
    'name': ['Alice', 'Bob', None, 'David', 'Eve'],
    'age': [25, np.nan, 35, 28, np.nan],
    'salary': [50000, 60000, np.nan, 55000, 70000],
    'department': ['IT', 'HR', 'IT', None, 'Finance']
}
df = pd.DataFrame(data)
# Identify missing data
print("Missing data overview:")
print(df.isnull().sum()) # Count missing values per column
print(f"\nTotal missing: {df.isnull().sum().sum()}")
print(f"Missing percentage: {df.isnull().sum().sum() / df.size * 100:.1f}%")
# Visualize missing patterns
print("\nMissing data pattern:")
print(df.isnull()) # True = missing, False = not missing
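Pandas represents "missing" with more than one sentinel, depending on the column's dtype. The short sketch below (the column names are just illustrative) shows the common ones; isnull() detects all of them.
# NaN for floats, None in object columns, NaT for datetimes,
# and pd.NA for the nullable extension dtypes -- isnull() flags them all
sentinels = pd.DataFrame({
    'float_col': [1.0, np.nan],
    'object_col': ['x', None],
    'datetime_col': [pd.Timestamp('2025-01-01'), pd.NaT],
    'nullable_int': pd.array([1, pd.NA], dtype='Int64')
})
print(sentinels.isnull())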
Strategic Data Dropping with dropna()
# Drop rows with ANY missing values
df_drop_any = df.dropna()
print(f"Original rows: {len(df)}, After dropna(): {len(df_drop_any)}")
# Drop rows with ALL missing values
df_all_missing = pd.DataFrame({
    'A': [1, np.nan, 3],
    'B': [np.nan, np.nan, 6],
    'C': [7, np.nan, 9]
})
df_drop_all = df_all_missing.dropna(how='all') # Keeps rows with some data
print(df_drop_all)
# Drop based on specific columns
df_drop_subset = df.dropna(subset=['age', 'salary']) # Must have both
print(f"After dropping missing age/salary: {len(df_drop_subset)} rows")
# Threshold-based dropping
df_threshold = df.dropna(thresh=3) # Keep rows with at least 3 non-null values
print(f"After threshold=3: {len(df_threshold)} rows")
# Drop columns with missing data
df_drop_cols = df.dropna(axis=1)  # axis=1 targets columns
print(f"Columns after dropping: {list(df_drop_cols.columns)}")  # empty here: every column has a gap
Smart Data Filling with fillna()
# Fill with constant values
df_filled_zero = df.fillna(0) # Replace all NaN with 0
df_filled_dict = df.fillna({
    'age': df['age'].mean(),          # Fill age with the column mean
    'salary': df['salary'].median(),  # Fill salary with the column median
    'name': 'Unknown',                # Fill name with a string placeholder
    'department': 'General'           # Fill department with a default category
})
print(df_filled_dict)
# Forward and backward filling
time_series = pd.DataFrame({
    'date': pd.date_range('2025-01-01', periods=7),
    'value': [10, np.nan, np.nan, 20, np.nan, 30, np.nan]
})
# Forward fill (propagate the last valid value)
time_series['ffill'] = time_series['value'].ffill()
# Backward fill (use the next valid value)
time_series['bfill'] = time_series['value'].bfill()
# Interpolation (smart filling)
time_series['interpolated'] = time_series['value'].interpolate()
print(time_series)
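Both ffill() and bfill() accept a limit argument that caps how many consecutive gaps get filled, which keeps a stale value from propagating across long runs of missing data. A quick sketch on the same series:
# Fill at most one consecutive gap; longer runs stay NaN
time_series['ffill_limited'] = time_series['value'].ffill(limit=1)
print(time_series[['value', 'ffill_limited']])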
Advanced Missing Data Strategies
# Group-based filling
sales_data = pd.DataFrame({
    'region': ['North', 'North', 'South', 'South', 'East', 'East'],
    'product': ['A', 'B', 'A', 'B', 'A', 'B'],
    'sales': [100, np.nan, 150, 200, np.nan, 180],
    'costs': [50, 45, np.nan, 80, 60, np.nan]
})
# Fill missing sales with regional average
sales_filled = sales_data.copy()
sales_filled['sales'] = sales_filled.groupby('region')['sales'].transform(
    lambda x: x.fillna(x.mean())
)
# Fill missing costs with product average
sales_filled['costs'] = sales_filled.groupby('product')['costs'].transform(
    lambda x: x.fillna(x.mean())
)
print(sales_filled)
# Multiple imputation strategies
def smart_impute(df, column, strategy='mean'):
    """Impute a column based on its dtype and the chosen strategy"""
    if pd.api.types.is_numeric_dtype(df[column]):
        if strategy == 'mean':
            return df[column].fillna(df[column].mean())
        elif strategy == 'median':
            return df[column].fillna(df[column].median())
        elif strategy == 'mode':
            return df[column].fillna(df[column].mode().iloc[0])
        raise ValueError(f"Unknown strategy: {strategy}")
    else:
        # Categorical data: fall back to the most frequent value
        return df[column].fillna(df[column].mode().iloc[0])
# Apply smart imputation
df_smart = df.copy()
df_smart['age'] = smart_impute(df_smart, 'age', 'median')
df_smart['salary'] = smart_impute(df_smart, 'salary', 'mean')
df_smart['department'] = smart_impute(df_smart, 'department', 'mode')
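Whatever imputation you choose, it can pay to record where values were missing before overwriting them, since the gaps themselves may carry signal. A minimal sketch (the *_was_missing column names are just a convention, not a pandas feature):
# Flag missing cells before imputing so the information isn't erased
df_flagged = df.copy()
for col in ['age', 'salary']:
    df_flagged[f'{col}_was_missing'] = df_flagged[col].isnull()
    df_flagged[col] = df_flagged[col].fillna(df_flagged[col].median())
print(df_flagged[['age', 'age_was_missing', 'salary', 'salary_was_missing']])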
Handling Missing Data in Time Series
# Create time series with gaps
dates = pd.date_range('2025-01-01', periods=30, freq='D')
values = np.random.randn(30)
values[[5, 10, 15, 20, 25]] = np.nan # Introduce missing values
ts_df = pd.DataFrame({'date': dates, 'value': values})
ts_df.set_index('date', inplace=True)
# Time-aware interpolation methods
ts_df['linear'] = ts_df['value'].interpolate(method='linear')
ts_df['time'] = ts_df['value'].interpolate(method='time') # Time-weighted
ts_df['polynomial'] = ts_df['value'].interpolate(method='polynomial', order=2)  # requires SciPy
# Simple fallback: forward fill, then backward fill to cover leading gaps
ts_df['filled'] = ts_df['value'].ffill().bfill()
print(ts_df.head(10))
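Note that time series are often implicitly missing data: whole timestamps are absent from the index rather than appearing as NaN values in it. asfreq() makes those gaps explicit before any filling. A sketch, reusing ts_df:
# Keep every 3rd day to simulate implicit gaps, then reinsert missing days as NaN
sparse = ts_df['value'].iloc[::3]
regular = sparse.asfreq('D')
print(f"Rows: {len(sparse)} -> {len(regular)}, explicit NaNs: {regular.isnull().sum()}")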
Custom Missing Data Handling
def analyze_missing_patterns(df):
    """Comprehensive missing data analysis"""
    analysis = {}
    # Overall statistics
    analysis['total_missing'] = df.isnull().sum().sum()
    analysis['missing_percent'] = (df.isnull().sum().sum() / df.size) * 100
    # Per-column analysis
    analysis['column_missing'] = df.isnull().sum().to_dict()
    analysis['column_missing_percent'] = (df.isnull().sum() / len(df) * 100).to_dict()
    # Missing combinations (which columns are missing together)
    missing_combinations = df.isnull().value_counts()
    analysis['missing_patterns'] = missing_combinations.to_dict()
    return analysis
def handle_missing_data(df):
    """Automatic missing data handling based on per-column missingness"""
    df_clean = df.copy()
    for column in df_clean.columns:
        missing_pct = df_clean[column].isnull().sum() / len(df_clean) * 100
        if missing_pct > 50:  # Too much missing data
            print(f"Dropping {column}: {missing_pct:.1f}% missing")
            df_clean = df_clean.drop(columns=[column])
        elif missing_pct > 0:  # Some missing data
            if pd.api.types.is_numeric_dtype(df_clean[column]):
                # Numeric: use median
                fill_value = df_clean[column].median()
                df_clean[column] = df_clean[column].fillna(fill_value)
                print(f"Filled {column} with median: {fill_value}")
            else:
                # Categorical: use mode, with a fallback for all-missing columns
                mode = df_clean[column].mode()
                fill_value = mode.iloc[0] if not mode.empty else 'Unknown'
                df_clean[column] = df_clean[column].fillna(fill_value)
                print(f"Filled {column} with mode: {fill_value}")
    return df_clean
# Usage
analysis = analyze_missing_patterns(df)
print("Missing data analysis:", analysis)
df_auto_cleaned = handle_missing_data(df)
print(f"\nCleaned data shape: {df_auto_cleaned.shape}")
Validation After Cleaning
def validate_cleaning(original_df, cleaned_df):
    """Validate data cleaning results"""
    print("Data Cleaning Validation Report")
    print("=" * 40)
    # Shape comparison
    print(f"Original shape: {original_df.shape}")
    print(f"Cleaned shape: {cleaned_df.shape}")
    print(f"Rows removed: {original_df.shape[0] - cleaned_df.shape[0]}")
    print(f"Columns removed: {original_df.shape[1] - cleaned_df.shape[1]}")
    # Missing data comparison
    orig_missing = original_df.isnull().sum().sum()
    clean_missing = cleaned_df.isnull().sum().sum()
    print(f"\nMissing values - Original: {orig_missing}, Cleaned: {clean_missing}")
    # Data integrity checks
    if clean_missing == 0:
        print("✅ No missing values remaining")
    else:
        print("⚠️ Some missing values still present")
    return {
        'rows_kept': cleaned_df.shape[0] / original_df.shape[0],
        'data_completeness': 1 - (clean_missing / cleaned_df.size)
    }
# Validate our cleaning
validation = validate_cleaning(df, df_auto_cleaned)
print(f"\nData retention: {validation['rows_kept']:.2%}")
print(f"Data completeness: {validation['data_completeness']:.2%}")
Best Practices
- Analyze before acting - understand your missing data patterns
- Consider domain knowledge - some missing values are meaningful
- Document your decisions - track what imputation methods you used (see the sketch after this list)
- Validate results - ensure cleaning doesn't introduce bias
- Keep originals - always work on copies of your data
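A lightweight way to follow the documentation advice above is to append each decision to a log as you make it. A minimal sketch, using a hypothetical cleaning_log list:
# Record each imputation so it can be reported or reproduced later
cleaning_log = []
df_doc = df.copy()
median_age = df_doc['age'].median()
df_doc['age'] = df_doc['age'].fillna(median_age)
cleaning_log.append({'column': 'age', 'method': 'median', 'fill_value': median_age})
print(cleaning_log)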
Master Data Quality
Explore advanced data validation techniques, learn statistical imputation methods, and discover automated data quality pipelines.