Table Of Contents
- Data Quality Through Deduplication
- Identifying Duplicate Rows
- Strategic Duplicate Removal
- Advanced Duplicate Analysis
- Handling Near-Duplicates
- Business Logic for Duplicate Resolution
- Memory-Efficient Duplicate Handling
- Duplicate Validation and Reporting
- Best Practices for Duplicate Handling
- Master Data Quality
Data Quality Through Deduplication
Duplicate rows silently corrupt analysis results. Master pandas' duplicate handling functions to ensure data integrity and accurate insights from your datasets.
Identifying Duplicate Rows
import pandas as pd
import numpy as np
# Sample data with duplicates
customer_data = pd.DataFrame({
    'customer_id': [1, 2, 3, 2, 4, 1, 5],
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Bob Smith',
             'Diana Prince', 'Alice Johnson', 'Eve Wilson'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com',
              'bob@email.com', 'diana@email.com', 'alice@email.com', 'eve@email.com'],
    'age': [28, 35, 42, 35, 31, 28, 29],
    'city': ['NYC', 'LA', 'Chicago', 'LA', 'Boston', 'NYC', 'Seattle']
})
print("Original data:")
print(customer_data)
# Check for duplicate rows
print(f"\nTotal rows: {len(customer_data)}")
print(f"Duplicate rows: {customer_data.duplicated().sum()}")
print(f"Unique rows: {len(customer_data) - customer_data.duplicated().sum()}")
# Show which rows are duplicates
print("\nDuplicate row indicators:")
print(customer_data.duplicated()) # True = duplicate
# Display actual duplicate rows
duplicate_rows = customer_data[customer_data.duplicated()]
print("\nActual duplicate rows:")
print(duplicate_rows)
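By default, duplicated() marks only the second and later occurrences. To inspect every member of each duplicate group side by side, including the first occurrence, pass keep=False; a quick sketch:
# Show every row that belongs to a duplicate group (first occurrences included)
all_duplicate_members = customer_data[customer_data.duplicated(keep=False)]
print("\nAll rows involved in duplication:")
print(all_duplicate_members.sort_values('customer_id'))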
Strategic Duplicate Removal
# Remove exact duplicates (keep first occurrence)
cleaned_data = customer_data.drop_duplicates()
print("After removing exact duplicates:")
print(cleaned_data)
print(f"Rows removed: {len(customer_data) - len(cleaned_data)}")
# Keep last occurrence instead of first
keep_last = customer_data.drop_duplicates(keep='last')
print("\nKeeping last occurrence:")
print(keep_last)
# Remove all duplicates (keep none)
keep_none = customer_data.drop_duplicates(keep=False)
print(f"\nKeeping no duplicates: {len(keep_none)} rows")
# Remove duplicates based on specific columns
unique_customers = customer_data.drop_duplicates(subset=['customer_id'])
print("\nUnique customers (by ID):")
print(unique_customers)
# Remove duplicates based on multiple columns
unique_name_email = customer_data.drop_duplicates(subset=['name', 'email'])
print(f"\nUnique by name and email: {len(unique_name_email)} rows")
Advanced Duplicate Analysis
# Analyze duplicate patterns
def analyze_duplicates(df, subset=None):
    """Comprehensive duplicate analysis"""
    analysis = {}
    # Overall statistics
    total_rows = len(df)
    duplicate_mask = df.duplicated(subset=subset)
    duplicate_count = duplicate_mask.sum()
    analysis['total_rows'] = total_rows
    analysis['duplicate_rows'] = duplicate_count
    analysis['unique_rows'] = total_rows - duplicate_count
    analysis['duplicate_percentage'] = (duplicate_count / total_rows) * 100
    # Find groups of duplicates
    if subset:
        duplicate_groups = df[df.duplicated(subset=subset, keep=False)].groupby(subset).size()
    else:
        duplicate_groups = df[df.duplicated(keep=False)].groupby(list(df.columns)).size()
    analysis['duplicate_groups'] = duplicate_groups[duplicate_groups > 1]
    analysis['largest_duplicate_group'] = duplicate_groups.max() if len(duplicate_groups) > 0 else 0
    return analysis
# Analyze our customer data
customer_analysis = analyze_duplicates(customer_data)
print("=== Duplicate Analysis ===")
for key, value in customer_analysis.items():
    if key != 'duplicate_groups':
        print(f"{key}: {value}")
print("\nDuplicate groups:")
print(customer_analysis['duplicate_groups'])
# Analyze duplicates by specific columns
email_analysis = analyze_duplicates(customer_data, subset=['email'])
print(f"\nDuplicate emails: {email_analysis['duplicate_rows']}")
Handling Near-Duplicates
# Sample data with near-duplicates (slight variations)
messy_data = pd.DataFrame({
    'name': ['John Smith', 'john smith', 'John Smith', 'Jane Doe', 'JANE DOE'],
    'phone': ['555-1234', '5551234', '555-1234', '555-5678', '555-5678'],
    'email': ['john@email.com', 'john@email.COM', 'john@email.com',
              'jane@email.com', 'jane@Email.com'],
    'age': [30, 30, 30, 25, 25]
})
print("Messy data with near-duplicates:")
print(messy_data)
# Normalize data before duplicate detection
def normalize_data(df):
    """Normalize data for better duplicate detection"""
    df_normalized = df.copy()
    # Normalize text columns
    text_columns = df_normalized.select_dtypes(include=['object']).columns
    for col in text_columns:
        if col in ['name', 'email']:  # Specific handling for names and emails
            df_normalized[col] = (df_normalized[col]
                                  .str.lower()                            # Convert to lowercase
                                  .str.strip()                            # Remove leading/trailing spaces
                                  .str.replace(r'\s+', ' ', regex=True))  # Normalize spaces
        elif col == 'phone':  # Phone number normalization
            df_normalized[col] = (df_normalized[col]
                                  .str.replace(r'[^\d]', '', regex=True))  # Keep only digits
    return df_normalized
# Apply normalization
normalized_data = normalize_data(messy_data)
print("\nAfter normalization:")
print(normalized_data)
# Drop duplicates from the normalized data
normalized_unique = normalized_data.drop_duplicates()
print(f"\nUnique records after normalization: {len(normalized_unique)}")
Business Logic for Duplicate Resolution
# Transaction data with business rules for duplicates
transactions = pd.DataFrame({
    'transaction_id': ['T001', 'T002', 'T003', 'T002', 'T004', 'T001', 'T005'],
    'customer_id': [101, 102, 103, 102, 104, 101, 105],
    'amount': [100.00, 250.50, 75.25, 250.50, 180.00, 100.00, 90.75],
    'timestamp': pd.to_datetime([
        '2025-01-01 10:00:00', '2025-01-01 11:30:00', '2025-01-01 12:15:00',
        '2025-01-01 11:30:00', '2025-01-01 14:20:00', '2025-01-01 10:00:00',
        '2025-01-01 15:45:00'
    ]),
    'status': ['completed', 'completed', 'pending', 'completed', 'failed', 'completed', 'completed']
})
print("Transaction data:")
print(transactions)
def smart_duplicate_removal(df):
    """Business-logic-based duplicate removal"""
    # Sort by timestamp to handle time-based logic
    df_sorted = df.sort_values('timestamp')
    # Custom logic: for transaction duplicates, keep the one with the best status
    status_priority = {'completed': 1, 'pending': 2, 'failed': 3}
    df_sorted['status_priority'] = df_sorted['status'].map(status_priority)
    # Keep the transaction with highest priority (lowest number)
    df_cleaned = (df_sorted
                  .sort_values(['transaction_id', 'status_priority'])
                  .drop_duplicates(subset=['transaction_id'], keep='first')
                  .drop('status_priority', axis=1)
                  .sort_values('timestamp'))
    return df_cleaned
# Apply smart duplicate removal
smart_cleaned = smart_duplicate_removal(transactions)
print("\nAfter smart duplicate removal:")
print(smart_cleaned)
print(f"Transactions removed: {len(transactions) - len(smart_cleaned)}")
Memory-Efficient Duplicate Handling
# For large datasets, memory-efficient approach
def memory_efficient_dedup(df, chunk_size=10000):
    """Handle duplicates in large datasets efficiently"""
    if len(df) <= chunk_size:
        return df.drop_duplicates()
    # Process in chunks and maintain seen records
    seen_hashes = set()
    unique_chunks = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        # Create a hash for each row (for faster comparison)
        chunk_hashes = pd.util.hash_pandas_object(chunk, index=False)
        # Keep only rows not seen in earlier chunks and not repeated within this chunk
        mask = ~chunk_hashes.isin(seen_hashes) & ~chunk_hashes.duplicated()
        unique_chunk = chunk[mask]
        # Update seen hashes
        seen_hashes.update(chunk_hashes[mask])
        unique_chunks.append(unique_chunk)
    return pd.concat(unique_chunks, ignore_index=True)
# Simulate large dataset
np.random.seed(42)
large_data = pd.DataFrame({
    'id': np.random.randint(1, 1000, 50000),
    'value': np.random.randn(50000),
    'category': np.random.choice(['A', 'B', 'C'], 50000)
})
# Add some intentional duplicates
duplicate_indices = np.random.choice(large_data.index, 5000, replace=False)
large_data = pd.concat([large_data, large_data.loc[duplicate_indices]])
print(f"Large dataset: {len(large_data)} rows")
efficient_cleaned = memory_efficient_dedup(large_data)
print(f"After efficient deduplication: {len(efficient_cleaned)} rows")
Duplicate Validation and Reporting
def create_duplicate_report(df, subset=None):
    """Generate comprehensive duplicate report"""
    report = {
        'summary': {},
        'duplicate_groups': [],
        'recommendations': []
    }
    # Basic statistics
    total_rows = len(df)
    duplicates = df.duplicated(subset=subset, keep=False)
    duplicate_count = duplicates.sum()
    report['summary'] = {
        'total_rows': total_rows,
        'duplicate_rows': duplicate_count,
        'unique_rows': total_rows - duplicate_count,
        'duplicate_percentage': round((duplicate_count / total_rows) * 100, 2)
    }
    # Find duplicate groups
    if duplicate_count > 0:
        if subset:
            groups = df[duplicates].groupby(subset)
        else:
            groups = df[duplicates].groupby(list(df.columns))
        for name, group in groups:
            if len(group) > 1:
                report['duplicate_groups'].append({
                    'key': name if isinstance(name, tuple) else (name,),
                    'count': len(group),
                    'indices': group.index.tolist()
                })
    # Recommendations
    if duplicate_count == 0:
        report['recommendations'].append("✅ No duplicates found. Data is clean.")
    elif duplicate_count < total_rows * 0.05:  # Less than 5%
        report['recommendations'].append("⚠️ Small number of duplicates. Review and remove.")
    else:
        report['recommendations'].append("🚨 High duplicate rate. Investigate data source quality.")
    return report
# Generate report for customer data
duplicate_report = create_duplicate_report(customer_data)
print("=== DUPLICATE REPORT ===")
print(f"Summary: {duplicate_report['summary']}")
print(f"Duplicate groups found: {len(duplicate_report['duplicate_groups'])}")
for rec in duplicate_report['recommendations']:
    print(rec)
# Validate cleaning results
cleaned_report = create_duplicate_report(cleaned_data)
print(f"\nAfter cleaning: {cleaned_report['summary']}")
Best Practices for Duplicate Handling
- Always analyze before removing - understand your duplicate patterns
- Consider business logic - not all duplicates should be removed the same way
- Document your decisions - track which duplicate removal strategy you used (see the sketch after this list)
- Validate results - ensure important data wasn't lost
- Handle near-duplicates - normalize data before duplicate detection
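As one concrete way to apply the "document your decisions" and "validate results" points above, the sketch below wraps deduplication so every run records what was removed (the wrapper name and audit format are illustrative, not a pandas API):
def dedup_with_audit(df, subset=None, keep='first'):
    """Drop duplicates and return both the result and a small audit record."""
    before = len(df)
    result = df.drop_duplicates(subset=subset, keep=keep)
    audit = {
        'rows_before': before,
        'rows_after': len(result),
        'rows_removed': before - len(result),
        'subset': subset,
        'keep': keep,
    }
    return result, audit

clean_customers, audit = dedup_with_audit(customer_data, subset=['customer_id'])
print(audit)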
Master Data Quality
Explore advanced data validation techniques, learn data profiling methods, and discover automated data cleaning pipelines.