How to Handle Duplicate Rows in DataFrames

Clean messy datasets by identifying, analyzing, and strategically removing duplicate rows with pandas' powerful deduplication tools.

Data Quality Through Deduplication

Duplicate rows silently corrupt analysis results. Master pandas' duplicate handling functions to ensure data integrity and accurate insights from your datasets.

Identifying Duplicate Rows

import pandas as pd
import numpy as np

# Sample data with duplicates
customer_data = pd.DataFrame({
    'customer_id': [1, 2, 3, 2, 4, 1, 5],
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Bob Smith', 
             'Diana Prince', 'Alice Johnson', 'Eve Wilson'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 
              'bob@email.com', 'diana@email.com', 'alice@email.com', 'eve@email.com'],
    'age': [28, 35, 42, 35, 31, 28, 29],
    'city': ['NYC', 'LA', 'Chicago', 'LA', 'Boston', 'NYC', 'Seattle']
})

print("Original data:")
print(customer_data)

# Check for duplicate rows
print(f"\nTotal rows: {len(customer_data)}")
print(f"Duplicate rows: {customer_data.duplicated().sum()}")
print(f"Unique rows: {len(customer_data) - customer_data.duplicated().sum()}")

# Show which rows are duplicates
print("\nDuplicate row indicators:")
print(customer_data.duplicated())  # True = duplicate

# Display actual duplicate rows
duplicate_rows = customer_data[customer_data.duplicated()]
print("\nActual duplicate rows:")
print(duplicate_rows)
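
The default keep='first' flags only the second and later copies. To review every member of each duplicate group side by side, pass keep=False and sort the result - a useful sanity check before deleting anything:

# Show all rows that belong to a duplicate group, grouped together
all_duplicate_members = customer_data[customer_data.duplicated(keep=False)]
print("\nAll members of duplicate groups:")
print(all_duplicate_members.sort_values('customer_id'))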

Strategic Duplicate Removal

# Remove exact duplicates (keep first occurrence)
cleaned_data = customer_data.drop_duplicates()
print("After removing exact duplicates:")
print(cleaned_data)
print(f"Rows removed: {len(customer_data) - len(cleaned_data)}")

# Keep last occurrence instead of first
keep_last = customer_data.drop_duplicates(keep='last')
print("\nKeeping last occurrence:")
print(keep_last)

# Drop every row that has a duplicate (keep=False retains none of them)
keep_none = customer_data.drop_duplicates(keep=False)
print(f"\nKeeping no duplicates: {len(keep_none)} rows")

# Remove duplicates based on specific columns
unique_customers = customer_data.drop_duplicates(subset=['customer_id'])
print("\nUnique customers (by ID):")
print(unique_customers)

# Remove duplicates based on multiple columns
unique_name_email = customer_data.drop_duplicates(subset=['name', 'email'])
print(f"\nUnique by name and email: {len(unique_name_email)} rows")

Advanced Duplicate Analysis

# Analyze duplicate patterns
def analyze_duplicates(df, subset=None):
    """Comprehensive duplicate analysis"""
    analysis = {}
    
    # Overall statistics
    total_rows = len(df)
    duplicate_mask = df.duplicated(subset=subset)
    duplicate_count = duplicate_mask.sum()
    
    analysis['total_rows'] = total_rows
    analysis['duplicate_rows'] = duplicate_count
    analysis['unique_rows'] = total_rows - duplicate_count
    analysis['duplicate_percentage'] = (duplicate_count / total_rows) * 100
    
    # Find groups of duplicates
    if subset:
        duplicate_groups = df[df.duplicated(subset=subset, keep=False)].groupby(subset).size()
    else:
        duplicate_groups = df[df.duplicated(keep=False)].groupby(list(df.columns)).size()
    
    analysis['duplicate_groups'] = duplicate_groups[duplicate_groups > 1]
    analysis['largest_duplicate_group'] = duplicate_groups.max() if len(duplicate_groups) > 0 else 0
    
    return analysis

# Analyze our customer data
customer_analysis = analyze_duplicates(customer_data)
print("=== Duplicate Analysis ===")
for key, value in customer_analysis.items():
    if key != 'duplicate_groups':
        print(f"{key}: {value}")

print("\nDuplicate groups:")
print(customer_analysis['duplicate_groups'])

# Analyze duplicates by specific columns
email_analysis = analyze_duplicates(customer_data, subset=['email'])
print(f"\nDuplicate emails: {email_analysis['duplicate_rows']}")

Handling Near-Duplicates

# Sample data with near-duplicates (slight variations)
messy_data = pd.DataFrame({
    'name': ['John Smith', 'john smith', 'John  Smith', 'Jane Doe', 'JANE DOE'],
    'phone': ['555-1234', '5551234', '555-1234', '555-5678', '555-5678'],
    'email': ['john@email.com', 'john@email.COM', 'john@email.com', 
              'jane@email.com', 'jane@Email.com'],
    'age': [30, 30, 30, 25, 25]
})

print("Messy data with near-duplicates:")
print(messy_data)

# Normalize data before duplicate detection
def normalize_data(df):
    """Normalize data for better duplicate detection"""
    df_normalized = df.copy()
    
    # Normalize text columns
    text_columns = df_normalized.select_dtypes(include=['object']).columns
    for col in text_columns:
        if col in ['name', 'email']:  # Specific handling for names and emails
            df_normalized[col] = (df_normalized[col]
                                 .str.lower()          # Convert to lowercase
                                 .str.strip()          # Remove leading/trailing spaces
                                 .str.replace(r'\s+', ' ', regex=True))  # Normalize spaces
        elif col == 'phone':  # Phone number normalization
            df_normalized[col] = (df_normalized[col]
                                 .str.replace(r'[^\d]', '', regex=True))  # Keep only digits
    
    return df_normalized

# Apply normalization
normalized_data = normalize_data(messy_data)
print("\nAfter normalization:")
print(normalized_data)

# Deduplicate the normalized data
normalized_unique = normalized_data.drop_duplicates()
print(f"\nUnique records after normalization: {len(normalized_unique)}")

Business Logic for Duplicate Resolution

# Transaction data with business rules for duplicates
transactions = pd.DataFrame({
    'transaction_id': ['T001', 'T002', 'T003', 'T002', 'T004', 'T001', 'T005'],
    'customer_id': [101, 102, 103, 102, 104, 101, 105],
    'amount': [100.00, 250.50, 75.25, 250.50, 180.00, 100.00, 90.75],
    'timestamp': pd.to_datetime([
        '2025-01-01 10:00:00', '2025-01-01 11:30:00', '2025-01-01 12:15:00',
        '2025-01-01 11:30:00', '2025-01-01 14:20:00', '2025-01-01 10:00:00',
        '2025-01-01 15:45:00'
    ]),
    'status': ['completed', 'completed', 'pending', 'completed', 'failed', 'completed', 'completed']
})

print("Transaction data:")
print(transactions)

def smart_duplicate_removal(df):
    """Business-logic-based duplicate removal"""
    
    # Work on a copy so the caller's DataFrame is left untouched
    df = df.copy()
    
    # Custom logic: for duplicate transactions, keep the one with the best status
    status_priority = {'completed': 1, 'pending': 2, 'failed': 3}
    df['status_priority'] = df['status'].map(status_priority)
    
    # Keep the highest-priority (lowest number) row per transaction_id;
    # ties are broken by the earliest timestamp
    df_cleaned = (df
                  .sort_values(['transaction_id', 'status_priority', 'timestamp'])
                  .drop_duplicates(subset=['transaction_id'], keep='first')
                  .drop(columns='status_priority')
                  .sort_values('timestamp'))
    
    return df_cleaned

# Apply smart duplicate removal
smart_cleaned = smart_duplicate_removal(transactions)
print("\nAfter smart duplicate removal:")
print(smart_cleaned)
print(f"Transactions removed: {len(transactions) - len(smart_cleaned)}")

Memory-Efficient Duplicate Handling

# For large datasets, memory-efficient approach
def memory_efficient_dedup(df, chunk_size=10000):
    """Handle duplicates in large datasets efficiently"""
    
    if len(df) <= chunk_size:
        return df.drop_duplicates()
    
    # Process in chunks and maintain seen records
    seen_hashes = set()
    unique_chunks = []
    
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        
        # Create hash for each row (for faster comparison)
        chunk_hashes = pd.util.hash_pandas_object(chunk, index=False)
        
        # Keep only unseen rows
        mask = ~chunk_hashes.isin(seen_hashes)
        unique_chunk = chunk[mask]
        
        # Update seen hashes
        seen_hashes.update(chunk_hashes[mask])
        unique_chunks.append(unique_chunk)
    
    return pd.concat(unique_chunks, ignore_index=True)

# Simulate large dataset
np.random.seed(42)
large_data = pd.DataFrame({
    'id': np.random.randint(1, 1000, 50000),
    'value': np.random.randn(50000),
    'category': np.random.choice(['A', 'B', 'C'], 50000)
})

# Add some intentional duplicates
duplicate_indices = np.random.choice(large_data.index, 5000, replace=False)
large_data = pd.concat([large_data, large_data.loc[duplicate_indices]])

print(f"Large dataset: {len(large_data)} rows")
efficient_cleaned = memory_efficient_dedup(large_data)
print(f"After efficient deduplication: {len(efficient_cleaned)} rows")

Duplicate Validation and Reporting

def create_duplicate_report(df, subset=None):
    """Generate comprehensive duplicate report"""
    
    report = {
        'summary': {},
        'duplicate_groups': [],
        'recommendations': []
    }
    
    # Basic statistics (keep=False flags every member of a duplicate group,
    # not just the extra copies)
    total_rows = len(df)
    duplicates = df.duplicated(subset=subset, keep=False)
    duplicate_count = duplicates.sum()
    
    report['summary'] = {
        'total_rows': total_rows,
        'duplicate_rows': duplicate_count,
        'unique_rows': total_rows - duplicate_count,
        'duplicate_percentage': round((duplicate_count / total_rows) * 100, 2)
    }
    
    # Find duplicate groups
    if duplicate_count > 0:
        if subset:
            groups = df[duplicates].groupby(subset)
        else:
            groups = df[duplicates].groupby(list(df.columns))
        
        for name, group in groups:
            if len(group) > 1:
                report['duplicate_groups'].append({
                    'key': name if isinstance(name, tuple) else (name,),
                    'count': len(group),
                    'indices': group.index.tolist()
                })
    
    # Recommendations
    if duplicate_count == 0:
        report['recommendations'].append("✅ No duplicates found. Data is clean.")
    elif duplicate_count < total_rows * 0.05:  # Less than 5%
        report['recommendations'].append("⚠️  Small number of duplicates. Review and remove.")
    else:
        report['recommendations'].append("🚨 High duplicate rate. Investigate data source quality.")
    
    return report

# Generate report for customer data
duplicate_report = create_duplicate_report(customer_data)
print("=== DUPLICATE REPORT ===")
print(f"Summary: {duplicate_report['summary']}")
print(f"Duplicate groups found: {len(duplicate_report['duplicate_groups'])}")
for rec in duplicate_report['recommendations']:
    print(rec)

# Validate cleaning results
cleaned_report = create_duplicate_report(cleaned_data)
print(f"\nAfter cleaning: {cleaned_report['summary']}")

Best Practices for Duplicate Handling

  1. Always analyze before removing - understand your duplicate patterns
  2. Consider business logic - not all duplicates should be removed the same way
  3. Document your decisions - track what duplicate removal strategy you used
  4. Validate results - ensure important data wasn't lost (see the sketch after this list)
  5. Handle near-duplicates - normalize data before duplicate detection
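
A minimal sketch tying points 3 and 4 together - record what was removed and assert that the deduplication key is unique afterwards (column names follow the customer example above):

def dedup_with_audit(df, subset, keep='first'):
    """Drop duplicates and return both the result and a small audit record."""
    removed = df[df.duplicated(subset=subset, keep=keep)]
    result = df.drop_duplicates(subset=subset, keep=keep)
    
    # Validate: the deduplication key must now be unique
    assert not result.duplicated(subset=subset).any(), "key still not unique"
    
    audit = {
        'strategy': f"drop_duplicates(subset={subset}, keep='{keep}')",
        'rows_before': len(df),
        'rows_after': len(result),
        'removed_indices': removed.index.tolist(),
    }
    return result, audit

clean_customers, audit_log = dedup_with_audit(customer_data, subset=['customer_id'])
print(audit_log)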

Master Data Quality

Explore advanced data validation techniques, learn data profiling methods, and discover automated data cleaning pipelines.
