Navigation

Python

How to Use loc vs iloc for Data Selection

Master pandas' most powerful data selection tools - loc for label-based selection and iloc for position-based indexing.

Table Of Contents

Precision in Data Selection

Stop fumbling with data selection. Understanding loc and iloc transforms you from a pandas novice to a data manipulation expert with surgical precision.

The Fundamental Difference

import pandas as pd
import numpy as np

# Example data: five employees keyed by a string employee ID. String IDs as
# the index keep label lookups and positional lookups clearly distinct.
employee_ids = ['emp001', 'emp002', 'emp003', 'emp004', 'emp005']
employee_columns = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 75000, 55000, 68000],
    'department': ['IT', 'HR', 'Engineering', 'IT', 'HR'],
}
df = pd.DataFrame(employee_columns, index=employee_ids)

print(df)

# loc selects by label: index values and column names.
print("\n=== loc Examples ===")
row_by_label = df.loc['emp001']              # whole row for one index label
value_by_label = df.loc['emp001', 'name']    # one cell: (row label, column label)
rows_by_label = df.loc['emp001':'emp003']    # label slice: the end label IS included
for shown in (row_by_label, value_by_label, rows_by_label):
    print(shown)

# iloc selects by integer position, ignoring the labels entirely.
print("\n=== iloc Examples ===")
row_by_position = df.iloc[0]                 # first row
value_by_position = df.iloc[0, 0]            # one cell: (row position, column position)
rows_by_position = df.iloc[0:3]              # positional slice: the end position is NOT included
for shown in (row_by_position, value_by_position, rows_by_position):
    print(shown)

Advanced Selection Patterns

# loc with explicit lists: one list of row labels, one list of column labels.
wanted_ids = ['emp001', 'emp003', 'emp005']
wanted_columns = ['name', 'salary']
selected_employees = df.loc[wanted_ids, wanted_columns]
print("Selected employees and salaries:")
print(selected_employees)

# The same kind of selection with iloc, addressed purely by integer position.
row_positions = [0, 2, 4]     # first, third and fifth rows
column_positions = [0, 2]     # first and third columns
selected_positions = df.iloc[row_positions, column_positions]
print("\nSame selection using positions:")
print(selected_positions)

# A boolean Series handed to loc keeps exactly the rows where it is True.
earns_over_60k = df['salary'] > 60000
high_earners = df.loc[earns_over_60k]
print("\nHigh earners (salary > 60000):")
print(high_earners)

# Several conditions are combined element-wise with `&`. Building each mask
# separately avoids the parentheses needed when they are written inline.
in_it = df['department'] == 'IT'
earns_over_50k = df['salary'] > 50000
it_high_earners = df.loc[in_it & earns_over_50k]
print("\nIT employees earning > 50000:")
print(it_high_earners)

Conditional Selection Mastery

# A row mask plus a column list in a single loc call filters rows and picks
# columns in one step.
under_30 = df['age'] < 30
young_employees = df.loc[under_30, ['name', 'age', 'department']]
print("Young employees:")
print(young_employees)

# Two conditions AND-ed together, again restricted to specific columns.
aged_30_plus = df['age'] >= 30
in_it_or_engineering = df['department'].isin(['IT', 'Engineering'])
complex_selection = df.loc[aged_30_plus & in_it_or_engineering, ['name', 'salary']]
print("\nSenior IT/Engineering employees:")
print(complex_selection)

# iloc is position-based, so the boolean mask is first turned into the integer
# positions of its True entries.
age_mask = df['age'] > 30
(positions,) = np.where(age_mask)  # one-argument np.where yields a 1-tuple for 1-D input
senior_employees = df.iloc[positions]
print("\nSenior employees using iloc:")
print(senior_employees)

Data Modification with loc and iloc

# Modify single values
# Work on a copy so the edits below leave the original `df` untouched.
df_modified = df.copy()

# Using loc to modify by label: (row label, column label)
df_modified.loc['emp001', 'salary'] = 55000
print(f"Alice's new salary: {df_modified.loc['emp001', 'salary']}")

# Using iloc to modify by position: (row position, column position)
df_modified.iloc[1, 2] = 65000  # Bob's salary (row 1, column 2)
print(f"Bob's new salary: {df_modified.iloc[1, 2]}")

# Bulk modifications
# The 10% raise produces fractional values, but 'salary' starts out as an
# integer column. Writing floats into an integer column through loc relies on
# silent upcasting, which pandas has deprecated (FutureWarning since 2.1,
# slated to become an error). Cast explicitly so the assignment is well-defined.
df_modified['salary'] = df_modified['salary'].astype(float)
df_modified.loc[df_modified['department'] == 'HR', 'salary'] *= 1.1  # 10% raise for HR
print("\nAfter HR salary increase:")
print(df_modified[df_modified['department'] == 'HR'])

# Add new column with conditional values
df_modified.loc[:, 'bonus'] = 0  # Initialize new column for every row
# Only rows whose (already raised) salary exceeds 60000 receive the bonus.
df_modified.loc[df_modified['salary'] > 60000, 'bonus'] = 5000
print("\nWith bonus column:")
print(df_modified)

Time Series and DateTime Indexing

# Build a daily time series: 100 consecutive days starting 2025-01-01.
dates = pd.date_range(start='2025-01-01', periods=100, freq='D')
price_path = np.random.randn(100).cumsum() + 100      # random walk starting near 100
daily_volume = np.random.randint(1000, 10000, 100)    # random integers in [1000, 10000)
ts_data = pd.DataFrame({'price': price_path, 'volume': daily_volume}, index=dates)

# With a DatetimeIndex, loc accepts date strings as labels; as with any label
# slice, both endpoints are included.
january = slice('2025-01-01', '2025-01-31')
january_data = ts_data.loc[january]
print(f"January data shape: {january_data.shape}")

# A single date string selects that day's row.
jan_15 = '2025-01-15'
specific_day = ts_data.loc[jan_15]
print(f"Data for Jan 15: {specific_day}")

# Negative positions count from the end, so this is the last 10 rows.
recent_10_days = ts_data.iloc[-10:]
print("Recent 10 days:")
print(recent_10_days.head())

# A step of 5 in a positional slice keeps rows 0, 5, 10, ...
every_5th = ts_data.iloc[::5]
print(f"Every 5th day (shape): {every_5th.shape}")

Performance Considerations

import time

# Create large DataFrame for performance testing
large_df = pd.DataFrame({
    'A': np.random.randn(100000),
    'B': np.random.randn(100000),
    'C': np.random.randn(100000)
})

# Performance comparison: loc vs iloc
# Elapsed time is measured with time.perf_counter(), the clock intended for
# short intervals: it is monotonic and uses the highest available resolution.
# time.time() is a wall clock that can be adjusted while the code runs and may
# be too coarse for operations this fast.
# NOTE: these are single-shot timings and therefore noisy; use the timeit
# module when a reliable comparison is needed.

# Select first 1000 rows
start = time.perf_counter()
loc_result = large_df.loc[:999]  # Label-based; end label 999 is included (default 0..n-1 index)
loc_time = time.perf_counter() - start

start = time.perf_counter()
iloc_result = large_df.iloc[:1000]  # Position-based; end position 1000 is excluded
iloc_time = time.perf_counter() - start

print(f"loc time: {loc_time:.6f}s")
print(f"iloc time: {iloc_time:.6f}s")
print(f"Results are equal: {loc_result.equals(iloc_result)}")

# Boolean indexing performance
start = time.perf_counter()
condition_result = large_df.loc[large_df['A'] > 0]
condition_time = time.perf_counter() - start
print(f"Boolean indexing time: {condition_time:.6f}s")

Common Pitfalls and Solutions

# Pitfall 1: the same slice bounds mean different things to loc and iloc.
df_sample = pd.DataFrame({'values': range(10)})

print("loc slice [2:5] (inclusive end):")
by_label = df_sample.loc[2:5]        # labels 2, 3, 4 and 5
print(by_label)

print("\niloc slice [2:5] (exclusive end):")
by_position = df_sample.iloc[2:5]    # positions 2, 3 and 4 only
print(by_position)

# Pitfall 2: chained indexing when assigning (triggers SettingWithCopyWarning)
# Bad:  df[df['age'] > 30]['salary'] = new_value   # two separate indexing steps
# Good: one loc call that names the row mask and the column together
df_fixed = df.copy()
over_30 = df_fixed['age'] > 30
df_fixed.loc[over_30, 'salary'] = 70000

# Pitfall 3: an index that mixes string and integer labels.
mixed_labels = ['a', 1, 'c', 2, 'e']
mixed_index_df = pd.DataFrame({'data': [1, 2, 3, 4, 5]}, index=mixed_labels)

# loc always matches against the labels, whatever their type: here the
# integer 1 is a label (second row), not a position.
print("Mixed index selection:")
for label in ('a', 1):
    print(mixed_index_df.loc[label])

# iloc ignores the labels and counts rows from zero.
for position in (0, 1):
    print(mixed_index_df.iloc[position])

Advanced Selection Techniques

# Multi-level indexing: rows are keyed by (Quarter, Month).
multi_df = pd.DataFrame({
    'sales': [100, 150, 200, 120, 180, 160],
    'costs': [60, 90, 120, 70, 100, 95]
}, index=pd.MultiIndex.from_tuples([
    ('Q1', 'Jan'), ('Q1', 'Feb'), ('Q1', 'Mar'),
    ('Q2', 'Apr'), ('Q2', 'May'), ('Q2', 'Jun')
], names=['Quarter', 'Month']))

# Select Q1 data: a single outer-level label returns every row under it,
# indexed by the remaining level (Month).
q1_data = multi_df.loc['Q1']
print("Q1 Data:")
print(q1_data)

# Select specific month across quarters
# feb_data = multi_df.loc[(slice(None), 'Feb'), :]  # All quarters, Feb only

# Callable selection (advanced)
def high_sales_selector(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask marking rows with above-median sales.

    Args:
        df: DataFrame containing a numeric 'sales' column.

    Returns:
        Boolean Series aligned with ``df``'s index; True where the row's
        'sales' value is strictly greater than the median of the column.

    Raises:
        KeyError: If ``df`` has no 'sales' column.
    """
    return df['sales'] > df['sales'].median()

# loc calls the function with the frame being indexed and uses the returned
# mask. The selector reads 'sales', so it must be applied to multi_df; the
# employee table `df` has no such column and would raise KeyError.
high_sales = multi_df.loc[high_sales_selector]
print("\nHigh sales months:")
print(high_sales)

Best Practices Summary

  • Use loc for label-based selection and when working with boolean conditions
  • Use iloc for position-based selection and when you need specific row/column positions
  • Avoid chained indexing - use loc/iloc for modifications
  • loc slices include the end label; iloc slices exclude the end position (like standard Python slicing)
  • loc is generally more readable for data analysis tasks

Master Advanced Pandas

Dive into advanced pandas operations, explore multi-index DataFrames, and learn performance optimization techniques.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python