Table Of Contents
- Precision in Data Selection
- The Fundamental Difference
- Advanced Selection Patterns
- Conditional Selection Mastery
- Data Modification with loc and iloc
- Time Series and DateTime Indexing
- Performance Considerations
- Common Pitfalls and Solutions
- Advanced Selection Techniques
- Best Practices Summary
- Master Advanced Pandas
Precision in Data Selection
Stop fumbling with data selection. Once you understand the difference between loc (label-based) and iloc (position-based), you can select and modify exactly the rows and columns you intend, with surgical precision.
The Fundamental Difference
import pandas as pd
import numpy as np
# Sample DataFrame: five employees keyed by string employee IDs, so that
# index labels ('emp001') and integer positions (0) are visibly different.
employee_ids = ['emp001', 'emp002', 'emp003', 'emp004', 'emp005']
employee_columns = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 75000, 55000, 68000],
    'department': ['IT', 'HR', 'Engineering', 'IT', 'HR'],
}
df = pd.DataFrame(employee_columns, index=employee_ids)
print(df)

# loc selects by label: index labels for rows, column names for columns.
print("\n=== loc Examples ===")
row_by_label = df.loc['emp001']              # one row, addressed by its index label
print(row_by_label)
value_by_label = df.loc['emp001', 'name']    # one cell, addressed by (row label, column name)
print(value_by_label)
rows_by_label = df.loc['emp001':'emp003']    # label slice: the end label IS included
print(rows_by_label)

# iloc selects by integer position, ignoring the labels entirely.
print("\n=== iloc Examples ===")
row_by_position = df.iloc[0]                 # first row
print(row_by_position)
value_by_position = df.iloc[0, 0]            # first row, first column
print(value_by_position)
rows_by_position = df.iloc[:3]               # positional slice: the end position is NOT included
print(rows_by_position)
Advanced Selection Patterns
# Several rows and columns at once with loc: pass one list of labels per axis.
wanted_ids = ['emp001', 'emp003', 'emp005']
wanted_columns = ['name', 'salary']
selected_employees = df.loc[wanted_ids, wanted_columns]
print("Selected employees and salaries:")
print(selected_employees)

# The same selection with iloc: pass one list of integer positions per axis
# (first, third, fifth rows; first and third columns).
row_positions = [0, 2, 4]
column_positions = [0, 2]
selected_positions = df.iloc[row_positions, column_positions]
print("\nSame selection using positions:")
print(selected_positions)

# Boolean indexing with loc: a boolean Series keeps the rows where it is True.
earns_over_60k = df['salary'] > 60000
high_earners = df.loc[earns_over_60k]
print("\nHigh earners (salary > 60000):")
print(high_earners)

# Combine masks element-wise with & (each comparison builds its own mask).
in_it = df['department'] == 'IT'
earns_over_50k = df['salary'] > 50000
it_high_earners = df.loc[in_it & earns_over_50k]
print("\nIT employees earning > 50000:")
print(it_high_earners)
Conditional Selection Mastery
# Filter rows with a mask and pick columns in the same loc call.
is_under_30 = df['age'] < 30
young_employees = df.loc[is_under_30, ['name', 'age', 'department']]
print("Young employees:")
print(young_employees)

# Two conditions combined, returning only the name and salary columns.
is_30_or_older = df['age'] >= 30
in_it_or_engineering = df['department'].isin(['IT', 'Engineering'])
complex_selection = df.loc[is_30_or_older & in_it_or_engineering, ['name', 'salary']]
print("\nSenior IT/Engineering employees:")
print(complex_selection)

# iloc works on positions, so turn the boolean mask into the integer
# positions where it is True before indexing.
age_mask = df['age'] > 30
(positions,) = np.where(age_mask)  # np.where returns a 1-tuple for a 1-D condition
senior_employees = df.iloc[positions]
print("\nSenior employees using iloc:")
print(senior_employees)
Data Modification with loc and iloc
# Modify single values on a copy, so the original df is left untouched.
df_modified = df.copy()

# Using loc to modify by label
df_modified.loc['emp001', 'salary'] = 55000
print(f"Alice's new salary: {df_modified.loc['emp001', 'salary']}")

# Using iloc to modify by position
df_modified.iloc[1, 2] = 65000  # Bob's salary (row 1, column 2)
print(f"Bob's new salary: {df_modified.iloc[1, 2]}")

# Bulk modifications
# 'salary' holds integers, but a 10% raise produces fractional values.
# Convert the column to float explicitly first: writing float values into an
# integer column through loc relies on an implicit upcast that pandas has
# deprecated (FutureWarning in 2.1+, an error in later releases).
df_modified['salary'] = df_modified['salary'].astype(float)
hr_rows = df_modified['department'] == 'HR'
df_modified.loc[hr_rows, 'salary'] *= 1.1  # 10% raise for HR
print("\nAfter HR salary increase:")
print(df_modified[hr_rows])

# Add new column with conditional values
df_modified.loc[:, 'bonus'] = 0  # Initialize new column for every row
df_modified.loc[df_modified['salary'] > 60000, 'bonus'] = 5000
print("\nWith bonus column:")
print(df_modified)
Time Series and DateTime Indexing
# Build 100 consecutive days of synthetic data, indexed by date.
n_days = 100
dates = pd.date_range('2025-01-01', periods=n_days, freq='D')
price_walk = np.random.randn(n_days).cumsum() + 100   # random walk around 100
daily_volume = np.random.randint(1000, 10000, n_days)
ts_data = pd.DataFrame({'price': price_walk, 'volume': daily_volume}, index=dates)

# loc accepts date strings on a DatetimeIndex; both slice ends are included.
january_data = ts_data.loc['2025-01-01':'2025-01-31']
print(f"January data shape: {january_data.shape}")

# A single date string selects that day's row.
specific_day = ts_data.loc['2025-01-15']
print(f"Data for Jan 15: {specific_day}")

# iloc with a negative start counts from the end: the last 10 rows.
recent_10_days = ts_data.iloc[-10:]
print("Recent 10 days:")
print(recent_10_days.head())

# iloc with a step: every 5th row, starting from the first.
every_5th = ts_data.iloc[::5]
print(f"Every 5th day (shape): {every_5th.shape}")
Performance Considerations
import time
# Create large DataFrame for performance testing
n_rows = 100000
large_df = pd.DataFrame({
    'A': np.random.randn(n_rows),
    'B': np.random.randn(n_rows),
    'C': np.random.randn(n_rows),
})

# Performance comparison: loc vs iloc
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations, whereas
# time.time() is wall-clock time and can be too coarse for operations this fast.
# These are single-run timings, so treat them as rough indications only.

# Select first 1000 rows
start = time.perf_counter()
loc_result = large_df.loc[:999]  # Label-based (if index is 0,1,2...); end label included
loc_time = time.perf_counter() - start

start = time.perf_counter()
iloc_result = large_df.iloc[:1000]  # Position-based; end position excluded
iloc_time = time.perf_counter() - start

print(f"loc time: {loc_time:.6f}s")
print(f"iloc time: {iloc_time:.6f}s")
print(f"Results are equal: {loc_result.equals(iloc_result)}")

# Boolean indexing performance
start = time.perf_counter()
condition_result = large_df.loc[large_df['A'] > 0]
condition_time = time.perf_counter() - start
print(f"Boolean indexing time: {condition_time:.6f}s")
Common Pitfalls and Solutions
# Pitfall 1: loc and iloc treat the end of a slice differently.
# With a default 0..9 index the labels equal the positions, so the same
# slice [2:5] shows the difference directly.
df_sample = pd.DataFrame({'values': range(10)})
label_slice = df_sample.loc[2:5]        # labels 2, 3, 4 and 5
position_slice = df_sample.iloc[2:5]    # positions 2, 3 and 4 only
print("loc slice [2:5] (inclusive end):")
print(label_slice)
print("\niloc slice [2:5] (exclusive end):")
print(position_slice)

# Pitfall 2: chained indexing (causes SettingWithCopyWarning)
# Bad:  df[df['age'] > 30]['salary'] = new_value   # two separate indexing steps
# Good: one loc call that names the rows and the column together
df_fixed = df.copy()
older_than_30 = df_fixed['age'] > 30
df_fixed.loc[older_than_30, 'salary'] = 70000

# Pitfall 3: an index that mixes string and integer labels
mixed_labels = ['a', 1, 'c', 2, 'e']
mixed_index_df = pd.DataFrame({'data': [1, 2, 3, 4, 5]}, index=mixed_labels)

# loc always means "label", whatever the label's type is
print("Mixed index selection:")
print(mixed_index_df.loc['a'])  # the row labelled 'a'
print(mixed_index_df.loc[1])    # the row labelled 1 (a label, not a position)

# iloc always means "position", regardless of the index contents
print(mixed_index_df.iloc[0])   # first row
print(mixed_index_df.iloc[1])   # second row
Advanced Selection Techniques
# Multi-level indexing: monthly sales and costs, indexed by (Quarter, Month).
multi_df = pd.DataFrame(
    {
        'sales': [100, 150, 200, 120, 180, 160],
        'costs': [60, 90, 120, 70, 100, 95],
    },
    index=pd.MultiIndex.from_tuples(
        [
            ('Q1', 'Jan'), ('Q1', 'Feb'), ('Q1', 'Mar'),
            ('Q2', 'Apr'), ('Q2', 'May'), ('Q2', 'Jun'),
        ],
        names=['Quarter', 'Month'],
    ),
)

# Select Q1 data: a label from the outer level returns the rows beneath it,
# indexed by the remaining level (Month).
q1_data = multi_df.loc['Q1']
print("Q1 Data:")
print(q1_data)

# Select specific month across quarters
# feb_data = multi_df.loc[(slice(None), 'Feb'), :] # All quarters, Feb only


# Callable selection (advanced)
def high_sales_selector(df):
    """Return a boolean mask of the rows whose sales exceed the median.

    Args:
        df: DataFrame with a numeric 'sales' column. When the function is
            passed to ``.loc``, pandas calls it with the frame being indexed.

    Returns:
        Boolean Series aligned with ``df``: True where 'sales' is strictly
        greater than the median of the 'sales' column.
    """
    return df['sales'] > df['sales'].median()


# The selector reads a 'sales' column, so it must be applied to multi_df;
# the employee DataFrame has no such column and would raise a KeyError.
high_sales = multi_df.loc[high_sales_selector]
print("\nHigh sales months:")
print(high_sales)
Best Practices Summary
- Use loc for label-based selection and when working with boolean conditions
- Use iloc for position-based selection and when you need specific row/column positions
- Avoid chained indexing - use loc/iloc for modifications
- loc slices include the end label (inclusive), while iloc slices stop before the end position (exclusive)
- loc is generally more readable for data analysis tasks
Master Advanced Pandas
Dive into advanced pandas operations, explore multi-index DataFrames, and learn performance optimization techniques.
Share this article
Add Comment
No comments yet. Be the first to comment!