Table Of Contents
- Query Like a Database Pro
- Query Method Fundamentals
- Advanced Query Expressions
- Variable Substitution in Queries
- Performance Comparison: query() vs Boolean Indexing
- Real-World Business Queries
- Query with Time Series Data
- Advanced Query Techniques
- Query Best Practices and Tips
- Master Advanced Data Filtering
Query Like a Database Pro
Tired of complex boolean expressions cluttering your code? Pandas' query() method brings SQL-like syntax to DataFrame filtering, making your data exploration both readable and efficient.
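As a quick taste, here is a minimal sketch on a tiny, made-up DataFrame (not part of the examples below) showing the same filter written both ways:
import pandas as pd
# Tiny illustrative DataFrame (hypothetical data)
df = pd.DataFrame({'salary': [72000, 88000, 95000], 'age': [41, 29, 33]})
high_paid_young = df[(df['salary'] > 80000) & (df['age'] < 35)]  # boolean indexing
high_paid_young = df.query('salary > 80000 and age < 35')        # query(): reads like SQL
print(high_paid_young)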
Query Method Fundamentals
import pandas as pd
import numpy as np
# Create sample dataset
employees = pd.DataFrame({
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince', 'Eve Wilson'],
    'age': [28, 35, 42, 31, 29],
    'salary': [75000, 85000, 95000, 70000, 80000],
    'department': ['Engineering', 'Marketing', 'Engineering', 'HR', 'Marketing'],
    'experience': [3, 8, 15, 5, 4],
    'bonus': [5000, 7000, 12000, 4000, 6000],
    'city': ['NYC', 'LA', 'Chicago', 'NYC', 'SF']
})
print("Original employee data:")
print(employees)
# Basic query examples
print("\n=== Basic Query Examples ===")
# Simple condition
high_earners = employees.query('salary > 80000')
print("\nEmployees earning > $80,000:")
print(high_earners[['name', 'salary']])
# Multiple conditions with AND
senior_engineers = employees.query('department == "Engineering" and experience > 10')
print("\nSenior engineers:")
print(senior_engineers[['name', 'experience', 'department']])
# Multiple conditions with OR
young_or_experienced = employees.query('age < 30 or experience > 10')
print("\nYoung or very experienced employees:")
print(young_or_experienced[['name', 'age', 'experience']])
Advanced Query Expressions
# Complex logical combinations
complex_query = employees.query(
    '(salary > 75000 and age < 35) or (department == "Engineering" and experience > 5)'
)
print("\nComplex query result:")
print(complex_query[['name', 'salary', 'age', 'department', 'experience']])
# Using mathematical operations in queries
efficiency_query = employees.query('salary / experience > 15000')
print("\nHigh salary-to-experience ratio:")
print(efficiency_query[['name', 'salary', 'experience']])
# String operations in queries
name_query = employees.query('name.str.contains("o")', engine='python')
print("\nEmployees with 'o' in their name:")
print(name_query[['name']])
# Range queries
mid_career = employees.query('5 <= experience <= 10')
print("\nMid-career employees (5-10 years experience):")
print(mid_career[['name', 'experience']])
# Negation queries
not_engineering = employees.query('department != "Engineering"')
print("\nNon-engineering employees:")
print(not_engineering[['name', 'department']])
Variable Substitution in Queries
# Using variables in queries
min_salary = 80000
target_department = 'Marketing'
max_age = 35
# Method 1: Using @ symbol for variables
variable_query1 = employees.query('salary >= @min_salary and department == @target_department')
print(f"\n{target_department} employees earning >= ${min_salary:,}:")
print(variable_query1[['name', 'salary', 'department']])
# Method 2: Using f-strings (be careful with injection)
variable_query2 = employees.query(f'age <= {max_age} and salary > {min_salary}')
print(f"\nEmployees aged <= {max_age} with salary > ${min_salary:,}:")
print(variable_query2[['name', 'age', 'salary']])
# Using lists with isin()
target_cities = ['NYC', 'SF']
city_query = employees.query('city in @target_cities')
print(f"\nEmployees in {target_cities}:")
print(city_query[['name', 'city']])
# Dynamic query building
def build_employee_query(min_salary=None, departments=None, max_age=None):
    """Build a dynamic query string based on the supplied parameters."""
    conditions = []
    if min_salary is not None:
        conditions.append(f'salary >= {min_salary}')
    if departments:
        dept_list = "', '".join(departments)
        conditions.append(f"department in ['{dept_list}']")
    if max_age is not None:
        conditions.append(f'age <= {max_age}')
    if not conditions:
        return employees  # Return all rows if no conditions were given
    query_string = ' and '.join(conditions)
    print(f"Generated query: {query_string}")
    return employees.query(query_string)
# Test dynamic query
dynamic_result = build_employee_query(min_salary=75000, departments=['Engineering', 'Marketing'])
print("\nDynamic query result:")
print(dynamic_result[['name', 'salary', 'department']])
Performance Comparison: query() vs Boolean Indexing
import time
# Create larger dataset for performance testing
np.random.seed(42)
large_employees = pd.DataFrame({
    'employee_id': range(100000),
    'age': np.random.randint(22, 65, 100000),
    'salary': np.random.randint(40000, 150000, 100000),
    'department': np.random.choice(['Engineering', 'Marketing', 'HR', 'Finance'], 100000),
    'experience': np.random.randint(0, 30, 100000),
    'performance_score': np.random.uniform(1, 5, 100000)
})
print(f"Large dataset created: {len(large_employees):,} rows")
# Performance test: Complex filtering
def performance_test():
    # Method 1: Traditional boolean indexing
    start_time = time.time()
    bool_result = large_employees[
        (large_employees['salary'] > 80000) &
        (large_employees['age'] < 40) &
        (large_employees['experience'] > 5) &
        (large_employees['department'].isin(['Engineering', 'Marketing']))
    ]
    bool_time = time.time() - start_time
    # Method 2: query() method
    start_time = time.time()
    query_result = large_employees.query(
        'salary > 80000 and age < 40 and experience > 5 and department in ["Engineering", "Marketing"]'
    )
    query_time = time.time() - start_time
    print("\nPerformance comparison:")
    print(f"Boolean indexing: {bool_time:.4f}s, {len(bool_result):,} results")
    print(f"Query method: {query_time:.4f}s, {len(query_result):,} results")
    print(f"Results identical: {bool_result.equals(query_result)}")
    return bool_time, query_time
bool_time, query_time = performance_test()
Real-World Business Queries
# Business analytics with query()
sales_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=1000, freq='D'),
    'salesperson': np.random.choice(['Alice', 'Bob', 'Charlie', 'Diana'], 1000),
    'region': np.random.choice(['North', 'South', 'East', 'West'], 1000),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet'], 1000),
    'revenue': np.random.randint(1000, 10000, 1000),
    'units_sold': np.random.randint(1, 50, 1000),
    'customer_type': np.random.choice(['Enterprise', 'SMB', 'Individual'], 1000)
})
# Add calculated fields
sales_data['month'] = sales_data['date'].dt.month
sales_data['quarter'] = sales_data['date'].dt.quarter
sales_data['revenue_per_unit'] = sales_data['revenue'] / sales_data['units_sold']
print("Sample sales data:")
print(sales_data.head())
# Business intelligence queries
print("\n=== Business Intelligence Queries ===")
# Q1: High-performing sales in Q1
q1_high_performance = sales_data.query(
    'quarter == 1 and revenue > 5000 and units_sold > 10'
)
print(f"\nQ1 high-performance sales: {len(q1_high_performance)} records")
print(q1_high_performance.groupby('salesperson')['revenue'].sum().sort_values(ascending=False))
# Q2: Enterprise deals with high unit value
enterprise_premium = sales_data.query(
    'customer_type == "Enterprise" and revenue_per_unit > 200'
)
print(f"\nPremium enterprise deals: {len(enterprise_premium)} records")
print(enterprise_premium.groupby('product')['revenue'].agg(['count', 'sum', 'mean']))
# Q3: Regional performance analysis
def analyze_regional_performance(region_name, min_revenue=3000):
    """Analyze performance for a specific region"""
    region_query = sales_data.query(
        f'region == "{region_name}" and revenue >= {min_revenue}'
    )
    if len(region_query) == 0:
        return f"No high-performance sales found in {region_name}"
    analysis = {
        'total_sales': len(region_query),
        'total_revenue': region_query['revenue'].sum(),
        'avg_revenue': region_query['revenue'].mean(),
        'top_salesperson': region_query.groupby('salesperson')['revenue'].sum().idxmax(),
        'best_product': region_query.groupby('product')['revenue'].sum().idxmax()
    }
    return analysis
# Analyze each region
for region in ['North', 'South', 'East', 'West']:
    analysis = analyze_regional_performance(region)
    print(f"\n{region} Region Analysis:")
    if isinstance(analysis, dict):
        for key, value in analysis.items():
            print(f"  {key}: {value}")
    else:
        print(f"  {analysis}")
Query with Time Series Data
# Time-based queries
time_series_data = sales_data.copy()
time_series_data['day_of_week'] = time_series_data['date'].dt.day_name()
time_series_data['is_weekend'] = time_series_data['date'].dt.weekday >= 5
# Time-based filtering
recent_data = time_series_data.query('date >= "2024-11-01"')
print(f"\nRecent data (Nov 2024 onwards): {len(recent_data)} records")
# Weekend vs weekday analysis
weekend_analysis = time_series_data.query('is_weekend == True').groupby('product')['revenue'].mean()
weekday_analysis = time_series_data.query('is_weekend == False').groupby('product')['revenue'].mean()
comparison = pd.DataFrame({
    'weekend_avg': weekend_analysis,
    'weekday_avg': weekday_analysis
})
comparison['weekend_premium'] = ((comparison['weekend_avg'] / comparison['weekday_avg']) - 1) * 100
print("\nWeekend vs Weekday Revenue Analysis:")
print(comparison.round(2))
# Seasonal patterns
seasonal_query = time_series_data.query(
    'month in [6, 7, 8] and customer_type == "Individual"'  # Summer individual sales
)
print(f"\nSummer individual sales: {len(seasonal_query)} records")
print("Average revenue by product:")
print(seasonal_query.groupby('product')['revenue'].mean().sort_values(ascending=False))
Advanced Query Techniques
# Quantile-based queries
revenue_quantiles = sales_data['revenue'].quantile([0.25, 0.5, 0.75, 0.9])
print("Revenue quantiles:")
print(revenue_quantiles)
# Top and bottom performers
top_10_percent = sales_data.query(f'revenue >= {revenue_quantiles[0.9]}')
bottom_25_percent = sales_data.query(f'revenue <= {revenue_quantiles[0.25]}')
print(f"\nTop 10% sales: {len(top_10_percent)} records")
print(f"Bottom 25% sales: {len(bottom_25_percent)} records")
# Outlier detection with query
def detect_outliers(df, column, method='iqr'):
    """Detect outliers in a numeric column using the query method."""
    if method == 'iqr':
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df.query(f'{column} < {lower_bound} or {column} > {upper_bound}')
        return outliers
    elif method == 'zscore':
        mean_val = df[column].mean()
        std_val = df[column].std()
        threshold = 3
        outliers = df.query(f'abs(({column} - {mean_val}) / {std_val}) > {threshold}')
        return outliers
    raise ValueError(f"Unknown outlier detection method: {method}")
# Detect revenue outliers
revenue_outliers = detect_outliers(sales_data, 'revenue', 'iqr')
print(f"\nRevenue outliers detected: {len(revenue_outliers)}")
print(revenue_outliers[['salesperson', 'product', 'revenue']].head())
# Complex aggregation with query
def complex_business_analysis():
    """Perform complex business analysis using queries"""
    # High-value enterprise deals
    premium_deals = sales_data.query(
        'customer_type == "Enterprise" and revenue > 7000'
    )
    # Cross-sell opportunities: more than one product type sold to the same
    # customer segment on the same day
    cross_sell_customers = sales_data.query(
        'customer_type in ["Enterprise", "SMB"]'
    ).groupby(['date', 'customer_type']).agg({
        'product': 'nunique',
        'revenue': 'sum'
    }).query('product > 1')
    print(f"Premium enterprise deals: {len(premium_deals)}")
    print(f"Cross-sell opportunities: {len(cross_sell_customers)}")
    return premium_deals, cross_sell_customers
premium_deals, cross_sell_ops = complex_business_analysis()
Query Best Practices and Tips
def query_best_practices():
    """Best practices for using the pandas query method"""
    practices = {
        'Readability': [
            "Use meaningful variable names with @",
            "Break complex queries into multiple steps",
            "Use parentheses to clarify precedence",
            "Consider multiline strings for very complex queries"
        ],
        'Performance': [
            "query() can be faster for complex conditions on large DataFrames",
            "Use the numexpr engine for numerical operations",
            "Index your DataFrames when doing repeated queries",
            "Consider caching results of expensive queries"
        ],
        'Safety': [
            "Always use @ for external variables",
            "Validate that column names exist before querying",
            "Handle edge cases (empty results, NaN values)",
            "Be careful with string operations and special characters"
        ],
        # These pitfalls are demonstrated in the short sketch after this block.
        'Common Pitfalls': [
            "Don't use 'and'/'or' with boolean Series outside query() (use & and |)",
            "String accessor operations (.str) require engine='python'",
            "Column names with spaces need backticks",
            "NaN values need special handling (use notna())"
        ]
    }
    print("=== QUERY METHOD BEST PRACTICES ===")
    for category, tips in practices.items():
        print(f"\n{category}:")
        for tip in tips:
            print(f"  ✅ {tip}")
query_best_practices()
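To make the common pitfalls concrete, here is a minimal sketch against the employees DataFrame from earlier; the renamed 'annual bonus' column and the injected NaN are illustrative assumptions, not part of the original data:
# Sketch of the common pitfalls (the 'annual bonus' column and the NaN are assumptions)
pitfall_df = employees.rename(columns={'bonus': 'annual bonus'})
pitfall_df['salary'] = pitfall_df['salary'].astype(float)
pitfall_df.loc[0, 'salary'] = np.nan
# Column names with spaces need backticks inside query()
print(pitfall_df.query('`annual bonus` > 5000')[['name', 'annual bonus']])
# NaN values: filter explicitly with notna() (engine='python' for the method call)
print(pitfall_df.query('salary.notna() and salary > 75000', engine='python')[['name', 'salary']])
# Outside query(), combine boolean Series with & / |, never 'and'/'or'
print(pitfall_df[(pitfall_df['age'] < 35) & (pitfall_df['experience'] > 3)][['name', 'age', 'experience']])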
# Example of handling edge cases
def safe_query(df, query_string, default_value=None):
    """Safely execute a query with error handling"""
    try:
        result = df.query(query_string)
        if len(result) == 0:
            print(f"Warning: Query '{query_string}' returned no results")
            return pd.DataFrame() if default_value is None else default_value
        return result
    except Exception as e:
        print(f"Error executing query '{query_string}': {e}")
        return pd.DataFrame() if default_value is None else default_value
# Test safe query
safe_result = safe_query(employees, 'nonexistent_column > 100')
print(f"\nSafe query result shape: {safe_result.shape}")
Master Advanced Data Filtering
Explore advanced pandas indexing techniques, learn SQL-like operations in pandas, and discover performance optimization strategies.