Navigation

Python

How to Use query() Method for Data Filtering

Transform complex boolean indexing into readable, SQL-like queries with pandas' powerful query() method - your gateway to intuitive data filtering.

Table Of Contents

Query Like a Database Pro

Tired of complex boolean expressions cluttering your code? Pandas' query() method brings SQL-like syntax to DataFrame filtering, making your data exploration both readable and efficient.

Query Method Fundamentals

import pandas as pd
import numpy as np

# Sample dataset assembled row-wise: one tuple per employee record.
_records = [
    ('Alice Johnson', 28, 75000, 'Engineering', 3, 5000, 'NYC'),
    ('Bob Smith', 35, 85000, 'Marketing', 8, 7000, 'LA'),
    ('Charlie Brown', 42, 95000, 'Engineering', 15, 12000, 'Chicago'),
    ('Diana Prince', 31, 70000, 'HR', 5, 4000, 'NYC'),
    ('Eve Wilson', 29, 80000, 'Marketing', 4, 6000, 'SF'),
]
employees = pd.DataFrame(
    _records,
    columns=['name', 'age', 'salary', 'department', 'experience', 'bonus', 'city'],
)

print("Original employee data:")
print(employees)

# Basic query examples
print("\n=== Basic Query Examples ===")

# Single comparison: an SQL-like filter on one column.
high_earners = employees.query('salary > 80000')
print("\nEmployees earning > $80,000:")
print(high_earners[['name', 'salary']])

# Conjunction: 'and' joins conditions inside the expression string.
senior_engineers = employees.query('department == "Engineering" and experience > 10')
print("\nSenior engineers:")
print(senior_engineers[['name', 'experience', 'department']])

# Disjunction: 'or' keeps rows matching either condition.
young_or_experienced = employees.query('age < 30 or experience > 10')
print("\nYoung or very experienced employees:")
print(young_or_experienced[['name', 'age', 'experience']])

Advanced Query Expressions

# Combine AND/OR groups; explicit parentheses make precedence unambiguous.
blended = employees.query(
    '(salary > 75000 and age < 35) or (department == "Engineering" and experience > 5)'
)
print("\nComplex query result:")
print(blended[['name', 'salary', 'age', 'department', 'experience']])

# Arithmetic is evaluated directly inside the expression string.
high_ratio = employees.query('salary / experience > 15000')
print("\nHigh salary-to-experience ratio:")
print(high_ratio[['name', 'salary', 'experience']])

# String accessor methods (.str.*) require the slower Python engine.
contains_o = employees.query('name.str.contains("o")', engine='python')
print("\nEmployees with 'o' in their name:")
print(contains_o[['name']])

# A chained comparison expresses an inclusive range in one shot.
mid_career = employees.query('5 <= experience <= 10')
print("\nMid-career employees (5-10 years experience):")
print(mid_career[['name', 'experience']])

# != excludes rows matching a single value.
outside_engineering = employees.query('department != "Engineering"')
print("\nNon-engineering employees:")
print(outside_engineering[['name', 'department']])

Variable Substitution in Queries

# External variables to reference from query strings
min_salary = 80000
target_department = 'Marketing'
max_age = 35

# Method 1: the @ prefix pulls values safely from the surrounding scope.
variable_query1 = employees.query('salary >= @min_salary and department == @target_department')
print(f"\n{target_department} employees earning >= ${min_salary:,}:")
print(variable_query1[['name', 'salary', 'department']])

# Method 2: interpolate with an f-string (risky if values are untrusted).
variable_query2 = employees.query(f'age <= {max_age} and salary > {min_salary}')
print(f"\nEmployees aged <= {max_age} with salary > ${min_salary:,}:")
print(variable_query2[['name', 'age', 'salary']])

# Membership test against a Python list, again via @.
target_cities = ['NYC', 'SF']
city_query = employees.query('city in @target_cities')
print(f"\nEmployees in {target_cities}:")
print(city_query[['name', 'city']])

# Dynamic query building
def build_employee_query(min_salary=None, departments=None, max_age=None):
    """Build and run a dynamic query against the module-level ``employees`` frame.

    Args:
        min_salary: Keep rows with ``salary >= min_salary`` (ignored if None).
        departments: Iterable of department names to keep (ignored if None/empty).
        max_age: Keep rows with ``age <= max_age`` (ignored if None).

    Returns:
        The filtered DataFrame, or the full ``employees`` frame when no
        condition was supplied.
    """
    conditions = []

    # Compare against None explicitly: the previous truthiness check would
    # silently drop legitimate zero values (e.g. min_salary=0).
    if min_salary is not None:
        conditions.append(f'salary >= {min_salary}')

    if departments:
        # NOTE: department names are interpolated into the query string.
        # Only use trusted values — a quote inside a name would break (or
        # alter) the generated expression.
        dept_list = "', '".join(departments)
        conditions.append(f"department in ['{dept_list}']")

    if max_age is not None:
        conditions.append(f'age <= {max_age}')

    if not conditions:
        return employees  # Return all if no conditions

    query_string = ' and '.join(conditions)
    print(f"Generated query: {query_string}")
    return employees.query(query_string)

# Exercise the dynamic builder with a salary floor and a department filter.
dynamic_result = build_employee_query(
    min_salary=75000,
    departments=['Engineering', 'Marketing'],
)
print("\nDynamic query result:")
print(dynamic_result[['name', 'salary', 'department']])

Performance Comparison: query() vs Boolean Indexing

import time

# Larger synthetic dataset for timing comparisons (seeded for repeatability).
np.random.seed(42)
N_ROWS = 100000
large_employees = pd.DataFrame({
    'employee_id': range(N_ROWS),
    'age': np.random.randint(22, 65, N_ROWS),
    'salary': np.random.randint(40000, 150000, N_ROWS),
    'department': np.random.choice(['Engineering', 'Marketing', 'HR', 'Finance'], N_ROWS),
    'experience': np.random.randint(0, 30, N_ROWS),
    'performance_score': np.random.uniform(1, 5, N_ROWS)
})

print(f"Large dataset created: {len(large_employees):,} rows")

# Performance test: Complex filtering
def performance_test():
    """Time the same four-way filter via boolean indexing and via query().

    Uses ``time.perf_counter`` (a monotonic, high-resolution clock meant for
    benchmarking) instead of ``time.time``, which has coarser resolution and
    can jump when the system clock is adjusted.

    Returns:
        Tuple of (boolean-indexing seconds, query() seconds).
    """
    # Method 1: traditional boolean indexing with chained masks.
    start_time = time.perf_counter()
    bool_result = large_employees[
        (large_employees['salary'] > 80000) &
        (large_employees['age'] < 40) &
        (large_employees['experience'] > 5) &
        (large_employees['department'].isin(['Engineering', 'Marketing']))
    ]
    bool_time = time.perf_counter() - start_time

    # Method 2: the same predicate expressed as a single query() string.
    start_time = time.perf_counter()
    query_result = large_employees.query(
        'salary > 80000 and age < 40 and experience > 5 and department in ["Engineering", "Marketing"]'
    )
    query_time = time.perf_counter() - start_time

    print(f"\nPerformance comparison:")
    print(f"Boolean indexing: {bool_time:.4f}s, {len(bool_result):,} results")
    print(f"Query method: {query_time:.4f}s, {len(query_result):,} results")
    print(f"Results identical: {len(bool_result) == len(query_result)}")

    return bool_time, query_time

bool_time, query_time = performance_test()

Real-World Business Queries

# Synthetic daily sales records for the business-analytics examples.
_n = 1000
_columns = {
    'date': pd.date_range('2024-01-01', periods=_n, freq='D'),
    'salesperson': np.random.choice(['Alice', 'Bob', 'Charlie', 'Diana'], _n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], _n),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet'], _n),
    'revenue': np.random.randint(1000, 10000, _n),
    'units_sold': np.random.randint(1, 50, _n),
    'customer_type': np.random.choice(['Enterprise', 'SMB', 'Individual'], _n),
}
sales_data = pd.DataFrame(_columns)

# Derived calendar and ratio columns used by the queries below.
sales_data['month'] = sales_data['date'].dt.month
sales_data['quarter'] = sales_data['date'].dt.quarter
sales_data['revenue_per_unit'] = sales_data['revenue'] / sales_data['units_sold']

print("Sample sales data:")
print(sales_data.head())

# Business intelligence queries
print("\n=== Business Intelligence Queries ===")

# Q1: strong first-quarter sales — high revenue AND high volume.
q1_high_performance = sales_data.query(
    'quarter == 1 and revenue > 5000 and units_sold > 10'
)
q1_by_rep = q1_high_performance.groupby('salesperson')['revenue'].sum()
print(f"\nQ1 high-performance sales: {len(q1_high_performance)} records")
print(q1_by_rep.sort_values(ascending=False))

# Q2: enterprise deals commanding a premium price per unit.
enterprise_premium = sales_data.query(
    'customer_type == "Enterprise" and revenue_per_unit > 200'
)
premium_by_product = enterprise_premium.groupby('product')['revenue']
print(f"\nPremium enterprise deals: {len(enterprise_premium)} records")
print(premium_by_product.agg(['count', 'sum', 'mean']))

# Q3: Regional performance analysis
def analyze_regional_performance(region_name, min_revenue=3000, data=None):
    """Summarize high-revenue sales for one region.

    Args:
        region_name: Region to analyze (matched against the 'region' column).
        min_revenue: Minimum revenue for a sale to count as high-performance.
        data: DataFrame to analyze; defaults to the module-level
            ``sales_data`` (backward-compatible with the original signature).

    Returns:
        A dict of summary statistics, or a message string when the region
        has no qualifying sales.
    """
    if data is None:
        data = sales_data

    # Use @-substitution instead of f-string interpolation: values are
    # passed safely even if region_name contains quotes or other
    # query-syntax characters (the article's own Safety tip).
    region_query = data.query(
        'region == @region_name and revenue >= @min_revenue'
    )

    if region_query.empty:
        return f"No high-performance sales found in {region_name}"

    analysis = {
        'total_sales': len(region_query),
        'total_revenue': region_query['revenue'].sum(),
        'avg_revenue': region_query['revenue'].mean(),
        'top_salesperson': region_query.groupby('salesperson')['revenue'].sum().idxmax(),
        'best_product': region_query.groupby('product')['revenue'].sum().idxmax()
    }

    return analysis

# Summarize each region in turn; a string result means nothing qualified.
for region in ('North', 'South', 'East', 'West'):
    analysis = analyze_regional_performance(region)
    print(f"\n{region} Region Analysis:")
    if not isinstance(analysis, dict):
        print(f"  {analysis}")
        continue
    for key, value in analysis.items():
        print(f"  {key}: {value}")

Query with Time Series Data

# Derive calendar features used for time-based filtering.
time_series_data = sales_data.copy()
time_series_data['day_of_week'] = time_series_data['date'].dt.day_name()
time_series_data['is_weekend'] = time_series_data['date'].dt.weekday >= 5

# ISO-format date strings compare naturally against datetime columns.
recent_data = time_series_data.query('date >= "2024-11-01"')
print(f"\nRecent data (Nov 2024 onwards): {len(recent_data)} records")

# Average revenue per product, split by weekend vs weekday.
weekend_avg_by_product = time_series_data.query('is_weekend == True').groupby('product')['revenue'].mean()
weekday_avg_by_product = time_series_data.query('is_weekend == False').groupby('product')['revenue'].mean()

comparison = pd.DataFrame({
    'weekend_avg': weekend_avg_by_product,
    'weekday_avg': weekday_avg_by_product
})
# Percentage premium of weekend revenue over weekday revenue.
comparison['weekend_premium'] = ((comparison['weekend_avg'] / comparison['weekday_avg']) - 1) * 100

print("\nWeekend vs Weekday Revenue Analysis:")
print(comparison.round(2))

# Seasonal slice: individual customers during the summer months.
seasonal_query = time_series_data.query(
    'month in [6, 7, 8] and customer_type == "Individual"'  # Summer individual sales
)
print(f"\nSummer individual sales: {len(seasonal_query)} records")
print("Average revenue by product:")
print(seasonal_query.groupby('product')['revenue'].mean().sort_values(ascending=False))

Advanced Query Techniques

# Quantile thresholds give data-driven cutoffs instead of magic numbers.
revenue_quantiles = sales_data['revenue'].quantile([0.25, 0.5, 0.75, 0.9])
print("Revenue quantiles:")
print(revenue_quantiles)

# Segment the extremes using the computed thresholds.
p90 = revenue_quantiles[0.9]
p25 = revenue_quantiles[0.25]
top_10_percent = sales_data.query(f'revenue >= {p90}')
bottom_25_percent = sales_data.query(f'revenue <= {p25}')

print(f"\nTop 10% sales: {len(top_10_percent)} records")
print(f"Bottom 25% sales: {len(bottom_25_percent)} records")

# Outlier detection with query
def detect_outliers(df, column, method='iqr'):
    """Return rows of ``df`` whose ``column`` value is an outlier.

    Args:
        df: DataFrame to scan.
        column: Name of the numeric column to test.
        method: 'iqr' (Tukey fences at 1.5 * IQR) or 'zscore' (|z| > 3).

    Returns:
        DataFrame of outlier rows (possibly empty).

    Raises:
        ValueError: If ``method`` is not 'iqr' or 'zscore'.  The original
        implementation silently returned None for unknown methods.
    """
    if method == 'iqr':
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Backticks protect column names containing spaces; @ passes the
        # numeric bounds without interpolating them into the string.
        return df.query(f'`{column}` < @lower_bound or `{column}` > @upper_bound')

    if method == 'zscore':
        mean_val = df[column].mean()
        std_val = df[column].std()
        threshold = 3

        return df.query(f'abs((`{column}` - @mean_val) / @std_val) > @threshold')

    raise ValueError(f"Unknown outlier method: {method!r} (expected 'iqr' or 'zscore')")

# Flag unusually large or small revenue values using the IQR fences.
revenue_outliers = detect_outliers(sales_data, 'revenue', method='iqr')
print(f"\nRevenue outliers detected: {len(revenue_outliers)}")
print(revenue_outliers[['salesperson', 'product', 'revenue']].head())

# Complex aggregation with query
def complex_business_analysis():
    """Run two illustrative multi-step analyses over ``sales_data``."""

    # High-value enterprise deals (revenue above $7,000).
    premium_deals = sales_data.query(
        'customer_type == "Enterprise" and revenue > 7000'
    )

    # Dates on which a business segment (Enterprise/SMB) bought more than
    # one distinct product type — a rough proxy for cross-sell activity.
    cross_sell_customers = (
        sales_data
        .query('customer_type in ["Enterprise", "SMB"]')
        .groupby(['date', 'customer_type'])
        .agg({'product': 'nunique', 'revenue': 'sum'})
        .query('product > 1')
    )

    print(f"Premium enterprise deals: {len(premium_deals)}")
    print(f"Cross-sell opportunities: {len(cross_sell_customers)}")

    return premium_deals, cross_sell_customers

premium_deals, cross_sell_ops = complex_business_analysis()

Query Best Practices and Tips

def query_best_practices():
    """Print a categorized checklist of query() usage tips."""

    # (category, tips) pairs; order controls the printed output.
    practices = [
        ('Readability', [
            "Use meaningful variable names with @",
            "Break complex queries into multiple steps",
            "Use parentheses to clarify precedence",
            "Consider multiline strings for very complex queries",
        ]),
        ('Performance', [
            "query() can be faster for complex conditions",
            "Use numexpr engine for numerical operations",
            "Index your DataFrames when doing repeated queries",
            "Consider caching results of expensive queries",
        ]),
        ('Safety', [
            "Always use @ for external variables",
            "Validate column names exist before querying",
            "Handle edge cases (empty results, NaN values)",
            "Be careful with string operations and special characters",
        ]),
        ('Common Pitfalls', [
            "Don't use 'and'/'or' with boolean Series (use & and |)",
            "String operations require engine='python'",
            "Column names with spaces need backticks",
            "NaN values need special handling (use notna())",
        ]),
    ]

    print("=== QUERY METHOD BEST PRACTICES ===")
    for category, tips in practices:
        print(f"\n{category}:")
        for tip in tips:
            print(f"  ✅ {tip}")

query_best_practices()

# Example of handling edge cases
def safe_query(df, query_string, default_value=None):
    """Run ``df.query`` and fall back to a default on errors or empty results.

    Returns the query result; if the query raises or matches no rows, prints
    a diagnostic and returns ``default_value`` (an empty DataFrame when None).
    """
    fallback = pd.DataFrame() if default_value is None else default_value
    try:
        result = df.query(query_string)
    except Exception as e:
        print(f"Error executing query '{query_string}': {e}")
        return fallback
    if len(result) == 0:
        print(f"Warning: Query '{query_string}' returned no results")
        return fallback
    return result

# Demonstrate graceful failure: the column does not exist, so the helper
# prints an error and hands back an empty frame instead of raising.
safe_result = safe_query(employees, 'nonexistent_column > 100')
print(f"\nSafe query result shape: {safe_result.shape}")

Master Advanced Data Filtering

Explore advanced pandas indexing techniques, learn SQL-like operations in pandas, and discover performance optimization strategies.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python