Table Of Contents
- Query Like a Database Pro
- Query Method Fundamentals
- Advanced Query Expressions
- Variable Substitution in Queries
- Performance Comparison: query() vs Boolean Indexing
- Real-World Business Queries
- Query with Time Series Data
- Advanced Query Techniques
- Query Best Practices and Tips
- Master Advanced Data Filtering
Query Like a Database Pro
Tired of complex boolean expressions cluttering your code? Pandas' query() method brings SQL-like syntax to DataFrame filtering, making your data exploration both readable and efficient.
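As a quick taste, here is a minimal sketch on a tiny, made-up DataFrame (not part of the examples below) showing the same filter written both ways:
import pandas as pd
# Tiny illustrative DataFrame (hypothetical data)
df = pd.DataFrame({'salary': [72000, 88000, 95000], 'age': [41, 29, 33]})
high_paid_young = df[(df['salary'] > 80000) & (df['age'] < 35)]  # boolean indexing
high_paid_young = df.query('salary > 80000 and age < 35')        # query(): reads like SQL
print(high_paid_young)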
Query Method Fundamentals
import pandas as pd
import numpy as np
# Create sample dataset
employees = pd.DataFrame({
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince', 'Eve Wilson'],
    'age': [28, 35, 42, 31, 29],
    'salary': [75000, 85000, 95000, 70000, 80000],
    'department': ['Engineering', 'Marketing', 'Engineering', 'HR', 'Marketing'],
    'experience': [3, 8, 15, 5, 4],
    'bonus': [5000, 7000, 12000, 4000, 6000],
    'city': ['NYC', 'LA', 'Chicago', 'NYC', 'SF']
})
print("Original employee data:")
print(employees)
# Basic query examples
print("\n=== Basic Query Examples ===")
# Simple condition
high_earners = employees.query('salary > 80000')
print("\nEmployees earning > $80,000:")
print(high_earners[['name', 'salary']])
# Multiple conditions with AND
senior_engineers = employees.query('department == "Engineering" and experience > 10')
print("\nSenior engineers:")
print(senior_engineers[['name', 'experience', 'department']])
# Multiple conditions with OR
young_or_experienced = employees.query('age < 30 or experience > 10')
print("\nYoung or very experienced employees:")
print(young_or_experienced[['name', 'age', 'experience']])
Advanced Query Expressions
# Complex logical combinations
complex_query = employees.query(
    '(salary > 75000 and age < 35) or (department == "Engineering" and experience > 5)'
)
print("\nComplex query result:")
print(complex_query[['name', 'salary', 'age', 'department', 'experience']])
# Using mathematical operations in queries
efficiency_query = employees.query('salary / experience > 15000')
print("\nHigh salary-to-experience ratio:")
print(efficiency_query[['name', 'salary', 'experience']])
# String operations in queries
name_query = employees.query('name.str.contains("o")', engine='python')
print("\nEmployees with 'o' in their name:")
print(name_query[['name']])
# Range queries
mid_career = employees.query('5 <= experience <= 10')
print("\nMid-career employees (5-10 years experience):")
print(mid_career[['name', 'experience']])
# Negation queries
not_engineering = employees.query('department != "Engineering"')
print("\nNon-engineering employees:")
print(not_engineering[['name', 'department']])
Variable Substitution in Queries
# Using variables in queries
min_salary = 80000
target_department = 'Marketing'
max_age = 35
# Method 1: Using @ symbol for variables
variable_query1 = employees.query('salary >= @min_salary and department == @target_department')
print(f"\n{target_department} employees earning >= ${min_salary:,}:")
print(variable_query1[['name', 'salary', 'department']])
# Method 2: Using f-strings (be careful with injection)
variable_query2 = employees.query(f'age <= {max_age} and salary > {min_salary}')
print(f"\nEmployees aged <= {max_age} with salary > ${min_salary:,}:")
print(variable_query2[['name', 'age', 'salary']])
# Using lists with isin()
target_cities = ['NYC', 'SF']
city_query = employees.query('city in @target_cities')
print(f"\nEmployees in {target_cities}:")
print(city_query[['name', 'city']])
# Dynamic query building
def build_employee_query(min_salary=None, departments=None, max_age=None):
    """Build a dynamic query string based on the supplied parameters."""
    conditions = []
    if min_salary is not None:
        conditions.append(f'salary >= {min_salary}')
    if departments:
        dept_list = "', '".join(departments)
        conditions.append(f"department in ['{dept_list}']")
    if max_age is not None:
        conditions.append(f'age <= {max_age}')
    if not conditions:
        return employees  # Return all rows if no conditions were given
    query_string = ' and '.join(conditions)
    print(f"Generated query: {query_string}")
    return employees.query(query_string)
# Test dynamic query
dynamic_result = build_employee_query(min_salary=75000, departments=['Engineering', 'Marketing'])
print("\nDynamic query result:")
print(dynamic_result[['name', 'salary', 'department']])
Performance Comparison: query() vs Boolean Indexing
import time
# Create larger dataset for performance testing
np.random.seed(42)
large_employees = pd.DataFrame({
    'employee_id': range(100000),
    'age': np.random.randint(22, 65, 100000),
    'salary': np.random.randint(40000, 150000, 100000),
    'department': np.random.choice(['Engineering', 'Marketing', 'HR', 'Finance'], 100000),
    'experience': np.random.randint(0, 30, 100000),
    'performance_score': np.random.uniform(1, 5, 100000)
})
print(f"Large dataset created: {len(large_employees):,} rows")
# Performance test: Complex filtering
def performance_test():
    # Method 1: Traditional boolean indexing
    start_time = time.time()
    bool_result = large_employees[
        (large_employees['salary'] > 80000) &
        (large_employees['age'] < 40) &
        (large_employees['experience'] > 5) &
        (large_employees['department'].isin(['Engineering', 'Marketing']))
    ]
    bool_time = time.time() - start_time
    # Method 2: query() method
    start_time = time.time()
    query_result = large_employees.query(
        'salary > 80000 and age < 40 and experience > 5 and department in ["Engineering", "Marketing"]'
    )
    query_time = time.time() - start_time
    print("\nPerformance comparison:")
    print(f"Boolean indexing: {bool_time:.4f}s, {len(bool_result):,} results")
    print(f"Query method: {query_time:.4f}s, {len(query_result):,} results")
    print(f"Results identical: {bool_result.equals(query_result)}")
    return bool_time, query_time
bool_time, query_time = performance_test()
Real-World Business Queries
# Business analytics with query()
sales_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=1000, freq='D'),
    'salesperson': np.random.choice(['Alice', 'Bob', 'Charlie', 'Diana'], 1000),
    'region': np.random.choice(['North', 'South', 'East', 'West'], 1000),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet'], 1000),
    'revenue': np.random.randint(1000, 10000, 1000),
    'units_sold': np.random.randint(1, 50, 1000),
    'customer_type': np.random.choice(['Enterprise', 'SMB', 'Individual'], 1000)
})
# Add calculated fields
sales_data['month'] = sales_data['date'].dt.month
sales_data['quarter'] = sales_data['date'].dt.quarter
sales_data['revenue_per_unit'] = sales_data['revenue'] / sales_data['units_sold']
print("Sample sales data:")
print(sales_data.head())
# Business intelligence queries
print("\n=== Business Intelligence Queries ===")
# Q1: High-performing sales in Q1
q1_high_performance = sales_data.query(
    'quarter == 1 and revenue > 5000 and units_sold > 10'
)
print(f"\nQ1 high-performance sales: {len(q1_high_performance)} records")
print(q1_high_performance.groupby('salesperson')['revenue'].sum().sort_values(ascending=False))
# Q2: Enterprise deals with high unit value
enterprise_premium = sales_data.query(
    'customer_type == "Enterprise" and revenue_per_unit > 200'
)
print(f"\nPremium enterprise deals: {len(enterprise_premium)} records")
print(enterprise_premium.groupby('product')['revenue'].agg(['count', 'sum', 'mean']))
# Q3: Regional performance analysis
def analyze_regional_performance(region_name, min_revenue=3000):
    """Analyze performance for a specific region"""
    region_query = sales_data.query(
        f'region == "{region_name}" and revenue >= {min_revenue}'
    )
    if len(region_query) == 0:
        return f"No high-performance sales found in {region_name}"
    analysis = {
        'total_sales': len(region_query),
        'total_revenue': region_query['revenue'].sum(),
        'avg_revenue': region_query['revenue'].mean(),
        'top_salesperson': region_query.groupby('salesperson')['revenue'].sum().idxmax(),
        'best_product': region_query.groupby('product')['revenue'].sum().idxmax()
    }
    return analysis
# Analyze each region
for region in ['North', 'South', 'East', 'West']:
    analysis = analyze_regional_performance(region)
    print(f"\n{region} Region Analysis:")
    if isinstance(analysis, dict):
        for key, value in analysis.items():
            print(f"  {key}: {value}")
    else:
        print(f"  {analysis}")
Query with Time Series Data
# Time-based queries
time_series_data = sales_data.copy()
time_series_data['day_of_week'] = time_series_data['date'].dt.day_name()
time_series_data['is_weekend'] = time_series_data['date'].dt.weekday >= 5
# Time-based filtering
recent_data = time_series_data.query('date >= "2024-11-01"')
print(f"\nRecent data (Nov 2024 onwards): {len(recent_data)} records")
# Weekend vs weekday analysis
weekend_analysis = time_series_data.query('is_weekend == True').groupby('product')['revenue'].mean()
weekday_analysis = time_series_data.query('is_weekend == False').groupby('product')['revenue'].mean()
comparison = pd.DataFrame({
    'weekend_avg': weekend_analysis,
    'weekday_avg': weekday_analysis
})
comparison['weekend_premium'] = ((comparison['weekend_avg'] / comparison['weekday_avg']) - 1) * 100
print("\nWeekend vs Weekday Revenue Analysis:")
print(comparison.round(2))
# Seasonal patterns
seasonal_query = time_series_data.query(
    'month in [6, 7, 8] and customer_type == "Individual"'  # Summer individual sales
)
print(f"\nSummer individual sales: {len(seasonal_query)} records")
print("Average revenue by product:")
print(seasonal_query.groupby('product')['revenue'].mean().sort_values(ascending=False))
Advanced Query Techniques
# Quantile-based queries
revenue_quantiles = sales_data['revenue'].quantile([0.25, 0.5, 0.75, 0.9])
print("Revenue quantiles:")
print(revenue_quantiles)
# Top and bottom performers
top_10_percent = sales_data.query(f'revenue >= {revenue_quantiles[0.9]}')
bottom_25_percent = sales_data.query(f'revenue <= {revenue_quantiles[0.25]}')
print(f"\nTop 10% sales: {len(top_10_percent)} records")
print(f"Bottom 25% sales: {len(bottom_25_percent)} records")
# Outlier detection with query
def detect_outliers(df, column, method='iqr'):
    """Detect outliers in a numeric column using the query method."""
    if method == 'iqr':
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df.query(f'{column} < {lower_bound} or {column} > {upper_bound}')
        return outliers
    elif method == 'zscore':
        mean_val = df[column].mean()
        std_val = df[column].std()
        threshold = 3
        outliers = df.query(f'abs(({column} - {mean_val}) / {std_val}) > {threshold}')
        return outliers
    raise ValueError(f"Unknown outlier detection method: {method}")
# Detect revenue outliers
revenue_outliers = detect_outliers(sales_data, 'revenue', 'iqr')
print(f"\nRevenue outliers detected: {len(revenue_outliers)}")
print(revenue_outliers[['salesperson', 'product', 'revenue']].head())
# Complex aggregation with query
def complex_business_analysis():
    """Perform complex business analysis using queries"""
    # High-value enterprise deals
    premium_deals = sales_data.query(
        'customer_type == "Enterprise" and revenue > 7000'
    )
    # Cross-sell opportunities: more than one product type sold to the same
    # customer segment on the same day
    cross_sell_customers = sales_data.query(
        'customer_type in ["Enterprise", "SMB"]'
    ).groupby(['date', 'customer_type']).agg({
        'product': 'nunique',
        'revenue': 'sum'
    }).query('product > 1')
    print(f"Premium enterprise deals: {len(premium_deals)}")
    print(f"Cross-sell opportunities: {len(cross_sell_customers)}")
    return premium_deals, cross_sell_customers
premium_deals, cross_sell_ops = complex_business_analysis()
Query Best Practices and Tips
def query_best_practices():
    """Best practices for using the pandas query method"""
    practices = {
        'Readability': [
            "Use meaningful variable names with @",
            "Break complex queries into multiple steps",
            "Use parentheses to clarify precedence",
            "Consider multiline strings for very complex queries"
        ],
        'Performance': [
            "query() can be faster for complex conditions on large DataFrames",
            "Use the numexpr engine for numerical operations",
            "Index your DataFrames when doing repeated queries",
            "Consider caching results of expensive queries"
        ],
        'Safety': [
            "Always use @ for external variables",
            "Validate that column names exist before querying",
            "Handle edge cases (empty results, NaN values)",
            "Be careful with string operations and special characters"
        ],
        # These pitfalls are demonstrated in the short sketch after this block.
        'Common Pitfalls': [
            "Don't use 'and'/'or' with boolean Series outside query() (use & and |)",
            "String accessor operations (.str) require engine='python'",
            "Column names with spaces need backticks",
            "NaN values need special handling (use notna())"
        ]
    }
    print("=== QUERY METHOD BEST PRACTICES ===")
    for category, tips in practices.items():
        print(f"\n{category}:")
        for tip in tips:
            print(f"  ✅ {tip}")
query_best_practices()
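To make the common pitfalls concrete, here is a minimal sketch against the employees DataFrame from earlier; the renamed 'annual bonus' column and the injected NaN are illustrative assumptions, not part of the original data:
# Sketch of the common pitfalls (the 'annual bonus' column and the NaN are assumptions)
pitfall_df = employees.rename(columns={'bonus': 'annual bonus'})
pitfall_df['salary'] = pitfall_df['salary'].astype(float)
pitfall_df.loc[0, 'salary'] = np.nan
# Column names with spaces need backticks inside query()
print(pitfall_df.query('`annual bonus` > 5000')[['name', 'annual bonus']])
# NaN values: filter explicitly with notna() (engine='python' for the method call)
print(pitfall_df.query('salary.notna() and salary > 75000', engine='python')[['name', 'salary']])
# Outside query(), combine boolean Series with & / |, never 'and'/'or'
print(pitfall_df[(pitfall_df['age'] < 35) & (pitfall_df['experience'] > 3)][['name', 'age', 'experience']])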
# Example of handling edge cases
def safe_query(df, query_string, default_value=None):
    """Safely execute a query with error handling"""
    try:
        result = df.query(query_string)
        if len(result) == 0:
            print(f"Warning: Query '{query_string}' returned no results")
            return pd.DataFrame() if default_value is None else default_value
        return result
    except Exception as e:
        print(f"Error executing query '{query_string}': {e}")
        return pd.DataFrame() if default_value is None else default_value
# Test safe query
safe_result = safe_query(employees, 'nonexistent_column > 100')
print(f"\nSafe query result shape: {safe_result.shape}")
Master Advanced Data Filtering
Explore advanced pandas indexing techniques, learn SQL-like operations in pandas, and discover performance optimization strategies.