
Python — Advanced Pandas Filtering
Master complex filtering strategies, query optimization, and conditional operations.
# --- Query-based filtering ---------------------------------------------------
# Demonstrates DataFrame.query(): boolean expressions, @variable references,
# compound conditions, and string-accessor methods (python engine).
import pandas as pd

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'David'],
    'age': [25, 30, 28, 35],
    'salary': [50000, 60000, 55000, 70000],
    'department': ['Sales', 'IT', 'Sales', 'IT']
})

# Complex condition with query
result = df.query('age > 27 and salary > 55000')

# Using variables in query: @name refers to a variable in the calling scope
min_age = 26
result = df.query('age > @min_age')

# Multiple conditions: parentheses control precedence inside the expression
result = df.query('(age > 25 and salary > 55000) or department == "IT"')

# Query with string methods: .str accessors need the python engine
# (numexpr cannot evaluate them). startswith() already yields booleans,
# so the redundant .eq(True) has been dropped.
result = df.query('name.str.startswith("A")', engine='python')
# --- String-based filtering --------------------------------------------------
# Substring containment, regex matching, and string-length conditions.
import pandas as pd
import re

df = pd.DataFrame({
    'email': ['alice@example.com', 'bob@test.org', 'carol@example.com'],
    'text': ['Hello World', 'Hello there', 'Hi everyone']
})

# String contains (case-sensitive); na=False keeps missing values out
result = df[df['email'].str.contains('@example.com', na=False)]

# Regex filtering: match() anchors the pattern at the start of the string
result = df[df['email'].str.match(r'.*@example\.com$')]

# Complex regex: simple structural validation of email addresses
emails = df['email']
valid_emails = emails[emails.str.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')]

# String length filtering
result = df[df['text'].str.len() > 10]

# Multiple string conditions combined with & (each side parenthesized)
result = df[
    (df['email'].str.contains('@example', na=False)) &
    (df['text'].str.contains('Hello', na=False))
]
# --- Numeric range and outlier filtering -------------------------------------
# Fixed ranges, IQR-based outlier removal, z-scores, and percentile trimming.
import pandas as pd
import numpy as np
from scipy.stats import zscore

df = pd.DataFrame({
    'id': range(1, 11),
    'value': np.random.randint(1, 100, 10)
})

# Simple range
result = df[(df['value'] >= 20) & (df['value'] <= 80)]

# Quantile-based filtering
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1
# Remove outliers (values beyond 1.5*IQR from the quartiles)
result = df[
    (df['value'] >= q1 - 1.5*iqr) &
    (df['value'] <= q3 + 1.5*iqr)
]

# Z-score filtering: drop rows more than 3 standard deviations from the mean
z_scores = np.abs(zscore(df['value']))
result = df[z_scores < 3]

# Percentile filtering: trim the extreme 5% on each tail
percentile_5 = df['value'].quantile(0.05)
percentile_95 = df['value'].quantile(0.95)
result = df[(df['value'] >= percentile_5) & (df['value'] <= percentile_95)]
import numpy as np
df = pd.DataFrame({
'score': [45, 75, 85, 65, 95],
'attendance': [80, 90, 100, 70, 100]
})
# Simple conditional assignment
df['grade'] = np.where(df['score'] >= 80, 'A', 'B')
# Multiple conditions
df['grade'] = np.select(
[df['score'] >= 90, df['score'] >= 80, df['score'] >= 70],
['A', 'B', 'C'],
default='F'
)
# Using case-when logic
df['status'] = 'Pass'
df.loc[(df['score'] < 70) & (df['attendance'] < 75), 'status'] = 'At Risk'
df.loc[df['score'] < 60, 'status'] = 'Fail'
# Using pd.cut for binning
df['score_band'] = pd.cut(
df['score'],
bins=[0, 50, 70, 90, 100],
labels=['F', 'C', 'B', 'A']
)import pandas as pd
# --- Custom filtering functions ----------------------------------------------
# Row-wise and element-wise predicates applied via Series.apply / DataFrame.apply.
import pandas as pd
import re

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'David'],
    'email': ['alice@test.com', 'invalid', 'carol@test.com', 'david@test.com']
})

# Compiled once here instead of re-importing re / rebuilding the pattern
# on every call (the original did both inside the function body).
_EMAIL_RE = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')

def is_valid_email(email):
    """Return True when *email* matches a simple structural email pattern."""
    return bool(_EMAIL_RE.match(email))

# Element-wise filter on a single column
df_valid = df[df['email'].apply(is_valid_email)]

def is_good_record(row):
    """Return True when the row's name is longer than 3 chars and its email contains '@'."""
    return len(row['name']) > 3 and '@' in row['email']

# Row-wise filter across multiple columns (axis=1 passes each row)
df_filtered = df[df.apply(is_good_record, axis=1)]
# --- Datetime-based filtering ------------------------------------------------
# Filtering on a DatetimeIndex: label slices, calendar components, recency.
import pandas as pd

# Time series data
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=30),
    'value': range(30)
})

# Set index to datetime so date-based selection works on the index
df.set_index('date', inplace=True)

# Filter by date range: use .loc for label slicing (both endpoints inclusive);
# plain df['a':'b'] row slicing works but .loc is the recommended form
result = df.loc['2024-01-10':'2024-01-20']

# Filter by month
result = df[df.index.month == 1]

# Filter by specific days of the month
result = df[df.index.day > 15]

# Filter by business days only (weekday: Mon=0 .. Fri=4)
result = df[df.index.weekday < 5]

# Last N days: >= (max - 7 days) is inclusive, so this keeps 8 calendar days
last_7_days = df[df.index >= df.index.max() - pd.Timedelta(days=7)]
# --- Null-aware filtering ----------------------------------------------------
# Dropping, selecting, and filling rows based on missing values.
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan, 5],
    'B': [10, 20, np.nan, 40, 50],
    'C': [100, 200, 300, 400, 500]
})

# Keep only rows with no nulls in any column
df_no_nulls = df.dropna()

# Drop rows that have a null in either of these specific columns
df_complete = df.dropna(subset=['A', 'B'])

# Select rows where a specific column IS null
df_with_null_A = df[df['A'].isna()]

# Keep rows with at least 2 non-null values
df_threshold = df.dropna(thresh=2)

# Custom null handling: fill with the column mean, then filter on the result
df['A_filled'] = df['A'].fillna(df['A'].mean())
df_after_fill = df[df['A_filled'].notna()]  # notna() matches the isna() used above
# --- Membership-based filtering ----------------------------------------------
# isin() on single columns, tuple membership across columns, group-level filter.
import pandas as pd

df = pd.DataFrame({
    'product': ['A', 'B', 'C', 'D', 'E'],
    'region': ['North', 'South', 'East', 'West', 'North'],
    'sales': [100, 150, 120, 180, 110]
})

# Filter by list of values
target_products = ['A', 'C', 'E']
result = df[df['product'].isin(target_products)]

# Inverse: rows NOT in the list
result = df[~df['product'].isin(target_products)]

# Multiple-column membership: test (product, region) pairs together
target_pairs = [('A', 'North'), ('C', 'East')]
result = df[
    df.apply(lambda row: (row['product'], row['region']) in target_pairs, axis=1)
]

# Category-based filtering: keep entire groups whose key is of interest
regions_of_interest = ['North', 'South']
result = df.groupby('region').filter(lambda x: x.name in regions_of_interest)
# --- Index-based filtering ---------------------------------------------------
# Selecting rows by index labels, positions, and index-level conditions.
import pandas as pd

df = pd.DataFrame({
    'value': [10, 20, 30, 40, 50]
}, index=['a', 'b', 'c', 'd', 'e'])

# Filter by explicit index labels
result = df.loc[['a', 'c', 'e']]

# Filter by index label range (both endpoints inclusive)
result = df.loc['b':'d']

# Filter by index position (end exclusive)
result = df.iloc[1:4]

# Filter by index condition
result = df[df.index.isin(['a', 'c', 'e'])]

# Name the index, then filter by membership.
# Fixed: the original called df.index.str.in([...]), which does not exist
# and raises AttributeError -- Index.isin() is the correct API.
df.index.name = 'letter'
result = df[df.index.isin(['a', 'b'])]
# --- Real-world scenario: customer transactions ------------------------------
# Combining value, status, recency, category, and group-level conditions.
import pandas as pd
import numpy as np

# Customer transactions
df = pd.DataFrame({
    'customer_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'date': pd.date_range('2024-01-01', periods=8),
    'amount': [100, 150, 200, 50, 75, 120, 300, 25],
    'category': ['Electronics', 'Books', 'Electronics', 'Food', 'Books', 'Electronics', 'Electronics', 'Food'],
    'status': ['completed', 'completed', 'completed', 'cancelled', 'completed', 'completed', 'completed', 'refunded']
})

# High-value completed transactions within 7 days of the latest date
result = df[
    (df['amount'] > 100) &
    (df['status'] == 'completed') &
    (df['date'] >= df['date'].max() - pd.Timedelta(days=7))
]

# Electronics purchases, excluding cancelled/refunded
result = df[
    (df['category'] == 'Electronics') &
    (~df['status'].isin(['cancelled', 'refunded']))
]

# All transactions of customers whose average transaction exceeds 100:
# boolean Series indexed by customer_id -> keep the True index labels
high_value_customers = df.groupby('customer_id')['amount'].mean() > 100
result = df[df['customer_id'].isin(high_value_customers[high_value_customers].index)]

# Continue: Advanced Aggregation | Advanced Data Cleaning
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward