Ojasa Mirai

Ojasa Mirai

Python

Loading...

Learning Level

🟢 Beginner | 🔵 Advanced
Data Processing Overview | CSV Data Handling | Pandas Basics | DataFrames | Data Filtering | Aggregation & Grouping | Data Cleaning & Wrangling | NumPy Arrays | Data Visualization Basics
Python/Data Processing/Data Filtering

🔍 Advanced Data Filtering — Sophisticated Selection Techniques

Master complex filtering strategies, query optimization, and conditional operations.


🎯 Query Method for Complex Conditions

import pandas as pd

# Sample employee table used by all query() examples below.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'David'],
    'age': [25, 30, 28, 35],
    'salary': [50000, 60000, 55000, 70000],
    'department': ['Sales', 'IT', 'Sales', 'IT']
})

# Complex condition with query
result = df.query('age > 27 and salary > 55000')

# Using variables in query: @name references a Python variable in scope
min_age = 26
result = df.query('age > @min_age')

# Multiple conditions, grouped with parentheses
result = df.query('(age > 25 and salary > 55000) or department == "IT"')

# Query with string methods: .str accessors require engine='python'.
# The startswith() result is already boolean — no redundant .eq(True) needed.
result = df.query('name.str.startswith("A")', engine='python')

🔤 String and Text Filtering

import pandas as pd
import re

# Sample contact records for the string-filtering examples.
df = pd.DataFrame({
    'email': ['alice@example.com', 'bob@test.org', 'carol@example.com'],
    'text': ['Hello World', 'Hello there', 'Hi everyone']
})

# Substring match (case-sensitive); na=False treats missing values as no-match.
example_mask = df['email'].str.contains('@example.com', na=False)
result = df[example_mask]

# Regex match anchored at the start of the string.
result = df[df['email'].str.match(r'.*@example\.com$')]

# Stricter regex: a rough full-address validation pattern.
emails = df['email']
valid_emails = emails[emails.str.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')]

# Keep only rows whose text is longer than 10 characters.
result = df[df['text'].str.len() > 10]

# Combine two string conditions with element-wise AND.
has_example = df['email'].str.contains('@example', na=False)
says_hello = df['text'].str.contains('Hello', na=False)
result = df[has_example & says_hello]

📊 Numerical Range Filtering

import pandas as pd
import numpy as np

# Ten rows of random values in [1, 100) for the range-filtering demos.
df = pd.DataFrame({
    'id': range(1, 11),
    'value': np.random.randint(1, 100, 10)
})

# Keep values inside a fixed closed interval.
result = df[(df['value'] >= 20) & (df['value'] <= 80)]

# Interquartile range (IQR), the classic outlier yardstick.
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1

# Drop outliers: anything beyond 1.5 * IQR from the quartiles.
lower_fence = q1 - 1.5*iqr
upper_fence = q3 + 1.5*iqr
result = df[(df['value'] >= lower_fence) & (df['value'] <= upper_fence)]

# Z-score filtering: keep values within 3 standard deviations of the mean.
from scipy.stats import zscore
z_scores = np.abs(zscore(df['value']))
result = df[z_scores < 3]

# Trim the tails: keep values between the 5th and 95th percentiles
# (between() is inclusive on both ends by default).
percentile_5 = df['value'].quantile(0.05)
percentile_95 = df['value'].quantile(0.95)
result = df[df['value'].between(percentile_5, percentile_95)]

🎭 Conditional Assignment

import pandas as pd
import numpy as np

# Student scores and attendance for the conditional-assignment demos.
df = pd.DataFrame({
    'score': [45, 75, 85, 65, 95],
    'attendance': [80, 90, 100, 70, 100]
})

# Two-way conditional: 'A' at 80 or above, otherwise 'B'.
df['grade'] = np.where(df['score'] >= 80, 'A', 'B')

# Multi-way conditional: the first matching condition wins, 'F' is the fallback.
conditions = [df['score'] >= 90, df['score'] >= 80, df['score'] >= 70]
letters = ['A', 'B', 'C']
df['grade'] = np.select(conditions, letters, default='F')

# Case-when style: start from a default, then overwrite matching rows in order.
df['status'] = 'Pass'
df.loc[(df['score'] < 70) & (df['attendance'] < 75), 'status'] = 'At Risk'
df.loc[df['score'] < 60, 'status'] = 'Fail'

# Bin continuous scores into labelled bands with pd.cut.
df['score_band'] = pd.cut(
    df['score'],
    bins=[0, 50, 70, 90, 100],
    labels=['F', 'C', 'B', 'A']
)

🎯 Filtering with Custom Functions

import pandas as pd

# Records with one deliberately malformed email address.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'David'],
    'email': ['alice@test.com', 'invalid', 'carol@test.com', 'david@test.com']
})

def is_valid_email(email):
    """Return True when *email* matches a basic address pattern."""
    import re
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return bool(re.match(pattern, email))

# Keep only the rows whose email passes validation.
df_valid = df[df['email'].apply(is_valid_email)]

def is_good_record(row):
    """Row-level predicate spanning multiple columns."""
    return len(row['name']) > 3 and '@' in row['email']

# axis=1 applies the predicate to each row rather than each column.
df_filtered = df[df.apply(is_good_record, axis=1)]

🔀 Temporal Filtering

import pandas as pd

# 30 consecutive daily observations starting 2024-01-01.
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=30),
    'value': range(30)
})

# A DatetimeIndex enables label-based date slicing.
df.set_index('date', inplace=True)

# Slice by date range (label slices include both endpoints).
result = df.loc['2024-01-10':'2024-01-20']

# Rows from January only.
result = df[df.index.month == 1]

# Rows from the second half of the month.
result = df[df.index.day > 15]

# Business days only: Monday=0 ... Friday=4.
result = df[df.index.weekday < 5]

# Rolling window: everything within 7 days of the newest date.
last_7_days = df[df.index >= df.index.max() - pd.Timedelta(days=7)]

🔐 Null/NaN Filtering

import pandas as pd
import numpy as np

# Columns A and B contain missing values; C is complete.
df = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan, 5],
    'B': [10, 20, np.nan, 40, 50],
    'C': [100, 200, 300, 400, 500]
})

# Rows that are fully populated.
df_no_nulls = df.dropna()

# Rows complete in just the columns we care about.
df_complete = df.dropna(subset=['A', 'B'])

# Rows where a specific column IS missing.
df_with_null_A = df[df['A'].isna()]

# Keep a row only when it holds at least 2 non-null values.
df_threshold = df.dropna(thresh=2)

# Impute with the column mean, then keep the now-complete rows.
df['A_filled'] = df['A'].fillna(df['A'].mean())
df_after_fill = df[df['A_filled'].notna()]

📊 Membership and Category Filtering

import pandas as pd

# Regional sales per product for the membership examples.
df = pd.DataFrame({
    'product': ['A', 'B', 'C', 'D', 'E'],
    'region': ['North', 'South', 'East', 'West', 'North'],
    'sales': [100, 150, 120, 180, 110]
})

# Membership test against a list of wanted values.
target_products = ['A', 'C', 'E']
in_targets = df['product'].isin(target_products)
result = df[in_targets]

# Negate the same mask with ~ to express "NOT in the list".
result = df[~in_targets]

# Membership over a pair of columns at once.
target_pairs = [('A', 'North'), ('C', 'East')]
result = df[
    df.apply(lambda row: (row['product'], row['region']) in target_pairs, axis=1)
]

# groupby().filter keeps whole groups whose name passes the predicate.
regions_of_interest = ['North', 'South']
result = df.groupby('region').filter(lambda x: x.name in regions_of_interest)

🎯 Index-Based Filtering

import pandas as pd

# Small frame with a string index for the index-based filtering examples.
df = pd.DataFrame({
    'value': [10, 20, 30, 40, 50]
}, index=['a', 'b', 'c', 'd', 'e'])

# Filter by explicit index labels
result = df.loc[['a', 'c', 'e']]

# Filter by index range (label slices include both endpoints)
result = df.loc['b':'d']

# Filter by index position (stop position is excluded)
result = df.iloc[1:4]

# Filter by index condition
result = df[df.index.isin(['a', 'c', 'e'])]

# Filter using a named index.
# Fix: Index has no `.str.in(...)` method (and `in` is a Python keyword,
# so that call could never parse as written) — Index.isin is the
# membership test for index labels.
df.index.name = 'letter'
result = df[df.index.isin(['a', 'b'])]

🎨 Real-World Example: E-commerce Filtering

import pandas as pd
import numpy as np

# Eight transactions across four customers.
df = pd.DataFrame({
    'customer_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'date': pd.date_range('2024-01-01', periods=8),
    'amount': [100, 150, 200, 50, 75, 120, 300, 25],
    'category': ['Electronics', 'Books', 'Electronics', 'Food', 'Books', 'Electronics', 'Electronics', 'Food'],
    'status': ['completed', 'completed', 'completed', 'cancelled', 'completed', 'completed', 'completed', 'refunded']
})

# High-value completed transactions from the last 7 days.
recent_cutoff = df['date'].max() - pd.Timedelta(days=7)
is_high_value = df['amount'] > 100
is_completed = df['status'] == 'completed'
result = df[is_high_value & is_completed & (df['date'] >= recent_cutoff)]

# Electronics purchases that were neither cancelled nor refunded.
bad_statuses = ['cancelled', 'refunded']
result = df[(df['category'] == 'Electronics') & ~df['status'].isin(bad_statuses)]

# Transactions belonging to customers whose average spend exceeds 100.
high_value_customers = df.groupby('customer_id')['amount'].mean() > 100
result = df[df['customer_id'].isin(high_value_customers[high_value_customers].index)]

🔑 Key Takeaways

  • ✅ Use `query()` for readable complex conditions
  • ✅ Use regex for sophisticated string matching
  • ✅ Use `pd.cut()` for binning continuous values
  • ✅ Use `np.select()` for multiple conditions
  • ✅ Filter by temporal patterns with date indexing
  • ✅ Use `isin()` for membership testing
  • ✅ Chain conditions with `&`, `|`, `~` operators

Continue: Advanced Aggregation | Advanced Data Cleaning


Resources

Python Docs

Ojasa Mirai

Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.

Learn Deep • Build Real • Verify Skills • Launch Forward

Courses

Python | FastAPI | ReactJS | Cloud

© 2026 Ojasa Mirai. All rights reserved.

TwitterGitHubLinkedIn