Ojasa Mirai

Ojasa Mirai

Python

Loading...

Learning Level

🟢 Beginner🔵 Advanced
Data Processing OverviewCSV Data HandlingPandas BasicsDataFramesData FilteringAggregation & GroupingData Cleaning & WranglingNumPy ArraysData Visualization Basics
Python/Data Processing/Aggregation Grouping

📊 Advanced Aggregation & Grouping — Complex Data Summarization

Master rolling windows, expanding operations, and sophisticated grouping patterns.


🎯 Rolling Window Operations

import pandas as pd
import numpy as np

# Time series data
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=20),
    'price': np.random.randint(100, 150, 20)
})

# Rolling mean
df['rolling_mean_5'] = df['price'].rolling(window=5).mean()

# Rolling sum
df['rolling_sum_3'] = df['price'].rolling(window=3).sum()

# Multiple rolling calculations at once
rolling = df['price'].rolling(window=5).agg(['mean', 'std', 'min', 'max'])

# Custom rolling function
def price_range(x):
    return x.max() - x.min()

df['rolling_range'] = df['price'].rolling(window=5).apply(price_range)

# Center the window
df['centered_mean'] = df['price'].rolling(window=5, center=True).mean()

# Min periods (require N non-null values)
df['rolling_mean_strict'] = df['price'].rolling(window=5, min_periods=5).mean()

📈 Expanding Window Operations

import pandas as pd

df = pd.DataFrame({
    'value': [10, 20, 15, 25, 30],
    'date': pd.date_range('2024-01-01', periods=5)
})

# Expanding mean (cumulative average)
df['expanding_mean'] = df['value'].expanding().mean()

# Expanding sum (cumulative sum)
df['expanding_sum'] = df['value'].expanding().sum()

# Expanding max (running maximum)
df['expanding_max'] = df['value'].expanding().max()

# Multiple expanding operations
df_expanded = df['value'].expanding().agg(['mean', 'std', 'count'])

🔄 Groupby with Rolling Windows

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'B'],
    'value': [10, 20, 15, 30, 25, 35]
})

# Rolling within groups
df['rolling_mean'] = df.groupby('group')['value'].rolling(window=2).mean().reset_index(0, drop=True)

# Expanding within groups
df['expanding_sum'] = df.groupby('group')['value'].expanding().sum().reset_index(0, drop=True)

# Rolling with multiple functions
rolling_stats = df.groupby('group')['value'].rolling(window=2).agg(['mean', 'sum']).reset_index(0, drop=True)

🎯 Custom Aggregation Functions

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'C', 'C'],
    'value': [10, 20, 15, 25, 30, 35]
})

# Define custom aggregation
def custom_agg(x):
    return {
        'sum': x.sum(),
        'mean': x.mean(),
        'std': x.std(),
        'coefficient_of_variation': x.std() / x.mean()
    }

result = df.groupby('group')['value'].apply(custom_agg).unstack()

# Multiple custom functions
def percentile_25(x):
    return x.quantile(0.25)

def percentile_75(x):
    return x.quantile(0.75)

result = df.groupby('group')['value'].agg([
    'mean',
    ('p25', percentile_25),
    ('p75', percentile_75),
    ('iqr', lambda x: x.quantile(0.75) - x.quantile(0.25))
])

🔀 Multi-Level Grouping

import pandas as pd

df = pd.DataFrame({
    'year': [2023, 2023, 2023, 2024, 2024, 2024],
    'region': ['North', 'South', 'North', 'South', 'North', 'South'],
    'sales': [100, 150, 120, 200, 180, 210]
})

# Group by multiple columns
result = df.groupby(['year', 'region'])['sales'].sum()

# Named aggregations
result = df.groupby(['year', 'region']).agg(
    total=('sales', 'sum'),
    average=('sales', 'mean'),
    count=('sales', 'count')
)

# Transform: add group values as new column
df['year_region_avg'] = df.groupby(['year', 'region'])['sales'].transform('mean')

# Rank within groups
df['rank'] = df.groupby('year')['sales'].rank(ascending=False)

📊 Time-Based Aggregation

import pandas as pd

df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=31),
    'value': range(1, 32)
})

df.set_index('date', inplace=True)

# Resample to weekly
weekly = df.resample('W').sum()

# Resample to monthly with multiple aggregations
monthly = df.resample('M').agg({
    'value': ['sum', 'mean', 'std']
})

# Resample with custom aggregation
quarterly = df.resample('Q').agg({
    'value': lambda x: x.nlargest(3).sum()  # Sum of top 3 values
})

# Resample with label and closed position
weekly = df.resample('W', label='left', closed='left').sum()

🎨 GroupBy Filter

import pandas as pd

df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'C', 'C'],
    'value': [10, 20, 15, 25, 30, 5]
})

# Keep only groups with sum > 30
result = df.groupby('group').filter(lambda x: x['value'].sum() > 30)

# Keep groups with mean > 20
result = df.groupby('group').filter(lambda x: x['value'].mean() > 20)

# Keep groups with more than 1 element where value > 15
result = df.groupby('group').filter(
    lambda x: (len(x) > 1) & (x['value'] > 15).any()
)

🎯 Pivoting with Aggregation

import pandas as pd

df = pd.DataFrame({
    'date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02'],
    'product': ['A', 'B', 'A', 'B'],
    'quantity': [10, 20, 15, 25],
    'price': [100, 50, 100, 50]
})

# Pivot with multiple aggregations
pivot = df.pivot_table(
    values=['quantity', 'price'],
    index='date',
    columns='product',
    aggfunc={'quantity': 'sum', 'price': 'mean'},
    fill_value=0
)

# Margins (totals)
pivot_margins = df.pivot_table(
    values='quantity',
    index='date',
    columns='product',
    aggfunc='sum',
    margins=True,
    margins_name='Total'
)

📊 Nlargest and Nsmallest

import pandas as pd

df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'B'],
    'value': [100, 150, 120, 200, 180, 210]
})

# Top N per group
top_per_group = df.groupby('group').apply(
    lambda x: x.nlargest(2, 'value')
).reset_index(drop=True)

# Bottom N per group
bottom_per_group = df.groupby('group').apply(
    lambda x: x.nsmallest(1, 'value')
).reset_index(drop=True)

🔑 Key Takeaways

  • ✅ Use rolling() for moving averages and statistics
  • ✅ Use expanding() for cumulative operations
  • ✅ Combine groupby with rolling for time series analysis
  • ✅ Use resample() for time-based aggregation
  • ✅ Use filter() to select groups meeting criteria
  • ✅ Use transform() to add group-level calculations to original rows
  • ✅ Use pivot_table() with margins for cross-tabulation

Continue: Advanced Data Cleaning | Advanced NumPy


Resources

Python Docs

Ojasa Mirai

Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.

Learn Deep • Build Real • Verify Skills • Launch Forward

Courses

PythonFastapiReactJSCloud

© 2026 Ojasa Mirai. All rights reserved.

TwitterGitHubLinkedIn