
Python
Master grouping and aggregation to extract meaningful statistics from your data.
The `groupby()` method groups data by one or more columns, then applies operations to each group.
import pandas as pd
df = pd.DataFrame({
'department': ['Sales', 'IT', 'Sales', 'IT', 'HR'],
'name': ['Alice', 'Bob', 'Carol', 'David', 'Eve'],
'salary': [50000, 60000, 55000, 70000, 45000]
})
# Group by department
grouped = df.groupby('department')
# Get average salary per department
avg_salary = grouped['salary'].mean()
print(avg_salary)
# department
# HR 45000
# IT 65000
# Sales 52500import pandas as pd
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B', 'C'],
'value': [10, 15, 20, 25, 30]
})
grouped = df.groupby('category')
# Sum
print(grouped['value'].sum()) # Total per category
# Count
print(grouped['value'].count()) # Number of items
# Mean
print(grouped['value'].mean()) # Average
# Min/Max
print(grouped['value'].min()) # Minimum
print(grouped['value'].max()) # Maximum
# Std Dev
print(grouped['value'].std()) # Standard deviation
# Multiple aggregations
print(grouped['value'].agg(['sum', 'mean', 'count']))import pandas as pd
df = pd.DataFrame({
'year': [2023, 2023, 2023, 2024, 2024, 2024],
'quarter': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2'],
'sales': [100, 150, 120, 200, 180, 210]
})
# Group by year and quarter
grouped = df.groupby(['year', 'quarter'])['sales'].sum()
print(grouped)
# More readable output
summary = df.groupby(['year', 'quarter']).agg({'sales': ['sum', 'mean']})
print(summary)import pandas as pd
df = pd.DataFrame({
'department': ['Sales', 'IT', 'Sales', 'IT'],
'name': ['Alice', 'Bob', 'Carol', 'David'],
'salary': [50000, 60000, 55000, 70000]
})
# Single aggregation function
result = df.groupby('department')['salary'].agg(lambda x: x.max() - x.min())
print(result) # Salary range per department
# Multiple aggregations with custom names
result = df.groupby('department').agg({
'salary': ['min', 'max', 'mean'],
'name': 'count'
}).rename(columns={'name': 'employee_count'})import pandas as pd
# Sales data
sales = {
'date': ['2024-01-01', '2024-01-02', '2024-01-01', '2024-01-02',
'2024-01-03', '2024-01-03'],
'product': ['Laptop', 'Mouse', 'Laptop', 'Mouse', 'Keyboard', 'Mouse'],
'quantity': [2, 5, 1, 3, 2, 4],
'price_per_unit': [1000, 25, 1000, 25, 50, 25]
}
df = pd.DataFrame(sales)
df['total'] = df['quantity'] * df['price_per_unit']
# Total sales by product
product_totals = df.groupby('product')['total'].sum().sort_values(ascending=False)
print(product_totals)
# Sales by date with multiple metrics
daily_stats = df.groupby('date').agg({
'total': 'sum',
'quantity': 'sum',
'product': 'count' # Number of transactions
}).rename(columns={'product': 'transactions'})
print(daily_stats)
# Best-selling product (by quantity)
best_product = df.groupby('product')['quantity'].sum().idxmax()
print(f"Best seller: {best_product}")
# Average transaction value
avg_transaction = df['total'].mean()
print(f"Avg transaction: ${avg_transaction:.2f}")import pandas as pd
df = pd.DataFrame({
'department': ['Sales', 'Sales', 'IT', 'IT', 'HR', 'HR'],
'name': ['Alice', 'Bob', 'Carol', 'David', 'Eve', 'Frank'],
'salary': [50000, 55000, 60000, 70000, 45000, 48000]
})
# Filter groups with average salary > 55000
dept_avg = df.groupby('department')['salary'].mean()
high_pay_depts = dept_avg[dept_avg > 55000].index
result = df[df['department'].isin(high_pay_depts)]
print(result)import pandas as pd
df = pd.DataFrame({
'category': ['A', 'A', 'B', 'B'],
'value': [10, 20, 30, 40]
})
# Multiple operations on same column
result = df.groupby('category')['value'].agg({
'total': 'sum',
'average': 'mean',
'count': 'count',
'range': lambda x: x.max() - x.min()
})
print(result)
# Different operations on different columns
df['weight'] = [1, 2, 3, 4]
result = df.groupby('category').agg({
'value': 'sum',
'weight': 'mean'
})import pandas as pd
df = pd.DataFrame({
'department': ['Sales', 'Sales', 'IT', 'IT'],
'salary': [50000, 55000, 60000, 70000]
})
# Add department average salary as new column
df['dept_avg'] = df.groupby('department')['salary'].transform('mean')
# Standardize salary within each department
df['salary_normalized'] = df.groupby('department')['salary'].transform(
lambda x: (x - x.mean()) / x.std()
)| Method | Purpose | Example |
|---|---|---|
| `sum()` | Total | `grouped['value'].sum()` |
| `mean()` | Average | `grouped['value'].mean()` |
| `count()` | Count items | `grouped['value'].count()` |
| `min()` | Minimum | `grouped['value'].min()` |
| `max()` | Maximum | `grouped['value'].max()` |
| `agg()` | Custom | `grouped.agg({'col': ['sum', 'mean']})` |
| `transform()` | Apply within group | `grouped.transform(func)` |
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward