
Advanced pandas (Python): master advanced indexing, categorical data, multi-level operations, and optimization techniques.
import pandas as pd
import numpy as np
# --- MultiIndex basics --------------------------------------------------
# Build a two-level (Quarter, Product) index and a DataFrame on top of it.
arrays = [
    ['Q1', 'Q1', 'Q2', 'Q2'],
    ['Product A', 'Product B', 'Product A', 'Product B'],
]
index = pd.MultiIndex.from_arrays(arrays, names=['Quarter', 'Product'])
df = pd.DataFrame({
    'Sales': [100, 150, 120, 180],
    'Profit': [20, 30, 25, 40],
}, index=index)
print(df)

# Access multi-level data.
print(df.loc['Q1'])                 # every row under Q1
print(df.loc[('Q1', 'Product A')])  # one specific (Quarter, Product) pair

# unstack() pivots the inner index level into columns (pivot-table view).
pivoted = df.unstack()
print(pivoted)

# stack() folds the columns back into the inner index level.
# (The duplicate `import pandas as pd` that was fused onto this line has
# been removed — pandas is already imported at the top of the file.)
stacked = pivoted.stack()
# --- GroupBy aggregation patterns --------------------------------------
df = pd.DataFrame({
    'department': ['Sales', 'Sales', 'IT', 'IT', 'HR', 'HR'],
    'employee': ['Alice', 'Bob', 'Carol', 'David', 'Eve', 'Frank'],
    'salary': [50000, 55000, 60000, 70000, 45000, 48000],
    'years': [2, 5, 3, 8, 1, 4],
})

# Named aggregations: one flat output column per (source column, function).
result = df.groupby('department').agg(
    avg_salary=('salary', 'mean'),
    total_salary=('salary', 'sum'),
    emp_count=('employee', 'count'),
    max_years=('years', 'max'),
)

# Multiple functions per column -> hierarchical (MultiIndex) result columns.
result = df.groupby('department').agg({
    'salary': ['mean', 'std', 'min', 'max'],
    'years': ['mean', 'count'],
})


def salary_range(x):
    """Return the spread (max - min) of a numeric Series."""
    return x.max() - x.min()


# (name, func) tuples give the output columns explicit names.
result = df.groupby('department')['salary'].agg([
    ('avg', 'mean'),
    ('range', salary_range),
    ('count', 'size'),
])


def top_earner(group):
    """Return the single highest-paid row of a department group."""
    return group.nlargest(1, 'salary')


# apply() runs arbitrary per-group logic and concatenates the results.
top_per_dept = df.groupby('department').apply(top_earner)
print(top_per_dept)

# Optimize memory and performance with categorical data types.
import pandas as pd

# --- Categorical dtypes: memory and speed for low-cardinality columns ---
df = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green', 'blue', 'red'],
    'size': ['S', 'M', 'L', 'S', 'M', 'L'],
    'price': [10, 20, 30, 10, 20, 30],
})

# Convert to categorical: values are stored once plus small integer codes.
df['color'] = df['color'].astype('category')
# BUG FIX: Series.astype() no longer accepts `categories=` / `ordered=`
# keywords (removed in pandas 1.0) — pass a CategoricalDtype instead.
size_dtype = pd.CategoricalDtype(categories=['S', 'M', 'L'], ordered=True)
df['size'] = df['size'].astype(size_dtype)
print(df.memory_usage(deep=True))  # categorical columns use far less memory

# Categorical accessors.
print(df['color'].cat.categories)  # the unique labels
print(df['color'].cat.codes)       # the underlying integer codes

# Add a new (currently unused) category.
df['color'] = df['color'].cat.add_categories(['yellow'])
# Rename categories (values in the column are relabelled too).
df['color'] = df['color'].cat.rename_categories({
    'red': 'RED',
    'blue': 'BLUE',
})
# Drop an unused category again.
df['color'] = df['color'].cat.remove_categories(['yellow'])
# --- Index types: DatetimeIndex, MultiIndex, named custom Index ---------
dates = pd.date_range('2024-01-01', periods=5)
df = pd.DataFrame({
    'value': [100, 110, 105, 115, 120],
}, index=dates)

# DatetimeIndex supports partial-string slicing and attribute filtering.
print(df['2024-01-02':'2024-01-04'])  # inclusive date-range slice
print(df[df.index.day > 2])           # boolean filter on a date component

# MultiIndex from the cartesian product of two label lists.
index = pd.MultiIndex.from_product(
    [['A', 'B'], ['X', 'Y']],
    names=['Group', 'Type'],
)
df = pd.DataFrame({'value': [1, 2, 3, 4]}, index=index)
# xs() selects a cross-section at one named level.
print(df.xs('A', level='Group'))

# A plain Index with a name, used as row labels.
# (The duplicate `import pandas as pd` fused onto the last line of this
# snippet has been removed — pandas is already imported above.)
custom_idx = pd.Index(['product_1', 'product_2', 'product_3'], name='product_id')
df = pd.DataFrame({'sales': [100, 150, 120]}, index=custom_idx)
import numpy as np

# --- Performance strategies, slowest to fastest -------------------------
# Strategy 1: itertuples instead of iterrows.
df = pd.DataFrame({'A': range(1000), 'B': range(1000, 2000)})
# Slow: iterrows materialises every row as a Series.
for idx, row in df.iterrows():
    df.loc[idx, 'C'] = row['A'] + row['B']
# Fast: itertuples yields lightweight namedtuples.
df['C'] = [row.A + row.B for row in df.itertuples()]

# Strategy 2: apply with a row-wise function (better, still Python-level).
df = pd.DataFrame({'A': range(100), 'B': range(100, 200)})
df['C'] = df.apply(lambda row: row['A'] * row['B'], axis=1)

# Strategy 3: vectorized operations (fastest — runs in C).
df['C'] = df['A'] * df['B']

# Strategy 4: reduce DataFrame size with narrower dtypes.
# BUG FIX: the original converted 'category'/'timestamp'/'value' columns
# that do not exist in `df` (KeyError); demonstrate on a frame that has them.
df_small = pd.DataFrame({
    'category': ['a', 'b', 'a'],
    'timestamp': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'value': [1.5, 2.5, 3.5],
})
df_small['category'] = df_small['category'].astype('category')
df_small['timestamp'] = pd.to_datetime(df_small['timestamp'])
df_small['value'] = df_small['value'].astype('float32')

# Strategy 5: query() for readable complex filters.
# Bounds adjusted to the current 100-row frame — the original
# 'A > 500 and B < 1500' targeted the 1000-row frame and matched nothing here.
result = df.query('A > 50 and B < 180')
# --- Time series: resampling, rolling windows, shifting -----------------
dates = pd.date_range('2024-01-01', periods=100, freq='D')
df = pd.DataFrame({
    'value': range(100),
}, index=dates)

# Resampling changes the frequency and aggregates each bucket.
daily = df.resample('D').mean()
weekly = df.resample('W').sum()
monthly = df.resample('M').max()  # NOTE: alias 'M' is deprecated for 'ME' in pandas >= 2.2

# Rolling 7-day window statistics (the first 6 rows are NaN).
df['rolling_mean'] = df['value'].rolling(window=7).mean()
df['rolling_std'] = df['value'].rolling(window=7).std()

# shift() gives the previous observation; diff() the period-over-period change.
df['prev_value'] = df['value'].shift(1)
df['change'] = df['value'].diff()

# Partial-string slicing on the DatetimeIndex (inclusive of both ends).
recent = df['2024-01-15':'2024-01-31']
# --- Joining DataFrames -------------------------------------------------
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C'],
    'value1': [1, 2, 3],
})
df2 = pd.DataFrame({
    'key': ['A', 'B', 'D'],
    'value2': [10, 20, 40],
})

# The four join types, by which side's keys are kept.
inner = pd.merge(df1, df2, on='key', how='inner')  # keys in both: A, B
outer = pd.merge(df1, df2, on='key', how='outer')  # all keys: A, B, C, D
left = pd.merge(df1, df2, on='key', how='left')    # df1's keys: A, B, C
right = pd.merge(df1, df2, on='key', how='right')  # df2's keys: A, B, D

# join() merges on the index instead of a column.
df1_idx = df1.set_index('key')
df2_idx = df2.set_index('key')
result = df1_idx.join(df2_idx, how='inner')

# concat with keys= stacks the frames under an extra outer index level.
combined = pd.concat([df1, df2], keys=['first', 'second'])
# --- Per-group window/transform calculations ----------------------------
df = pd.DataFrame({
    'group': ['A', 'A', 'A', 'B', 'B', 'B'],
    'value': [10, 20, 30, 15, 25, 35],
})

# Rank within each group (highest value gets rank 1).
df['rank'] = df.groupby('group')['value'].rank(ascending=False)

# Running total within each group.
df['cumsum'] = df.groupby('group')['value'].cumsum()

# transform() broadcasts a group aggregate back to every row,
# so row-level arithmetic against the group total/mean just works.
df['pct_of_group'] = (
    df['value'] / df.groupby('group')['value'].transform('sum') * 100
)
df['diff_from_mean'] = (
    df['value'] - df.groupby('group')['value'].transform('mean')
)
print(df)

# Continue: Advanced DataFrames | Advanced Data Filtering
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward