
Python — Master pivot tables, complex reshaping, and advanced transformation techniques.
import pandas as pd
df = pd.DataFrame({
'region': ['North', 'North', 'South', 'South', 'East', 'East'],
'quarter': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2'],
'sales': [100, 120, 150, 180, 90, 110],
'profit': [20, 25, 30, 36, 18, 22]
})
# Basic pivot table
pivot = pd.pivot_table(
df,
values='sales',
index='region',
columns='quarter',
aggfunc='sum'
)
# Multiple aggregations
pivot = pd.pivot_table(
df,
values=['sales', 'profit'],
index='region',
columns='quarter',
aggfunc=['sum', 'mean', 'count']
)
# Custom aggregation
pivot = df.pivot_table(
values='sales',
index='region',
columns='quarter',
aggfunc={'sales': ['sum', 'count']}
)
# Margin totals
pivot = df.pivot_table(
values='sales',
index='region',
columns='quarter',
aggfunc='sum',
margins=True
)import pandas as pd
# Wide to long transformation
df_wide = pd.DataFrame({
'id': [1, 2, 3],
'2022': [100, 150, 120],
'2023': [110, 160, 125],
'2024': [120, 170, 130]
})
df_long = pd.melt(
df_wide,
id_vars=['id'],
var_name='year',
value_name='sales'
)
# Multiple value columns
df_wide = pd.DataFrame({
'id': [1, 2],
'sales_2022': [100, 150],
'profit_2022': [20, 30],
'sales_2023': [110, 160],
'profit_2023': [22, 32]
})
df_long = pd.melt(
df_wide,
id_vars=['id'],
value_vars=['sales_2022', 'profit_2022', 'sales_2023', 'profit_2023']
)
# Extract metric and year from variable name
df_long[['metric', 'year']] = df_long['variable'].str.split('_', expand=True)import pandas as pd
df1 = pd.DataFrame({
'key': ['A', 'B', 'C'],
'value1': [1, 2, 3]
})
df2 = pd.DataFrame({
'key': ['A', 'B', 'D'],
'value2': [10, 20, 40]
})
# Merge with different key columns
df3 = pd.DataFrame({
'id': ['A', 'B', 'C'],
'value3': [100, 200, 300]
})
result = df1.merge(df3, left_on='key', right_on='id', how='inner')
# Merge on multiple keys
df1_multi = pd.DataFrame({
'key1': ['A', 'A', 'B'],
'key2': [1, 2, 1],
'value': [10, 20, 30]
})
df2_multi = pd.DataFrame({
'key1': ['A', 'B'],
'key2': [1, 1],
'value2': [100, 200]
})
result = df1_multi.merge(df2_multi, on=['key1', 'key2'], how='left')
# Asof merge (nearest match)
df_left = pd.DataFrame({
'time': [1, 5, 10],
'value': [10, 20, 30]
})
df_right = pd.DataFrame({
'time': [1, 2, 3, 6, 7],
'price': [100, 105, 110, 120, 125]
})
result = pd.merge_asof(df_left, df_right, on='time', direction='nearest')import pandas as pd
df = pd.DataFrame({
'age_group': ['18-30', '18-30', '30-50', '30-50', '50+'],
'product': ['A', 'B', 'A', 'B', 'A'],
'purchase': [1, 1, 1, 0, 1]
})
# Simple crosstab
crosstab = pd.crosstab(df['age_group'], df['product'])
# Crosstab with values and aggregation
crosstab = pd.crosstab(
df['age_group'],
df['product'],
values=df['purchase'],
aggfunc='sum'
)
# Crosstab with margins
crosstab = pd.crosstab(
df['age_group'],
df['product'],
margins=True
)
# Normalize (percentages)
crosstab_pct = pd.crosstab(
df['age_group'],
df['product'],
normalize='index'
) * 100import pandas as pd
import numpy as np
def optimize_dataframe(df):
"""Optimize memory usage"""
for column in df.columns:
col_type = df[column].dtype
# Optimize integers
if col_type == 'int64':
c_min = df[column].min()
c_max = df[column].max()
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[column] = df[column].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[column] = df[column].astype(np.int16)
# Optimize floats
elif col_type == 'float64':
df[column] = df[column].astype(np.float32)
# Optimize objects (strings)
elif col_type == 'object':
num_unique = len(df[column].unique())
num_total = len(df[column])
if num_unique / num_total < 0.5:
df[column] = df[column].astype('category')
return df
# Usage
df = pd.DataFrame({
'id': range(1000),
'value': np.random.randn(1000),
'category': np.random.choice(['A', 'B', 'C'], 1000)
})
print(df.memory_usage(deep=True).sum() / 1024**2) # MB before
df = optimize_dataframe(df)
print(df.memory_usage(deep=True).sum() / 1024**2) # MB afterimport pandas as pd
from tqdm import tqdm
def process_large_dataframe(filepath, chunk_size=10000, processor=None):
"""Process large DataFrame with progress bar"""
total_rows = sum(1 for line in open(filepath)) - 1
results = []
with tqdm(total=total_rows) as pbar:
for chunk in pd.read_csv(filepath, chunksize=chunk_size):
if processor:
result = processor(chunk)
else:
result = chunk
results.append(result)
pbar.update(len(chunk))
return pd.concat(results, ignore_index=True)
# Example processor
def processor(chunk):
chunk['value_squared'] = chunk['value'] ** 2
return chunk[chunk['value_squared'] > 100]import pandas as pd
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Alice', 'Carol'],
'age': [25, 30, 25, 28],
'salary': [50000, 60000, 50000, 55000]
})
# Find all duplicates
all_dups = df[df.duplicated(subset=['name'], keep=False)]
# Mark duplicates
df['is_duplicate'] = df.duplicated(subset=['name'], keep=False)
# Keep maximum salary per person
df_deduped = df.sort_values('salary', ascending=False).drop_duplicates('name')
# Custom deduplication logic
def keep_highest_salary(group):
return group.nlargest(1, 'salary')
df_deduped = df.groupby('name', group_keys=False).apply(keep_highest_salary)import pandas as pd
import numpy as np
df = pd.DataFrame({
'A': [1, np.nan, 3, np.nan, 5],
'B': [np.nan, 2, np.nan, 4, 5],
'C': [10, 20, 30, 40, 50]
})
# Forward fill within groups
df['A_filled'] = df['A'].fillna(method='ffill')
# Fill with group mean
df['A_filled'] = df.groupby('group')['A'].transform(
lambda x: x.fillna(x.mean())
)
# Interpolate time series
df['A_interpolated'] = df['A'].interpolate(method='linear')
# Fill with most common value
df['A_filled'] = df['A'].fillna(df['A'].mode()[0])
# Drop rows where critical column is missing
df_clean = df.dropna(subset=['C'])

Continue: Advanced Data Filtering | Advanced Aggregation
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward