
Python — Master pivot tables, complex reshaping, and advanced transformation techniques.
import pandas as pd
df = pd.DataFrame({
'region': ['North', 'North', 'South', 'South', 'East', 'East'],
'quarter': ['Q1', 'Q2', 'Q1', 'Q2', 'Q1', 'Q2'],
'sales': [100, 120, 150, 180, 90, 110],
'profit': [20, 25, 30, 36, 18, 22]
})
# Basic pivot table
pivot = pd.pivot_table(
df,
values='sales',
index='region',
columns='quarter',
aggfunc='sum'
)
# Multiple aggregations
pivot = pd.pivot_table(
df,
values=['sales', 'profit'],
index='region',
columns='quarter',
aggfunc=['sum', 'mean', 'count']
)
# Custom aggregation
pivot = df.pivot_table(
values='sales',
index='region',
columns='quarter',
aggfunc={'sales': ['sum', 'count']}
)
# Margin totals
pivot = df.pivot_table(
values='sales',
index='region',
columns='quarter',
aggfunc='sum',
margins=True
)import pandas as pd
# Wide to long transformation
df_wide = pd.DataFrame({
'id': [1, 2, 3],
'2022': [100, 150, 120],
'2023': [110, 160, 125],
'2024': [120, 170, 130]
})
df_long = pd.melt(
df_wide,
id_vars=['id'],
var_name='year',
value_name='sales'
)
# Multiple value columns
df_wide = pd.DataFrame({
'id': [1, 2],
'sales_2022': [100, 150],
'profit_2022': [20, 30],
'sales_2023': [110, 160],
'profit_2023': [22, 32]
})
df_long = pd.melt(
df_wide,
id_vars=['id'],
value_vars=['sales_2022', 'profit_2022', 'sales_2023', 'profit_2023']
)
# Extract metric and year from variable name
df_long[['metric', 'year']] = df_long['variable'].str.split('_', expand=True)import pandas as pd
df1 = pd.DataFrame({
'key': ['A', 'B', 'C'],
'value1': [1, 2, 3]
})
df2 = pd.DataFrame({
'key': ['A', 'B', 'D'],
'value2': [10, 20, 40]
})
# Merge with different key columns
df3 = pd.DataFrame({
'id': ['A', 'B', 'C'],
'value3': [100, 200, 300]
})
result = df1.merge(df3, left_on='key', right_on='id', how='inner')
# Merge on multiple keys
df1_multi = pd.DataFrame({
'key1': ['A', 'A', 'B'],
'key2': [1, 2, 1],
'value': [10, 20, 30]
})
df2_multi = pd.DataFrame({
'key1': ['A', 'B'],
'key2': [1, 1],
'value2': [100, 200]
})
result = df1_multi.merge(df2_multi, on=['key1', 'key2'], how='left')
# Asof merge (nearest match)
df_left = pd.DataFrame({
'time': [1, 5, 10],
'value': [10, 20, 30]
})
df_right = pd.DataFrame({
'time': [1, 2, 3, 6, 7],
'price': [100, 105, 110, 120, 125]
})
result = pd.merge_asof(df_left, df_right, on='time', direction='nearest')import pandas as pd
df = pd.DataFrame({
'age_group': ['18-30', '18-30', '30-50', '30-50', '50+'],
'product': ['A', 'B', 'A', 'B', 'A'],
'purchase': [1, 1, 1, 0, 1]
})
# Simple crosstab
crosstab = pd.crosstab(df['age_group'], df['product'])
# Crosstab with values and aggregation
crosstab = pd.crosstab(
df['age_group'],
df['product'],
values=df['purchase'],
aggfunc='sum'
)
# Crosstab with margins
crosstab = pd.crosstab(
df['age_group'],
df['product'],
margins=True
)
# Normalize (percentages)
crosstab_pct = pd.crosstab(
df['age_group'],
df['product'],
normalize='index'
) * 100import pandas as pd
import numpy as np
def optimize_dataframe(df):
"""Optimize memory usage"""
for column in df.columns:
col_type = df[column].dtype
# Optimize integers
if col_type == 'int64':
c_min = df[column].min()
c_max = df[column].max()
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[column] = df[column].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[column] = df[column].astype(np.int16)
# Optimize floats
elif col_type == 'float64':
df[column] = df[column].astype(np.float32)
# Optimize objects (strings)
elif col_type == 'object':
num_unique = len(df[column].unique())
num_total = len(df[column])
if num_unique / num_total < 0.5:
df[column] = df[column].astype('category')
return df
# Usage
df = pd.DataFrame({
'id': range(1000),
'value': np.random.randn(1000),
'category': np.random.choice(['A', 'B', 'C'], 1000)
})
print(df.memory_usage(deep=True).sum() / 1024**2) # MB before
df = optimize_dataframe(df)
print(df.memory_usage(deep=True).sum() / 1024**2) # MB afterimport pandas as pd
from tqdm import tqdm
def process_large_dataframe(filepath, chunk_size=10000, processor=None):
"""Process large DataFrame with progress bar"""
total_rows = sum(1 for line in open(filepath)) - 1
results = []
with tqdm(total=total_rows) as pbar:
for chunk in pd.read_csv(filepath, chunksize=chunk_size):
if processor:
result = processor(chunk)
else:
result = chunk
results.append(result)
pbar.update(len(chunk))
return pd.concat(results, ignore_index=True)
# Example processor
def processor(chunk):
chunk['value_squared'] = chunk['value'] ** 2
return chunk[chunk['value_squared'] > 100]import pandas as pd
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Alice', 'Carol'],
'age': [25, 30, 25, 28],
'salary': [50000, 60000, 50000, 55000]
})
# Find all duplicates
all_dups = df[df.duplicated(subset=['name'], keep=False)]
# Mark duplicates
df['is_duplicate'] = df.duplicated(subset=['name'], keep=False)
# Keep maximum salary per person
df_deduped = df.sort_values('salary', ascending=False).drop_duplicates('name')
# Custom deduplication logic
def keep_highest_salary(group):
return group.nlargest(1, 'salary')
df_deduped = df.groupby('name', group_keys=False).apply(keep_highest_salary)import pandas as pd
import numpy as np
df = pd.DataFrame({
'A': [1, np.nan, 3, np.nan, 5],
'B': [np.nan, 2, np.nan, 4, 5],
'C': [10, 20, 30, 40, 50]
})
# Forward fill within groups
df['A_filled'] = df['A'].fillna(method='ffill')
# Fill with group mean
df['A_filled'] = df.groupby('group')['A'].transform(
lambda x: x.fillna(x.mean())
)
# Interpolate time series
df['A_interpolated'] = df['A'].interpolate(method='linear')
# Fill with most common value
df['A_filled'] = df['A'].fillna(df['A'].mode()[0])
# Drop rows where critical column is missing
df_clean = df.dropna(subset=['C'])

Continue: Advanced Data Filtering | Advanced Aggregation
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward