
Advanced Data Validation in Python
Master outlier detection, anomaly detection, and advanced validation techniques.
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Sample data: the value 100 is an obvious outlier among values ~10-15.
df = pd.DataFrame({
    'id': range(1, 11),
    'value': [10, 12, 11, 13, 100, 14, 15, 13, 12, 11]  # 100 is outlier
})

# Method 1: Z-Score.
# NOTE: with only n=10 points, |z| is mathematically bounded by
# (n-1)/sqrt(n) ~= 2.85, so the classic cutoff of 3 can never fire here --
# the value 100 scores z ~= 2.99 and would be missed. Use a lower cutoff
# for small samples.
df['z_score'] = np.abs(zscore(df['value']))
outliers_z = df[df['z_score'] > 2.5]

# Method 2: IQR Method (Tukey's fences) -- flag points outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df[(df['value'] < lower_bound) | (df['value'] > upper_bound)]

# Method 3: Isolation Forest (ML-based). `contamination` is the expected
# outlier fraction; fit_predict returns -1 for outliers and 1 for inliers.
from sklearn.ensemble import IsolationForest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_labels = iso_forest.fit_predict(df[['value']])
df['outlier'] = outlier_labels == -1
import numpy as np

# Synthetic daily temperature series: mean 25, sd 2.
# NOTE(review): np.random is unseeded here, so results differ run to run.
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=30),
    'temperature': np.random.normal(25, 2, 30)
})

# Sudden-change detection: flag day-over-day jumps more than 3 standard
# deviations above the mean absolute change.
df['temp_change'] = df['temperature'].diff().abs()
threshold = df['temp_change'].mean() + 3 * df['temp_change'].std()
anomalies = df[df['temp_change'] > threshold]

# Moving-average deviation: flag points far from their 7-day rolling mean.
df['ma'] = df['temperature'].rolling(window=7).mean()
df['deviation'] = (df['temperature'] - df['ma']).abs()
threshold = df['deviation'].mean() + 2 * df['deviation'].std()
anomalies = df[df['deviation'] > threshold]

# Seasonal decomposition: after removing trend and weekly (period=7)
# seasonality, large residuals indicate anomalies.
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df['temperature'], model='additive', period=7)
df['residual'] = result.resid
anomalies = df[df['residual'].abs() > df['residual'].std() * 2.5]
import numpy as np

df = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan, 5],
    'B': [10, 20, np.nan, 40, 50],
    'C': [100, 200, 300, np.nan, 500],
    'group': ['X', 'X', 'Y', 'Y', 'Y']
})

# Multiple imputation strategies
# Strategy 1: forward fill. fillna(method='ffill') is deprecated since
# pandas 2.1 -- use the dedicated .ffill() method instead.
df['A_ffill'] = df['A'].ffill()

# Strategy 2: group-wise mean -- fill each NaN with the mean of its group.
df['A_group_mean'] = df.groupby('group')['A'].transform(
    lambda x: x.fillna(x.mean())
)

# Strategy 3: KNN imputation -- fill each NaN from its k nearest rows.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[['A', 'B', 'C']]),
    columns=['A', 'B', 'C']
)

# Strategy 4: iterative imputation -- models each feature from the others.
# The enable_iterative_imputer import must come first: the API is experimental.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[['A', 'B', 'C']]),
    columns=['A', 'B', 'C']
)

# Strategy 5: MICE (Multiple Imputation by Chained Equations)
# Installation: pip install miceforest
import miceforest as mf
kernel = mf.ImputationKernel(df[['A', 'B', 'C']], datasets=5, save_all_iterations=False)
kernel.mice(iterations=10)
df_imputed = kernel.complete_data()
from pandera import Column, DataFrameSchema, Check, Index

# Declarative schema: each column gets a dtype plus one or more value checks.
schema = DataFrameSchema({
    'id': Column(int, checks=Check.greater_than(0)),
    'email': Column(str, checks=Check.str_matches(r'^[\w\.-]+@[\w\.-]+\.\w+$')),
    'age': Column(int, checks=[Check.greater_than_or_equal_to(0),
                               Check.less_than_or_equal_to(150)]),
    'score': Column(float, checks=Check.between(0, 100))
})

# Sample frame. Note: age=150 still PASSES (the check is <= 150), so this
# particular data validates cleanly.
df = pd.DataFrame({
    'id': [1, 2, 3],
    'email': ['alice@test.com', 'bob@test.com', 'carol@test.com'],
    'age': [25, 30, 150],
    'score': [85.5, 92.3, 88.1]
})

# Validate; schema.validate raises on the first failing column/check.
try:
    validated_df = schema.validate(df)
except Exception as e:
    print(f"Validation error: {e}")
df = pd.DataFrame({
    'name': ['John Smith', 'john smith', 'John Smith', 'Jane Doe'],
    'email': ['john@test.com', 'john@test.com', 'john@test.com', 'jane@test.com'],
    'phone': ['555-1234', '5551234', '555-5678', '555-5678']
})

# Fuzzy matching for similar names using the stdlib difflib.
from difflib import SequenceMatcher

def are_similar(a, b, ratio=0.8):
    """Return True if a and b are at least `ratio` similar, case-insensitively."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio() >= ratio

# Identify potential duplicates: similar names AND identical emails.
# itertools.combinations yields each unordered row pair exactly once,
# replacing the original nested iterrows() loops with an `i < j` guard.
from itertools import combinations
potential_dups = [
    (i, j)
    for (i, row1), (j, row2) in combinations(df.iterrows(), 2)
    if are_similar(row1['name'], row2['name']) and row1['email'] == row2['email']
]
print(f"Potential duplicate pairs: {potential_dups}")

# Using a fuzzy matching library.
# NOTE(review): fuzzywuzzy is unmaintained; its drop-in successor is
# `thefuzz` (same API) -- consider migrating.
from fuzzywuzzy import fuzz

def fuzzy_duplicate_detection(df, column, threshold=90):
    """Return (i, j, ratio) tuples for row pairs whose `column` values
    have a token-sort similarity strictly greater than `threshold`."""
    duplicates = []
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            ratio = fuzz.token_sort_ratio(
                df.iloc[i][column],
                df.iloc[j][column]
            )
            if ratio > threshold:
                duplicates.append((i, j, ratio))
    return duplicates

dups = fuzzy_duplicate_detection(df, 'name')
import numpy as np

df = pd.DataFrame({
    'amount': ['$1,000.50', '$2000', '$3,500.99'],
    'date': ['01/15/2024', '1-15-2024', '2024-01-15'],
    'category': ['SALES', 'sales', 'Sales']
})

def parse_currency(val):
    """Convert a currency string like '$1,000.50' to a float; NaN passes through."""
    if pd.isna(val):
        return np.nan
    return float(str(val).replace('$', '').replace(',', ''))

df['amount'] = df['amount'].apply(parse_currency)

# Flexible date parsing. infer_datetime_format is deprecated (and a no-op)
# in pandas >= 2.0, and heterogeneous formats raise without format='mixed',
# which parses each element independently.
df['date'] = pd.to_datetime(df['date'], format='mixed')

# Standardize categories before validating against the allowed set.
df['category'] = df['category'].str.strip().str.upper()

# Enum validation: drop rows whose category is not in the allowed list.
valid_categories = ['SALES', 'MARKETING', 'OPERATIONS']
df = df[df['category'].isin(valid_categories)]
def generate_quality_report(df):
    """Generate a comprehensive data-quality report for a DataFrame.

    Returns a dict with:
      - 'Total Records', 'Total Columns', 'Memory Usage (MB)'
      - 'Column Stats': per-column dtype, null/non-null counts, null %,
        cardinality (object columns only) and min/max/mean (numeric only).
    """
    report = {
        'Total Records': len(df),
        'Total Columns': len(df.columns),
        'Memory Usage (MB)': df.memory_usage(deep=True).sum() / 1024**2,
        'Column Stats': {}
    }
    n = len(df)
    for col in df.columns:
        s = df[col]
        # is_numeric_dtype generalizes the original hard-coded
        # ['int64', 'float64'] check to int32/float32/nullable dtypes etc.
        numeric = pd.api.types.is_numeric_dtype(s)
        null_count = s.isna().sum()
        report['Column Stats'][col] = {
            'Type': str(s.dtype),
            'Non-Null Count': s.notna().sum(),
            'Null Count': null_count,
            # Guard against division by zero on an empty frame.
            'Null %': f"{(null_count / n * 100):.2f}%" if n else "0.00%",
            'Unique': s.nunique() if s.dtype == 'object' else None,
            'Min': s.min() if numeric else None,
            'Max': s.max() if numeric else None,
            'Mean': s.mean() if numeric else None,
        }
    return report
# Demo: run the quality report on a small frame with missing values
# and pretty-print the per-column statistics.
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'name': ['A', 'B', 'C', None, 'E'],
    'value': [10, 20, None, 40, 50]
})
report = generate_quality_report(df)
for col, stats in report['Column Stats'].items():
    print(f"\n{col}:")
    for k, v in stats.items():
        print(f" {k}: {v}")
# Continue: Advanced NumPy | Advanced Visualization
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward