
Advanced Data Validation in Python
Master outlier detection, anomaly detection, and advanced validation techniques.
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Sample data: the value 100 is an obvious outlier among values ~10-15.
df = pd.DataFrame({
    'id': range(1, 11),
    'value': [10, 12, 11, 13, 100, 14, 15, 13, 12, 11]  # 100 is outlier
})

# Method 1: Z-Score.
# NOTE: with only n=10 points, |z| is mathematically bounded by
# (n-1)/sqrt(n) ~= 2.85, so the classic cutoff of 3 can never fire here --
# the value 100 scores z ~= 2.99 and would be missed. Use a lower cutoff
# for small samples.
df['z_score'] = np.abs(zscore(df['value']))
outliers_z = df[df['z_score'] > 2.5]

# Method 2: IQR Method (Tukey's fences) -- flag points outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = df['value'].quantile(0.25)
Q3 = df['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = df[(df['value'] < lower_bound) | (df['value'] > upper_bound)]

# Method 3: Isolation Forest (ML-based). `contamination` is the expected
# outlier fraction; fit_predict returns -1 for outliers and 1 for inliers.
from sklearn.ensemble import IsolationForest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_labels = iso_forest.fit_predict(df[['value']])
df['outlier'] = outlier_labels == -1
import numpy as np

# Synthetic daily temperature series: mean 25, sd 2.
# NOTE(review): np.random is unseeded here, so results differ run to run.
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=30),
    'temperature': np.random.normal(25, 2, 30)
})

# Sudden-change detection: flag day-over-day jumps more than 3 standard
# deviations above the mean absolute change.
df['temp_change'] = df['temperature'].diff().abs()
threshold = df['temp_change'].mean() + 3 * df['temp_change'].std()
anomalies = df[df['temp_change'] > threshold]

# Moving-average deviation: flag points far from their 7-day rolling mean.
df['ma'] = df['temperature'].rolling(window=7).mean()
df['deviation'] = (df['temperature'] - df['ma']).abs()
threshold = df['deviation'].mean() + 2 * df['deviation'].std()
anomalies = df[df['deviation'] > threshold]

# Seasonal decomposition: after removing trend and weekly (period=7)
# seasonality, large residuals indicate anomalies.
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df['temperature'], model='additive', period=7)
df['residual'] = result.resid
anomalies = df[df['residual'].abs() > df['residual'].std() * 2.5]
import numpy as np

df = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan, 5],
    'B': [10, 20, np.nan, 40, 50],
    'C': [100, 200, 300, np.nan, 500],
    'group': ['X', 'X', 'Y', 'Y', 'Y']
})

# Multiple imputation strategies
# Strategy 1: forward fill. fillna(method='ffill') is deprecated since
# pandas 2.1 -- use the dedicated .ffill() method instead.
df['A_ffill'] = df['A'].ffill()

# Strategy 2: group-wise mean -- fill each NaN with the mean of its group.
df['A_group_mean'] = df.groupby('group')['A'].transform(
    lambda x: x.fillna(x.mean())
)

# Strategy 3: KNN imputation -- fill each NaN from its k nearest rows.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[['A', 'B', 'C']]),
    columns=['A', 'B', 'C']
)

# Strategy 4: iterative imputation -- models each feature from the others.
# The enable_iterative_imputer import must come first: the API is experimental.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[['A', 'B', 'C']]),
    columns=['A', 'B', 'C']
)

# Strategy 5: MICE (Multiple Imputation by Chained Equations)
# Installation: pip install miceforest
import miceforest as mf
kernel = mf.ImputationKernel(df[['A', 'B', 'C']], datasets=5, save_all_iterations=False)
kernel.mice(iterations=10)
df_imputed = kernel.complete_data()
from pandera import Column, DataFrameSchema, Check, Index

# Declarative schema: each column gets a dtype plus one or more value checks.
schema = DataFrameSchema({
    'id': Column(int, checks=Check.greater_than(0)),
    'email': Column(str, checks=Check.str_matches(r'^[\w\.-]+@[\w\.-]+\.\w+$')),
    'age': Column(int, checks=[Check.greater_than_or_equal_to(0),
                               Check.less_than_or_equal_to(150)]),
    'score': Column(float, checks=Check.between(0, 100))
})

# Sample frame. Note: age=150 still PASSES (the check is <= 150), so this
# particular data validates cleanly.
df = pd.DataFrame({
    'id': [1, 2, 3],
    'email': ['alice@test.com', 'bob@test.com', 'carol@test.com'],
    'age': [25, 30, 150],
    'score': [85.5, 92.3, 88.1]
})

# Validate; schema.validate raises on the first failing column/check.
try:
    validated_df = schema.validate(df)
except Exception as e:
    print(f"Validation error: {e}")
df = pd.DataFrame({
    'name': ['John Smith', 'john smith', 'John Smith', 'Jane Doe'],
    'email': ['john@test.com', 'john@test.com', 'john@test.com', 'jane@test.com'],
    'phone': ['555-1234', '5551234', '555-5678', '555-5678']
})

# Fuzzy matching for similar names using the stdlib difflib.
from difflib import SequenceMatcher

def are_similar(a, b, ratio=0.8):
    """Return True if a and b are at least `ratio` similar, case-insensitively."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio() >= ratio

# Identify potential duplicates: similar names AND identical emails.
# itertools.combinations yields each unordered row pair exactly once,
# replacing the original nested iterrows() loops with an `i < j` guard.
from itertools import combinations
potential_dups = [
    (i, j)
    for (i, row1), (j, row2) in combinations(df.iterrows(), 2)
    if are_similar(row1['name'], row2['name']) and row1['email'] == row2['email']
]
print(f"Potential duplicate pairs: {potential_dups}")

# Using a fuzzy matching library.
# NOTE(review): fuzzywuzzy is unmaintained; its drop-in successor is
# `thefuzz` (same API) -- consider migrating.
from fuzzywuzzy import fuzz

def fuzzy_duplicate_detection(df, column, threshold=90):
    """Return (i, j, ratio) tuples for row pairs whose `column` values
    have a token-sort similarity strictly greater than `threshold`."""
    duplicates = []
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            ratio = fuzz.token_sort_ratio(
                df.iloc[i][column],
                df.iloc[j][column]
            )
            if ratio > threshold:
                duplicates.append((i, j, ratio))
    return duplicates

dups = fuzzy_duplicate_detection(df, 'name')
import numpy as np

df = pd.DataFrame({
    'amount': ['$1,000.50', '$2000', '$3,500.99'],
    'date': ['01/15/2024', '1-15-2024', '2024-01-15'],
    'category': ['SALES', 'sales', 'Sales']
})

def parse_currency(val):
    """Convert a currency string like '$1,000.50' to a float; NaN passes through."""
    if pd.isna(val):
        return np.nan
    return float(str(val).replace('$', '').replace(',', ''))

df['amount'] = df['amount'].apply(parse_currency)

# Flexible date parsing. infer_datetime_format is deprecated (and a no-op)
# in pandas >= 2.0, and heterogeneous formats raise without format='mixed',
# which parses each element independently.
df['date'] = pd.to_datetime(df['date'], format='mixed')

# Standardize categories before validating against the allowed set.
df['category'] = df['category'].str.strip().str.upper()

# Enum validation: drop rows whose category is not in the allowed list.
valid_categories = ['SALES', 'MARKETING', 'OPERATIONS']
df = df[df['category'].isin(valid_categories)]
def generate_quality_report(df):
    """Generate a comprehensive data-quality report for a DataFrame.

    Returns a dict with:
      - 'Total Records', 'Total Columns', 'Memory Usage (MB)'
      - 'Column Stats': per-column dtype, null/non-null counts, null %,
        cardinality (object columns only) and min/max/mean (numeric only).
    """
    report = {
        'Total Records': len(df),
        'Total Columns': len(df.columns),
        'Memory Usage (MB)': df.memory_usage(deep=True).sum() / 1024**2,
        'Column Stats': {}
    }
    n = len(df)
    for col in df.columns:
        s = df[col]
        # is_numeric_dtype generalizes the original hard-coded
        # ['int64', 'float64'] check to int32/float32/nullable dtypes etc.
        numeric = pd.api.types.is_numeric_dtype(s)
        null_count = s.isna().sum()
        report['Column Stats'][col] = {
            'Type': str(s.dtype),
            'Non-Null Count': s.notna().sum(),
            'Null Count': null_count,
            # Guard against division by zero on an empty frame.
            'Null %': f"{(null_count / n * 100):.2f}%" if n else "0.00%",
            'Unique': s.nunique() if s.dtype == 'object' else None,
            'Min': s.min() if numeric else None,
            'Max': s.max() if numeric else None,
            'Mean': s.mean() if numeric else None,
        }
    return report
# Demo: run the quality report on a small frame with missing values
# and pretty-print the per-column statistics.
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'name': ['A', 'B', 'C', None, 'E'],
    'value': [10, 20, None, 40, 50]
})
report = generate_quality_report(df)
for col, stats in report['Column Stats'].items():
    print(f"\n{col}:")
    for k, v in stats.items():
        print(f" {k}: {v}")
# Continue: Advanced NumPy | Advanced Visualization
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward