
Python: Data Cleaning Basics
Learn to identify and fix data quality issues before analysis.
Real-world data often has problems: missing values, duplicates, inconsistencies, and incorrect formats.
import pandas as pd

# A small dataset that deliberately exhibits the usual quality problems.
messy_data = {
    'name': ['Alice', 'Bob', None, 'David', 'Alice'],
    'age': [25, 30, 28, None, 25],
    'salary': [50000, 60000, 55000, 70000, 50000],
    'city': ['new york', 'LONDON', 'paris', 'berlin', 'New York'],
}
df = pd.DataFrame(messy_data)
print(df)
# On display: missing values (None), near-duplicate rows, inconsistent text formatting
import pandas as pd
df = pd.DataFrame({
    'name': ['Alice', 'Bob', None, 'David'],
    'age': [25, 30, 28, None],
    'salary': [50000, None, 55000, 70000],
})

# Locate the missing entries.
print(df.isnull())              # element-wise boolean mask
print(df.isnull().sum())        # missing count for each column
print(df.isnull().sum().sum())  # grand total of missing cells

# The complementary view: which cells are present.
print(df.notnull())
print(df.notnull().sum())

# Share of missing values per column, expressed as a percentage.
print((df.isnull().sum() / len(df)) * 100)
import pandas as pd
df = pd.DataFrame({
    'name': ['Alice', 'Bob', None, 'David'],
    'age': [25, 30, 28, None],
    'salary': [50000, None, 55000, 70000]
})
# Option 1: Drop rows with missing values
df_dropped = df.dropna()
# Option 2: Drop rows if any column is missing
df_dropped = df.dropna(how='any')
# Option 3: Drop rows if all columns are missing
df_dropped = df.dropna(how='all')
# Option 4: Fill missing values
df_filled = df.fillna(0)  # Fill with constant
# numeric_only=True is required here: plain df.mean() raises TypeError
# on the non-numeric 'name' column in pandas >= 2.0.
df_filled = df.fillna(df.mean(numeric_only=True))  # Fill with column means
# fillna(method='ffill'/'bfill') is deprecated (removed in pandas 3.0);
# use the dedicated ffill()/bfill() methods instead.
df_filled = df.ffill()  # Forward fill
df_filled = df.bfill()  # Backward fill
# Option 5: Forward fill then backward fill
df_filled = df.ffill().bfill()
import pandas as pd
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Alice', 'Carol'],
    'age': [25, 30, 25, 28],
    'salary': [50000, 60000, 50000, 55000],
})

# Detect duplicated rows.
print(df.duplicated())                 # True for every repeat of an earlier row
print(df.duplicated(subset=['name']))  # judge duplicates on selected columns only
print(df.duplicated().sum())           # how many duplicate rows exist

# Drop the repeats.
df_unique = df.drop_duplicates()
df_unique = df.drop_duplicates(subset=['name'])  # the first occurrence survives
df_unique = df.drop_duplicates(keep='last')      # or keep the final occurrence instead
import pandas as pd
df = pd.DataFrame({
    'age': ['25', '30', '28'],              # stored as strings, should be int
    'salary': ['50000', '60000', '55000'],  # strings, should be float
    'date': ['2024-01-01', '2024-01-02', '2024-01-03'],  # strings, should be dates
})

# Cast each column to its proper type.
df['age'] = df['age'].astype(int)
df['salary'] = df['salary'].astype(float)
df['date'] = pd.to_datetime(df['date'])
print(df.dtypes)

# errors='coerce' turns unparseable values into NaN instead of raising.
df['age'] = pd.to_numeric(df['age'], errors='coerce')
import pandas as pd
df = pd.DataFrame({
    'city': [' new york ', 'LONDON', 'paris', ' BERLIN '],
    'name': ['ALICE', 'bob', 'Carol', 'DAVID']
})
# Remove whitespace
df['city'] = df['city'].str.strip()
# Convert case
df['city'] = df['city'].str.lower()
df['name'] = df['name'].str.title()
# Replace values
df['city'] = df['city'].str.replace(' ', '_')
# Extract substring; the upper-casing must also go through the .str
# accessor -- plain .upper() on a Series raises AttributeError.
df['city_code'] = df['city'].str[:3].str.upper()
print(df)
import pandas as pd
# Messy customer data
raw_data = {
    'customer_id': ['C001', 'C002', 'C001', 'C003', None],
    'name': ['ALICE SMITH', 'bob johnson', None, 'Carol Davis', 'david lee'],
    'email': ['alice@example.com', 'BOB@EXAMPLE.COM', 'alice@example.com', 'unknown', 'dave@test.com'],
    'age': ['25', '30', '25', 'unknown', '35'],
    'signup_date': ['2024-01-01', '2024-01-02', '2024-01-01', '2024-01-03', '2024-01-04']
}
df = pd.DataFrame(raw_data)
# Step 1: Remove duplicate rows
df = df.drop_duplicates(subset=['customer_id', 'name'])
# Step 2: Remove rows with critical missing values
df = df.dropna(subset=['customer_id'])
# Step 3: Fill optional missing values.
# Assign the result rather than using inplace=True on a column selection:
# chained-assignment inplace is deprecated and silently ineffective
# under pandas copy-on-write (default in pandas 3.0).
df['name'] = df['name'].fillna('Unknown')
df['email'] = df['email'].fillna('noemail@example.com')
# Step 4: Clean age (convert and handle 'unknown')
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['age'] = df['age'].fillna(df['age'].median())
# Step 5: Standardize text fields
df['name'] = df['name'].str.title().str.strip()
df['email'] = df['email'].str.lower().str.strip()
# Step 6: Convert date
df['signup_date'] = pd.to_datetime(df['signup_date'])
print(df)
print(df.dtypes)
print(df.isnull().sum())
import pandas as pd
df = pd.DataFrame({
    'age': [25, 30, 150, 28],  # 150 is unrealistic
    'salary': [50000, 60000, -1000, 70000]  # Negative salary
})
# Check for outliers/invalid values
print(df[(df['age'] < 0) | (df['age'] > 120)])
print(df[df['salary'] < 0])
# Remove invalid rows -- keep exactly the complement of the checks above.
# (The original used strict > 0 / < 120, which also discarded the
# boundary values 0 and 120 that the checks treat as valid.)
df = df[(df['age'] >= 0) & (df['age'] <= 120)]
df = df[df['salary'] >= 0]
| Step | Purpose | Method |
|---|---|---|
| Duplicates | Remove exact duplicates | `drop_duplicates()` |
| Missing | Handle missing values | `dropna()`, `fillna()` |
| Format | Fix data types | `astype()`, `to_datetime()` |
| Text | Standardize text | `.str.lower()`, `.str.strip()` |
| Validation | Check for invalid values | Condition-based filtering |
| Consistency | Ensure consistent format | Replace/mapping operations |
Continue: NumPy Arrays | Data Visualization
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward