Ojasa Mirai

Python

Learning Level

String Basics & Creation String Indexing & Slicing Common String Methods String Formatting Essentials Finding & Replacing Text String Testing & Validation Regular Expression Basics Text Splitting & Joining Practical String Projects

Python/String Manipulation/String Testing Validation

✅ String Testing & Validation — Advanced Techniques

Master Unicode categories, fuzzy matching, and building robust validators.

🎯 Unicode Category Testing

import unicodedata

def test_unicode_category(text):
    """Print a one-line Unicode report for every character of *text*.

    Each line has the form ``<char>: <category> - <name>``, where the
    category is the two-letter code from ``unicodedata.category`` and the
    name comes from ``unicodedata.name``. Characters that have no name are
    reported as ``UNKNOWN`` instead of raising. Nothing is returned.

    NOTE(review): despite the ``test_`` prefix this is a demo helper, not a
    unit test; test runners that collect ``test_*`` functions may pick it up.
    """
    for char in text:
        # Look both properties up together, then emit the report line.
        category, name = unicodedata.category(char), unicodedata.name(char, "UNKNOWN")
        print(f"{char}: {category} - {name}")

test_unicode_category("Hello123!你好")
# One line is printed per character; selected lines of the output:
# H: Lu - LATIN CAPITAL LETTER H
# e: Ll - LATIN SMALL LETTER E
# 1: Nd - DIGIT ONE
# !: Po - EXCLAMATION MARK
# 你: Lo - CJK UNIFIED IDEOGRAPH-4F60

# Filter by category
def filter_by_category(text, category):
    """Return the characters of *text* whose Unicode category equals *category*.

    *category* is compared for exact equality with the two-letter code
    returned by ``unicodedata.category`` (for example ``"Lu"`` or ``"Nd"``).
    The kept characters stay in their original order and are joined into
    one string; an empty string is returned when nothing matches.
    """
    kept = []
    for symbol in text:
        if unicodedata.category(symbol) == category:
            kept.append(symbol)
    return "".join(kept)

text = "Hello123World!你好"
print(filter_by_category(text, "Lu"))   # HW (uppercase letters)
print(filter_by_category(text, "Nd"))   # 123 (decimal digits)
print(filter_by_category(text, "Lo"))   # 你好 ("other" letters, i.e. letters without upper/lower case)

💡 Fuzzy String Matching

from difflib import SequenceMatcher, get_close_matches
import re

def similarity_ratio(str1, str2):
    """Return how alike *str1* and *str2* are as a float.

    The value is whatever ``difflib.SequenceMatcher.ratio`` reports for the
    two inputs, compared with no junk function.
    """
    return SequenceMatcher(None, str1, str2).ratio()

# Simple matching
str1 = "hello"
str2 = "hallo"
# The `.2%` format spec renders the ratio as a percentage with two decimals.
print(f"Similarity: {similarity_ratio(str1, str2):.2%}")
# Similarity: 80.00%

# Find close matches
word = "speling"
# Keep the candidates unique: every list entry is scored on its own, so a
# repeated word would occupy several of the n result slots.
words = ["spelling", "selling", "smelling"]
# n caps the number of results; cutoff discards candidates scoring below 0.6.
matches = get_close_matches(word, words, n=2, cutoff=0.6)
print(matches)  # ['spelling', 'selling']

# Levenshtein distance
def levenshtein_distance(str1, str2):
    """Return the edit distance between *str1* and *str2*.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn one string into the other. Only
    two rows of the dynamic-programming table are kept at any time.
    """
    # Put the shorter string on the inner (column) axis.
    if len(str1) >= len(str2):
        longer, shorter = str1, str2
    else:
        longer, shorter = str2, str1

    if len(shorter) == 0:
        # Everything in the longer string has to be inserted/deleted.
        return len(longer)

    above = range(len(shorter) + 1)
    for row_number, long_char in enumerate(longer, start=1):
        row = [row_number]
        for col, short_char in enumerate(shorter):
            row.append(min(
                above[col + 1] + 1,                   # step down from the row above
                row[col] + 1,                         # step right within this row
                above[col] + (long_char != short_char),  # diagonal: free when equal
            ))
        above = row

    return above[-1]

print(levenshtein_distance("kitten", "sitting"))  # 3 (k->s, e->i, insert g)

🎨 Advanced Validation

import re
from functools import wraps

# Email validator (simplified approximation of RFC 5322, not a full implementation)
def is_valid_email(email):
    """Return True when *email* has the shape ``local@domain.tld``.

    This is a pragmatic format check, not a full address parser:
    the local part may contain letters, digits and ``._%+-``; the domain
    may contain letters, digits, dots and hyphens; and the final label must
    be at least two letters.

    The entire string has to match. ``re.fullmatch`` is used rather than
    ``re.match`` because ``$`` on its own also matches just before a
    trailing newline, which would let an address followed by a newline
    pass validation.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return re.fullmatch(pattern, email) is not None

# URL validator
def is_valid_url(url):
    """Return True when *url* looks like an ``http://`` or ``https://`` URL.

    After the scheme, the first character must not be whitespace or one of
    ``/ $ . ? #``, at least one more character must follow, and no
    whitespace is allowed anywhere in the URL.

    Two holes of the naive pattern are closed here:

    * the second character after the scheme is ``\\S`` instead of ``.``,
      so a space or tab in that position is rejected;
    * ``re.fullmatch`` is used so that a trailing newline (which ``$``
      alone tolerates) is rejected.
    """
    pattern = r'^https?://[^\s/$.?#]\S[^\s]*$'
    return re.fullmatch(pattern, url) is not None

# Strong password validator
def validate_password(password, *, min_length=12):
    """Check *password* against a set of strength rules.

    Rules, in the order their messages are reported:

    1. at least *min_length* characters (default 12);
    2. at least one lowercase letter ``a-z``;
    3. at least one uppercase letter ``A-Z``;
    4. at least one digit;
    5. at least one special character, meaning any character that is
       neither a letter, a digit nor whitespace (underscore counts as
       special). This covers symbols such as ``-``, ``_``, ``+`` and
       ``=`` in addition to ``!@#$%^&*(),.?":{}|<>``.

    Args:
        password: The candidate password string.
        min_length: Minimum number of characters required (keyword-only).

    Returns:
        A dict ``{"valid": bool, "issues": list[str]}``; ``issues`` holds
        one message per failed rule and is empty when the password passes.
    """
    # (rule satisfied?, message used when it is not); list order fixes the
    # order of the reported issues.
    checks = [
        (len(password) >= min_length, f"At least {min_length} characters required"),
        (re.search(r'[a-z]', password), "Include lowercase letters"),
        (re.search(r'[A-Z]', password), "Include uppercase letters"),
        (re.search(r'\d', password), "Include numbers"),
        # \w covers letters, digits and "_", so "_" is re-admitted explicitly.
        (re.search(r'[^\w\s]|_', password), "Include special characters"),
    ]
    issues = [message for passed, message in checks if not passed]

    return {
        "valid": not issues,
        "issues": issues
    }

# Validator decorator
def validate(validator_func):
    """Create a decorator that checks a function's first argument.

    The decorated function is called normally when
    ``validator_func(value)`` is truthy for its first positional argument;
    otherwise ``ValueError("Invalid input: <value>")`` is raised and the
    function is never entered. Any further positional or keyword arguments
    are forwarded untouched, and ``functools.wraps`` keeps the wrapped
    function's metadata.
    """
    def attach_guard(func):
        @wraps(func)
        def guarded(value, *args, **kwargs):
            if validator_func(value):
                return func(value, *args, **kwargs)
            raise ValueError(f"Invalid input: {value}")
        return guarded
    return attach_guard

@validate(is_valid_email)
def send_email(email):
    """Return a confirmation message for *email*.

    The ``@validate(is_valid_email)`` decorator checks the address first
    and raises ``ValueError`` when it is rejected, so this body only runs
    for addresses that passed ``is_valid_email``. No mail is actually
    sent; the function just builds the message string.
    """
    return f"Email sent to {email}"

try:
    print(send_email("user@example.com"))  # Email sent to user@example.com
    print(send_email("invalid-email"))  # rejected: ValueError is raised, nothing is printed here
except ValueError as e:
    print(e)  # Invalid input: invalid-email

📊 Custom Validation Classes

class StringValidator:
    """Fluent validator that accumulates error messages for one string.

    Every check method returns ``self`` so that checks can be chained;
    failed checks append a message to ``errors`` in call order.
    """

    def __init__(self, value):
        self.value = value
        self.errors = []

    def _record_if(self, failed, message):
        """Append *message* when *failed* is truthy; return self for chaining."""
        if failed:
            self.errors.append(message)
        return self

    def min_length(self, length):
        """Require the value to have at least *length* characters."""
        too_short = len(self.value) < length
        return self._record_if(too_short, f"Must be at least {length} characters")

    def max_length(self, length):
        """Require the value to have at most *length* characters."""
        too_long = len(self.value) > length
        return self._record_if(too_long, f"Must be at most {length} characters")

    def match_pattern(self, pattern, message="Invalid format"):
        """Require ``re.match(pattern, value)`` to succeed, else record *message*."""
        matched = re.match(pattern, self.value)
        return self._record_if(not matched, message)

    def no_spaces(self):
        """Reject values containing a space character."""
        return self._record_if(" " in self.value, "No spaces allowed")

    def is_valid(self):
        """Return True when no check has recorded an error."""
        return not self.errors

    def get_errors(self):
        """Return the list of recorded error messages."""
        return self.errors

# Usage
# Each check returns the validator itself, so the calls can be chained.
validator = (StringValidator("admin123")
             .min_length(6)
             .max_length(20)
             .no_spaces()
             .match_pattern(r'^[a-zA-Z0-9_]+$', "Only alphanumeric and underscore"))

print(f"Valid: {validator.is_valid()}")  # Valid: True
print(f"Errors: {validator.get_errors()}")  # Errors: []

💡 Phonetic Matching

import re

def metaphone(word):
    """Return a rough phonetic key for *word* (simplified Metaphone).

    Steps, in order:

    1. upper-case the word;
    2. drop everything that is not ``A-Z``;
    3. collapse runs of the same letter to a single letter;
    4. apply a handful of spelling rules (leading ``KN`` -> ``N``,
       leading ``WR`` -> ``R``, ``TCH`` -> ``CH``, ``DG`` -> ``G``, and a
       leading ``H`` followed by one of ``AEIOUWY`` -> ``H``).

    Non-letters are removed *before* duplicates are collapsed so that
    punctuation cannot hide a doubled letter: ``"hel-lo"`` and ``"hello"``
    yield the same key.
    """
    word = word.upper()

    # Remove non-alpha first, so letters separated only by punctuation or
    # digits become adjacent ...
    word = re.sub(r'[^A-Z]', '', word)

    # ... and are then collapsed together with ordinary doubled letters.
    word = re.sub(r'(.)\1+', r'\1', word)

    # Spelling rules, applied one after another in this order.
    replacements = [
        (r'^KN', 'N'),
        (r'^WR', 'R'),
        (r'TCH', 'CH'),
        (r'DG', 'G'),
        (r'^H[AEIOUWY]', 'H'),
    ]

    for pattern, replacement in replacements:
        word = re.sub(pattern, replacement, word)

    return word

print(metaphone("knight"))  # NIGHT (the leading KN is reduced to N)
print(metaphone("knight") == metaphone("night"))  # True

🔑 Key Takeaways

Concept	Remember
Unicode categories	Use `unicodedata.category()` for multilingual support
Fuzzy matching	Use difflib for approximate string matching
Levenshtein distance	Measure edit distance between strings
Validation decorators	Use for clean, reusable validation code
Fluent builders	Chain validation methods for readability

🔗 What's Next?

Learn advanced regular expressions.

Ready to practice? Challenges | Quiz

Resources

Python Docs

Ojasa Mirai

Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.

Learn Deep • Build Real • Verify Skills • Launch Forward

Courses

Python FastAPI ReactJS Cloud

Resources

Blog & Articles GitHub Projects Video Tutorials

Ecosystem

Ojasa Mirai Site My Growth Learning Portal Community Discord

Twitter GitHub LinkedIn