
# Advanced Text Processing in Python

Build sophisticated text processing systems for real-world applications.

Parse and analyze application logs:
import re
from datetime import datetime
from collections import Counter
class LogParser:
    """Parse structured application log text into dictionaries.

    Each matching line yields a dict with 'timestamp', 'level', 'module'
    and 'message' keys; non-matching and blank lines are skipped.
    """

    # Named groups give each parsed field a readable key in groupdict().
    LOG_PATTERN = r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<module>\w+): (?P<message>.*)'

    def __init__(self, log_content):
        self.log_content = log_content
        self.entries = []
        self.parse()

    def parse(self):
        """Parse log entries from the stored content into self.entries."""
        for raw_line in self.log_content.split('\n'):
            if not raw_line.strip():
                continue  # skip blank lines
            hit = re.match(self.LOG_PATTERN, raw_line)
            if hit:
                self.entries.append(hit.groupdict())

    def filter_by_level(self, level):
        """Return the entries whose level equals *level* (exact match)."""
        return [entry for entry in self.entries if entry['level'] == level]

    def get_stats(self):
        """Return counts of entries overall, per level and per module."""
        return {
            'total_entries': len(self.entries),
            'by_level': dict(Counter(entry['level'] for entry in self.entries)),
            'by_module': dict(Counter(entry['module'] for entry in self.entries)),
        }

    def search(self, pattern):
        """Return entries whose message matches *pattern*, case-insensitively."""
        finder = re.compile(pattern, re.IGNORECASE).search
        return [entry for entry in self.entries if finder(entry['message'])]
# Usage: parse a small in-memory sample log and print aggregate statistics.
log_content = """2026-02-20 10:30:45 [INFO] database: Connection established
2026-02-20 10:30:46 [ERROR] api: Request timeout
2026-02-20 10:30:47 [WARNING] cache: Memory usage high
2026-02-20 10:30:48 [INFO] database: Query executed"""

parser = LogParser(log_content)
stats = parser.get_stats()
print(stats)
print(parser.filter_by_level('ERROR'))

Extract structured data from text:
import re
import json
from html.parser import HTMLParser
class HTMLDataExtractor(HTMLParser):
    """Extract text content from HTML, grouped by tag name.

    ``self.data`` maps a tag name to the list of stripped text chunks that
    appeared directly before that tag's closing tag.  This is a simplified
    extractor: text preceding a nested child's opening tag is discarded
    (each start tag resets the accumulator), and self-closing tags are not
    handled.
    """

    def __init__(self):
        super().__init__()
        self.data = {}              # tag name -> list of stripped text chunks
        self.current_tag = None     # most recently opened tag name
        self.current_content = ""   # text accumulated since the last tag boundary

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        self.current_content = ""

    def handle_data(self, data):
        self.current_content += data

    def handle_endtag(self, tag):
        if self.current_content.strip():
            self.data.setdefault(tag, []).append(self.current_content.strip())
        # Fix: reset the accumulator so text already credited to this tag is
        # not double-counted by an enclosing tag's end handler (previously
        # "<div>x<b>y</b>z</div>" recorded "yz" for div, counting "y" twice).
        self.current_content = ""
# JSON data extraction
def extract_json_values(json_str, path):
    """Extract a value from a JSON string using dot notation.

    *path* is a dot-separated key path; a segment may carry a numeric index
    such as ``"emails[0]"`` to step into a list.  Returns the resolved value,
    or ``None`` when the JSON is invalid or the path does not resolve.

    Fixes over the original: the ``name[i]`` branch was only tried after
    ``data`` had already become a list, so indexed paths like
    ``"user.emails[0]"`` could never resolve; the bare ``except:`` that
    masked this is replaced with targeted exception handling.
    """
    try:
        data = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        return None  # invalid JSON text (or a non-string argument)
    for key in path.split('.'):
        # "name[3]" -> step into dict key "name", then list index 3.
        indexed = re.fullmatch(r'(\w+)\[(\d+)\]', key)
        if indexed:
            name, index = indexed.group(1), int(indexed.group(2))
            if not isinstance(data, dict):
                return None
            data = data.get(name)
            if not isinstance(data, list) or index >= len(data):
                return None
            data = data[index]
        elif isinstance(data, dict):
            data = data.get(key)  # None for a missing key; caught next turn
        else:
            return None  # path continues but current value is not traversable
    return data
# Usage: pull a nested value out of a JSON document by path.
json_data = '{"user": {"name": "John", "emails": ["john@example.com"]}}'
user_name = extract_json_values(json_data, "user.name")
print(user_name)
# John

Clean and normalize text:
import re
import unicodedata
class TextSanitizer:
    """Comprehensive, chainable text sanitization.

    Every mutator returns ``self`` so operations can be chained, e.g.::

        TextSanitizer(text).remove_urls().normalize_whitespace().get()
    """

    def __init__(self, text):
        self.text = text       # working copy, mutated by the methods below
        self.original = text   # untouched input, kept for reference

    def remove_special_characters(self, keep_chars=""):
        """Remove everything except letters, digits, whitespace and *keep_chars*."""
        pattern = r'[^a-zA-Z0-9\s' + re.escape(keep_chars) + r']'
        self.text = re.sub(pattern, '', self.text)
        return self

    def normalize_whitespace(self):
        """Collapse runs of spaces, and runs of newlines, to a single one each."""
        # Replace multiple spaces with a single space (tabs are untouched).
        self.text = re.sub(r' +', ' ', self.text)
        # Replace multiple newlines with a single newline.
        self.text = re.sub(r'\n\n+', '\n', self.text)
        return self

    def remove_diacritics(self):
        """Remove accents and diacritical marks (e.g. 'café' -> 'cafe')."""
        # NFD splits base characters from combining marks (category 'Mn').
        nfd = unicodedata.normalize('NFD', self.text)
        self.text = ''.join(c for c in nfd if unicodedata.category(c) != 'Mn')
        return self

    def normalize_unicode(self):
        """Normalize Unicode to composed (NFC) form."""
        self.text = unicodedata.normalize('NFC', self.text)
        return self

    def remove_urls(self):
        """Remove http(s):// and www. URLs."""
        self.text = re.sub(r'https?://\S+|www\.\S+', '', self.text)
        return self

    def remove_emails(self):
        """Remove email addresses.

        Fix: the TLD character class was ``[A-Z|a-z]``, which wrongly
        accepted a literal ``|`` inside the top-level domain; it is now
        ``[A-Za-z]``.
        """
        self.text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', self.text)
        return self

    def remove_html(self):
        """Strip HTML tags (naive: removes any <...> span)."""
        self.text = re.sub(r'<[^>]+>', '', self.text)
        return self

    def get(self):
        """Return the sanitized text with surrounding whitespace stripped."""
        return self.text.strip()

    def __str__(self):
        return self.get()
# Usage: strip URLs and emails from contact text, then tidy the whitespace.
text = "Check our website: https://example.com or email us at info@example.com\nContact: +1-555-1234"
clean = TextSanitizer(text).remove_urls().remove_emails().normalize_whitespace().get()
print(clean)

Parse configuration files:
import re
from collections import defaultdict
class ConfigParser:
    """Parse simple INI-like configuration text into nested dicts."""

    # Prefixes that mark a whole line as a comment.
    _COMMENT_PREFIXES = ('#', ';')

    def __init__(self):
        # section name -> {key: value}; keys before any [section] header
        # land in the "default" section.
        self.config = defaultdict(dict)

    def parse(self, content):
        """Parse INI-like configuration from *content* into self.config."""
        section = "default"
        for raw in content.split('\n'):
            stripped = raw.strip()
            # Skip blanks and comments.
            if not stripped or stripped.startswith(self._COMMENT_PREFIXES):
                continue
            # A [section] header switches the target section.
            if stripped[0] == '[' and stripped[-1] == ']':
                section = stripped[1:-1]
            elif '=' in stripped:
                key, _, value = stripped.partition('=')
                # Trim whitespace and surrounding single/double quotes.
                self.config[section][key.strip()] = value.strip().strip('"\'')

    def get(self, section, key, default=None):
        """Return one value, or *default* when section/key is absent."""
        section_map = self.config.get(section, {})
        return section_map.get(key, default)

    def get_section(self, section):
        """Return a plain-dict copy of an entire section ({} if absent)."""
        return dict(self.config.get(section, {}))
# Usage: parse a two-section config and read values back out.
config_text = """[database]
host = localhost
port = 5432
user = admin
[api]
debug = true
timeout = 30
"""
parser = ConfigParser()
parser.parse(config_text)
db_host = parser.get("database", "host")
print(db_host)  # localhost
print(parser.get_section("api")) # {'debug': 'true', 'timeout': '30'}

Parse markdown to extract structure:
import re
class MarkdownParser:
    """Extract structure (headings, links, code blocks) from Markdown text."""

    def __init__(self, content):
        self.content = content
        self.headings = []     # list of (hash-marks, heading-text) tuples
        self.links = []        # list of (link-text, url) tuples
        self.code_blocks = []  # raw contents of ``` fenced blocks
        self.parse()

    def parse(self):
        """Run all three extraction passes over the stored content."""
        # ATX headings: one-or-more '#' then the title, per line.
        self.headings = re.findall(r'^(#+)\s+(.+)$', self.content, re.MULTILINE)
        # Inline links: [text](url).
        self.links = re.findall(r'\[([^\]]+)\]\(([^\)]+)\)', self.content)
        # Fenced code blocks; DOTALL lets the body span multiple lines.
        self.code_blocks = re.findall(r'```(.+?)```', self.content, re.DOTALL)

    def get_toc(self):
        """Generate a bullet table of contents, indented by heading depth."""
        return "\n".join(
            f"{'  ' * (len(marks) - 1)}\u2022 {title}"
            for marks, title in self.headings
        )

    def get_links(self):
        """Return all links as {'text': ..., 'url': ...} dicts."""
        return [{"text": text, "url": url} for text, url in self.links]
# Usage: extract the outline and links from a small Markdown sample.
md_content = """# Main Title
## Subsection 1
Check [Google](https://google.com) and [GitHub](https://github.com).
## Subsection 2print("hello")
"""
# NOTE(review): the '## Subsection 2print("hello")' line looks like a code
# sample fused into a heading during copy/paste — confirm the intended text.
parser = MarkdownParser(md_content)
toc = parser.get_toc()
print("Table of Contents:")
print(toc)
print("\nLinks:")
print(parser.get_links())

Compare and highlight differences:
import difflib
import re
class TextDiff:
    """Generate diffs and a similarity score for two pieces of text."""

    def __init__(self, text1, text2):
        self.text1 = text1
        self.text2 = text2

    def get_line_diff(self):
        """Generate a line-by-line unified diff as a single string."""
        lines1 = self.text1.splitlines()
        lines2 = self.text2.splitlines()
        return '\n'.join(difflib.unified_diff(lines1, lines2, lineterm=''))

    def get_word_diff(self):
        """Generate a word-by-word diff ('- removed' / '+ added' lines).

        Fix: despite its name and docstring, the original matched raw
        characters, emitting fragments like '- rown' / '+ lue'.  It now
        compares whitespace-separated words, as documented.
        """
        words1 = self.text1.split()
        words2 = self.text2.split()
        matcher = difflib.SequenceMatcher(None, words1, words2)
        result = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            # A 'replace' emits the removed words first, then the added ones,
            # matching the original's output ordering.
            if tag in ('replace', 'delete'):
                result.append(f"- {' '.join(words1[i1:i2])}")
            if tag in ('replace', 'insert'):
                result.append(f"+ {' '.join(words2[j1:j2])}")
        return '\n'.join(result)

    def similarity(self):
        """Return the character-level similarity ratio in [0, 1] (unchanged)."""
        matcher = difflib.SequenceMatcher(None, self.text1, self.text2)
        return matcher.ratio()
# Usage: compare two nearly identical sentences.
diff = TextDiff("The quick brown fox", "The quick blue fox")
print(diff.get_word_diff())
print(f"Similarity: {diff.similarity():.1%}")

| Concept | Remember |
|---|---|
| Log parsing | Use structured regex with named groups |
| HTML parsing | Use HTMLParser or BeautifulSoup, not regex |
| Sanitization | Chain operations for robust cleaning |
| Configuration | INI-like format is simple and effective |
| Markdown | Simple regex sufficient for basic parsing |
You've mastered advanced string manipulation! Ready for other Python topics?
Ready to practice? Challenges | Quiz
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward