
# Advanced Text Processing in Python

Build sophisticated text processing systems for real-world applications.

Parse and analyze application logs:
import re
from datetime import datetime
from collections import Counter
class LogParser:
    """Parse structured application log text into dictionaries.

    Each matching line yields a dict with 'timestamp', 'level', 'module'
    and 'message' keys; non-matching and blank lines are skipped.
    """

    # Named groups give each parsed field a readable key in groupdict().
    LOG_PATTERN = r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<module>\w+): (?P<message>.*)'

    def __init__(self, log_content):
        self.log_content = log_content
        self.entries = []
        self.parse()

    def parse(self):
        """Parse log entries from the stored content into self.entries."""
        for raw_line in self.log_content.split('\n'):
            if not raw_line.strip():
                continue  # skip blank lines
            hit = re.match(self.LOG_PATTERN, raw_line)
            if hit:
                self.entries.append(hit.groupdict())

    def filter_by_level(self, level):
        """Return the entries whose level equals *level* (exact match)."""
        return [entry for entry in self.entries if entry['level'] == level]

    def get_stats(self):
        """Return counts of entries overall, per level and per module."""
        return {
            'total_entries': len(self.entries),
            'by_level': dict(Counter(entry['level'] for entry in self.entries)),
            'by_module': dict(Counter(entry['module'] for entry in self.entries)),
        }

    def search(self, pattern):
        """Return entries whose message matches *pattern*, case-insensitively."""
        finder = re.compile(pattern, re.IGNORECASE).search
        return [entry for entry in self.entries if finder(entry['message'])]
# Usage: parse a small in-memory sample log and print aggregate statistics.
log_content = """2026-02-20 10:30:45 [INFO] database: Connection established
2026-02-20 10:30:46 [ERROR] api: Request timeout
2026-02-20 10:30:47 [WARNING] cache: Memory usage high
2026-02-20 10:30:48 [INFO] database: Query executed"""

parser = LogParser(log_content)
stats = parser.get_stats()
print(stats)
print(parser.filter_by_level('ERROR'))

Extract structured data from text:
import re
import json
from html.parser import HTMLParser
class HTMLDataExtractor(HTMLParser):
    """Extract text content from HTML, grouped by tag name.

    ``self.data`` maps a tag name to the list of stripped text chunks that
    appeared directly before that tag's closing tag.  This is a simplified
    extractor: text preceding a nested child's opening tag is discarded
    (each start tag resets the accumulator), and self-closing tags are not
    handled.
    """

    def __init__(self):
        super().__init__()
        self.data = {}              # tag name -> list of stripped text chunks
        self.current_tag = None     # most recently opened tag name
        self.current_content = ""   # text accumulated since the last tag boundary

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        self.current_content = ""

    def handle_data(self, data):
        self.current_content += data

    def handle_endtag(self, tag):
        if self.current_content.strip():
            self.data.setdefault(tag, []).append(self.current_content.strip())
        # Fix: reset the accumulator so text already credited to this tag is
        # not double-counted by an enclosing tag's end handler (previously
        # "<div>x<b>y</b>z</div>" recorded "yz" for div, counting "y" twice).
        self.current_content = ""
# JSON data extraction
def extract_json_values(json_str, path):
    """Extract a value from a JSON string using dot notation.

    *path* is a dot-separated key path; a segment may carry a numeric index
    such as ``"emails[0]"`` to step into a list.  Returns the resolved value,
    or ``None`` when the JSON is invalid or the path does not resolve.

    Fixes over the original: the ``name[i]`` branch was only tried after
    ``data`` had already become a list, so indexed paths like
    ``"user.emails[0]"`` could never resolve; the bare ``except:`` that
    masked this is replaced with targeted exception handling.
    """
    try:
        data = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        return None  # invalid JSON text (or a non-string argument)
    for key in path.split('.'):
        # "name[3]" -> step into dict key "name", then list index 3.
        indexed = re.fullmatch(r'(\w+)\[(\d+)\]', key)
        if indexed:
            name, index = indexed.group(1), int(indexed.group(2))
            if not isinstance(data, dict):
                return None
            data = data.get(name)
            if not isinstance(data, list) or index >= len(data):
                return None
            data = data[index]
        elif isinstance(data, dict):
            data = data.get(key)  # None for a missing key; caught next turn
        else:
            return None  # path continues but current value is not traversable
    return data
# Usage: pull a nested value out of a JSON document by path.
json_data = '{"user": {"name": "John", "emails": ["john@example.com"]}}'
user_name = extract_json_values(json_data, "user.name")
print(user_name)
# John

Clean and normalize text:
import re
import unicodedata
class TextSanitizer:
    """Comprehensive, chainable text sanitization.

    Every mutator returns ``self`` so operations can be chained, e.g.::

        TextSanitizer(text).remove_urls().normalize_whitespace().get()
    """

    def __init__(self, text):
        self.text = text       # working copy, mutated by the methods below
        self.original = text   # untouched input, kept for reference

    def remove_special_characters(self, keep_chars=""):
        """Remove everything except letters, digits, whitespace and *keep_chars*."""
        pattern = r'[^a-zA-Z0-9\s' + re.escape(keep_chars) + r']'
        self.text = re.sub(pattern, '', self.text)
        return self

    def normalize_whitespace(self):
        """Collapse runs of spaces, and runs of newlines, to a single one each."""
        # Replace multiple spaces with a single space (tabs are untouched).
        self.text = re.sub(r' +', ' ', self.text)
        # Replace multiple newlines with a single newline.
        self.text = re.sub(r'\n\n+', '\n', self.text)
        return self

    def remove_diacritics(self):
        """Remove accents and diacritical marks (e.g. 'café' -> 'cafe')."""
        # NFD splits base characters from combining marks (category 'Mn').
        nfd = unicodedata.normalize('NFD', self.text)
        self.text = ''.join(c for c in nfd if unicodedata.category(c) != 'Mn')
        return self

    def normalize_unicode(self):
        """Normalize Unicode to composed (NFC) form."""
        self.text = unicodedata.normalize('NFC', self.text)
        return self

    def remove_urls(self):
        """Remove http(s):// and www. URLs."""
        self.text = re.sub(r'https?://\S+|www\.\S+', '', self.text)
        return self

    def remove_emails(self):
        """Remove email addresses.

        Fix: the TLD character class was ``[A-Z|a-z]``, which wrongly
        accepted a literal ``|`` inside the top-level domain; it is now
        ``[A-Za-z]``.
        """
        self.text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', self.text)
        return self

    def remove_html(self):
        """Strip HTML tags (naive: removes any <...> span)."""
        self.text = re.sub(r'<[^>]+>', '', self.text)
        return self

    def get(self):
        """Return the sanitized text with surrounding whitespace stripped."""
        return self.text.strip()

    def __str__(self):
        return self.get()
# Usage: strip URLs and emails from contact text, then tidy the whitespace.
text = "Check our website: https://example.com or email us at info@example.com\nContact: +1-555-1234"
clean = TextSanitizer(text).remove_urls().remove_emails().normalize_whitespace().get()
print(clean)

Parse configuration files:
import re
from collections import defaultdict
class ConfigParser:
    """Parse simple INI-like configuration text into nested dicts."""

    # Prefixes that mark a whole line as a comment.
    _COMMENT_PREFIXES = ('#', ';')

    def __init__(self):
        # section name -> {key: value}; keys before any [section] header
        # land in the "default" section.
        self.config = defaultdict(dict)

    def parse(self, content):
        """Parse INI-like configuration from *content* into self.config."""
        section = "default"
        for raw in content.split('\n'):
            stripped = raw.strip()
            # Skip blanks and comments.
            if not stripped or stripped.startswith(self._COMMENT_PREFIXES):
                continue
            # A [section] header switches the target section.
            if stripped[0] == '[' and stripped[-1] == ']':
                section = stripped[1:-1]
            elif '=' in stripped:
                key, _, value = stripped.partition('=')
                # Trim whitespace and surrounding single/double quotes.
                self.config[section][key.strip()] = value.strip().strip('"\'')

    def get(self, section, key, default=None):
        """Return one value, or *default* when section/key is absent."""
        section_map = self.config.get(section, {})
        return section_map.get(key, default)

    def get_section(self, section):
        """Return a plain-dict copy of an entire section ({} if absent)."""
        return dict(self.config.get(section, {}))
# Usage: parse a two-section config and read values back out.
config_text = """[database]
host = localhost
port = 5432
user = admin
[api]
debug = true
timeout = 30
"""
parser = ConfigParser()
parser.parse(config_text)
db_host = parser.get("database", "host")
print(db_host)  # localhost
print(parser.get_section("api")) # {'debug': 'true', 'timeout': '30'}

Parse markdown to extract structure:
import re
class MarkdownParser:
    """Extract structure (headings, links, code blocks) from Markdown text."""

    def __init__(self, content):
        self.content = content
        self.headings = []     # list of (hash-marks, heading-text) tuples
        self.links = []        # list of (link-text, url) tuples
        self.code_blocks = []  # raw contents of ``` fenced blocks
        self.parse()

    def parse(self):
        """Run all three extraction passes over the stored content."""
        # ATX headings: one-or-more '#' then the title, per line.
        self.headings = re.findall(r'^(#+)\s+(.+)$', self.content, re.MULTILINE)
        # Inline links: [text](url).
        self.links = re.findall(r'\[([^\]]+)\]\(([^\)]+)\)', self.content)
        # Fenced code blocks; DOTALL lets the body span multiple lines.
        self.code_blocks = re.findall(r'```(.+?)```', self.content, re.DOTALL)

    def get_toc(self):
        """Generate a bullet table of contents, indented by heading depth."""
        return "\n".join(
            f"{'  ' * (len(marks) - 1)}\u2022 {title}"
            for marks, title in self.headings
        )

    def get_links(self):
        """Return all links as {'text': ..., 'url': ...} dicts."""
        return [{"text": text, "url": url} for text, url in self.links]
# Usage: extract the outline and links from a small Markdown sample.
md_content = """# Main Title
## Subsection 1
Check [Google](https://google.com) and [GitHub](https://github.com).
## Subsection 2print("hello")
"""
# NOTE(review): the '## Subsection 2print("hello")' line looks like a code
# sample fused into a heading during copy/paste — confirm the intended text.
parser = MarkdownParser(md_content)
toc = parser.get_toc()
print("Table of Contents:")
print(toc)
print("\nLinks:")
print(parser.get_links())

Compare and highlight differences:
import difflib
import re
class TextDiff:
    """Generate diffs and a similarity score for two pieces of text."""

    def __init__(self, text1, text2):
        self.text1 = text1
        self.text2 = text2

    def get_line_diff(self):
        """Generate a line-by-line unified diff as a single string."""
        lines1 = self.text1.splitlines()
        lines2 = self.text2.splitlines()
        return '\n'.join(difflib.unified_diff(lines1, lines2, lineterm=''))

    def get_word_diff(self):
        """Generate a word-by-word diff ('- removed' / '+ added' lines).

        Fix: despite its name and docstring, the original matched raw
        characters, emitting fragments like '- rown' / '+ lue'.  It now
        compares whitespace-separated words, as documented.
        """
        words1 = self.text1.split()
        words2 = self.text2.split()
        matcher = difflib.SequenceMatcher(None, words1, words2)
        result = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            # A 'replace' emits the removed words first, then the added ones,
            # matching the original's output ordering.
            if tag in ('replace', 'delete'):
                result.append(f"- {' '.join(words1[i1:i2])}")
            if tag in ('replace', 'insert'):
                result.append(f"+ {' '.join(words2[j1:j2])}")
        return '\n'.join(result)

    def similarity(self):
        """Return the character-level similarity ratio in [0, 1] (unchanged)."""
        matcher = difflib.SequenceMatcher(None, self.text1, self.text2)
        return matcher.ratio()
# Usage: compare two nearly identical sentences.
diff = TextDiff("The quick brown fox", "The quick blue fox")
print(diff.get_word_diff())
print(f"Similarity: {diff.similarity():.1%}")

| Concept | Remember |
|---|---|
| Log parsing | Use structured regex with named groups |
| HTML parsing | Use HTMLParser or BeautifulSoup, not regex |
| Sanitization | Chain operations for robust cleaning |
| Configuration | INI-like format is simple and effective |
| Markdown | Simple regex sufficient for basic parsing |
You've mastered advanced string manipulation! Ready for other Python topics?
Ready to practice? Challenges | Quiz
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward