Ojasa Mirai

Python

Learning Level

String Basics & Creation String Indexing & Slicing Common String Methods String Formatting Essentials Finding & Replacing Text String Testing & Validation Regular Expression Basics Text Splitting & Joining Practical String Projects

Python/String Manipulation/Regex Basics

🎯 Regular Expression Basics — Advanced Techniques

Master complex patterns, lookahead/lookbehind assertions, and regex optimization.

🎯 Lookahead & Lookbehind Assertions

import re

# Lookaround assertions check what sits next to a match without consuming it.
# Each example stores its result under its own name so the values can be
# inspected after the script has run.

# Positive lookahead (?=...)
# Match a file stem only if it is followed by ".txt"
text = "test.txt file.doc"
txt_stems = re.findall(r'\w+(?=\.txt)', text)
print(txt_stems)  # ['test']

# Negative lookahead (?!...)
# Match a number NOT followed by "%".
# The \b anchors are required: with a bare \d+(?!%) the engine backtracks
# from "50" to "5" (which is followed by "0", not "%") and reports a
# spurious '5'.
text = "Price: 100 items, 50% discount"
not_percent = re.findall(r'\b\d+\b(?!%)', text)
print(not_percent)  # ['100']

# Positive lookahead again, as the complementary query:
# numbers that ARE followed by "%"
percent_values = re.findall(r'\d+(?=%)', text)
print(percent_values)  # ['50']

# Positive lookbehind (?<=...)
# Match price only if preceded by "$"
text = "$100 costs 100 euros"
dollar_amounts = re.findall(r'(?<=\$)\d+', text)
print(dollar_amounts)  # ['100'] (the one after "$")

# Negative lookbehind (?<!...)
# Match a number NOT preceded by "$".
# The lookbehind also rejects a preceding digit; otherwise the scan would
# restart inside "$100" and return the tail '00'.
non_dollar_amounts = re.findall(r'(?<![$\d])\d+', text)
print(non_dollar_amounts)  # ['100'] (the one before "euros")

# Complex example: Extract assignments NOT in comments.
# Python's re only accepts fixed-width lookbehind, so (?<!#.*) raises
# re.error. Drop the comment first, then search what is left.
# NOTE: this naive stripping also cuts at a "#" inside a string literal.
code = "x = 5  # var = 10"
code_without_comment = re.sub(r'#.*', '', code)
matches = re.findall(r'\b\w+\s*=', code_without_comment)
print(matches)  # ['x =']

💡 Named Groups & Backreferences

import re

# Named groups: label each capture so it is fetched by name, not position
pattern = r'(?P<name>\w+):(?P<value>\d+)'
text = "age:30 score:95"

for match in re.compile(pattern).finditer(text):
    print(f"{match.group('name')} = {match.group('value')}")

# Backreferences: \1 must repeat exactly what group 1 captured,
# so this finds a word that is immediately followed by itself
text = "hello hello world world test"
pattern = r'\b(\w+)\s+\1\b'
matches = [m.group(1) for m in re.finditer(pattern, text)]
print(matches)  # ['hello', 'world']

# The closing tag has to carry the same name the opening tag captured
html = "<div>content</div>"
pattern = r'<(\w+)>.*?</\1>'
if re.search(pattern, html) is not None:
    print("Valid HTML tag")

# Captured groups can be reordered in the replacement string
text = "John Smith"
result = re.compile(r'(\w+)\s+(\w+)').sub(r'\2, \1', text)
print(result)  # Smith, John

# Named backreference: (?P=quote) must match the same quote character
# that opened the quoted section
pattern = r'(?P<quote>["\']).*?(?P=quote)'
texts = ['He said "hello"', "She said 'goodbye'", 'Mismatched "quote\'']
for text in texts:
    if re.search(pattern, text) is None:
        print(f"Invalid: {text}")
    else:
        print(f"Valid: {text}")

🎨 Non-Capturing Groups

import re

# With capturing groups, findall() returns one tuple per match holding
# every group (an empty string where an optional group did not take part)
text = "color colorless"
matches = re.compile(r'(color)(less)?').findall(text)
print(matches)  # [('color', ''), ('color', 'less')]

# With (?:...) nothing is captured, so each result is the whole match
matches = [m.group() for m in re.finditer(r'(?:color)(?:less)?', text)]
print(matches)  # ['color', 'colorless']

# Grouping is still needed to scope the alternation; the protocol itself
# is not kept as a separate capture
pattern = r'(?:https?|ftp)://\S+'
text = "Visit https://example.com or ftp://files.example.com"
matches = [m.group() for m in re.finditer(pattern, text)]
print(matches)

# One pattern, two alternatives: an e-mail address or a US-style phone number
email_or_phone = r'(?:\w+@\w+\.\w+|\d{3}-\d{3}-\d{4})'
contact_line = "Email: john@example.com Phone: 555-123-4567"
contacts = re.compile(email_or_phone).findall(contact_line)

📊 Advanced Quantifiers

import re

# Greedy vs lazy quantifiers on the same input
text = "<div>content1</div><div>content2</div>"

# Greedy: .* runs to the LAST </div>, so both elements land in one match
greedy = re.compile(r'<div>.*</div>').findall(text)
print(greedy)  # ['<div>content1</div><div>content2</div>']

# Lazy: .*? stops at the FIRST </div>, giving one match per element
non_greedy = [m.group() for m in re.finditer(r'<div>.*?</div>', text)]
print(non_greedy)  # ['<div>content1</div>', '<div>content2</div>']

# Possessive quantifiers (*+, ++, ?+) and atomic groups (?>...) give up
# backtracking entirely. The standard re module supports them from
# Python 3.11; on older versions use the third-party regex module.

# Optional trailing unit: the number is captured, the unit is matched
# only if it is present
amount_pattern = r'(\d+)\s*(?:dollars?|USD)?'
texts = ["100", "100 dollars", "50 USD"]
for text in texts:
    match = re.match(amount_pattern, text)
    if match is None:
        continue
    print(f"Amount: {match.group(1)}")

💡 Regex Performance Optimization

import re
import timeit

text = "the quick brown fox jumps over the lazy dog " * 1000

# Number of repetitions per timing. Kept small so the whole script
# finishes quickly; raise it for steadier numbers.
BENCHMARK_RUNS = 20


# Method 1: Uncompiled regex
def method_uncompiled():
    """Count whole-word "the" in `text` via the module-level re.findall()."""
    return len(re.findall(r'\bthe\b', text))


# Method 2: Compiled regex
compiled_pattern = re.compile(r'\bthe\b')


def method_compiled():
    """Count whole-word "the" in `text` via the precompiled pattern."""
    return len(compiled_pattern.findall(text))


# Method 3: Case-insensitive variant.
# The name is kept for compatibility, but adding re.IGNORECASE is a change
# in what is matched ("The", "THE", ...), not an optimization.
def method_optimized():
    """Count whole-word "the" in `text`, ignoring case."""
    return len(re.findall(r'\bthe\b', text, re.IGNORECASE))


print("Uncompiled:", timeit.timeit(method_uncompiled, number=BENCHMARK_RUNS))
print("Compiled:", timeit.timeit(method_compiled, number=BENCHMARK_RUNS))

# Avoiding catastrophic backtracking
# BAD: (a+)+b - the nested quantifiers let the engine split a run of "a"s
#      in exponentially many ways before it can conclude no "b" follows
# GOOD: a+b - a single quantifier that matches exactly the same strings
bad_pattern = r'(a+)+b'
good_pattern = r'a+b'

# re.search() accepts no timeout argument, so a runaway match cannot be
# interrupted from here. Time the bad pattern on short inputs instead and
# watch the cost grow with every few extra characters.
# A separate name is used for these inputs so the benchmark functions above
# keep reading the original `text`.
for run_length in (10, 14, 18):
    attack_text = "a" * run_length + "c"
    bad_seconds = timeit.timeit(
        lambda: re.search(bad_pattern, attack_text), number=1
    )
    print(f"Bad pattern on {run_length} a's: {bad_seconds:.4f}s")

# The good pattern copes with a much longer input
attack_text = "a" * 30 + "c"
result = re.search(good_pattern, attack_text)  # Instant; None as no "b"
print("Good pattern result:", result)

🎯 Word Boundaries & Anchors

import re

text = "The theater theater"

# Word boundary \b: the match must start at the edge of a word
stem_matches = re.findall(r'\btheat\w+', text)
print(stem_matches)  # ['theater', 'theater']

# \w* rather than \w+, so the bare word "The" (no trailing letters)
# is matched as well
the_words = re.findall(r'\bthe\w*', text, re.IGNORECASE)
print(the_words)  # ['The', 'theater', 'theater']

# Negated word boundary \B: the match must start INSIDE a word.
# Every "the" in `text` begins a word, so a different sample is needed
# to see \B match anything.
inner_text = "other bathes theater"
inner_matches = re.findall(r'\Bthe\w*', inner_text, re.IGNORECASE)
print(inner_matches)  # ['ther', 'thes'] ("theater" starts a word: no match)

# String anchors: ^ and $ pin the pattern to both ends of the line
pattern = r'^start.*end$'
lines = [
    "start middle end",
    "begin start middle end finish",
    "start of line"
]

matched_lines = [line for line in lines if re.match(pattern, line)]
for line in matched_lines:
    print(f"Match: {line}")

🔑 Key Takeaways

Concept	Remember
Lookahead/behind	(?=...), (?!...), (?<=...), (?<!...)
Named groups	Use (?P<name>...) for clarity
Non-capturing	Use (?:...) to avoid overhead
Greedy vs lazy	Use `*?`, `+?`, `??` for non-greedy matching
Compile patterns	Use re.compile() for reused patterns

🔗 What's Next?

Learn advanced text splitting and joining.

Ready to practice? Challenges | Quiz

Resources

Python Docs

Ojasa Mirai

Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.

Learn Deep • Build Real • Verify Skills • Launch Forward

Courses

Python Fastapi ReactJS Cloud

Resources

Blog & Articles GitHub Projects Video Tutorials

Ecosystem

Ojasa Mirai Site My Growth Learning Portal Community Discord

Twitter GitHub LinkedIn