
Python
Master complex patterns, lookahead/lookbehind assertions, and regex optimization.
import re
# Positive lookahead (?=...)
# Match "test" only if followed by ".txt"
text = "test.txt file.doc"
matches = re.findall(r'\w+(?=\.txt)', text)
print(matches)  # ['test']

# Negative lookahead (?!...)
# Match number NOT followed by "%"
text = "Price: 100 items, 50% discount"
matches = re.findall(r'\d+(?!%)', text)
# Pitfall: \d+ backtracks from "50" to "5", and "5" is followed by "0",
# not "%", so a fragment of the percentage still matches.
print(matches)  # ['100', '5']

# Better negative lookahead: also forbid a following digit, so \d+ cannot
# give back part of the number just to satisfy the assertion.
matches = re.findall(r'\d+(?![\d%])', text)
print(matches)  # ['100']

# The opposite question (numbers that ARE percentages) is a positive lookahead
matches = re.findall(r'\d+(?=%)', text)
print(matches)  # ['50']

# Positive lookbehind (?<=...)
# Match price only if preceded by "$"
text = "$100 costs 100 euros"
matches = re.findall(r'(?<=\$)\d+', text)
print(matches)  # ['100']

# Negative lookbehind (?<!...)
# Match number NOT preceded by dollar.
# The \d inside the class stops a match from starting in the middle of a
# number; with only (?<!\$) the "00" of "$100" would match as well.
matches = re.findall(r'(?<![$\d])\d+', text)
print(matches)  # ['100']  (the one from "100 euros")

# Complex example: Extract variables NOT in comments
# Python's re only accepts fixed-width lookbehind, so (?<!#.*) raises
# re.error. Cut the comment off first, then search what is left.
code = "x = 5 # var = 10"
code_without_comment = code.partition('#')[0]
matches = re.findall(r'\b\w+\s*=', code_without_comment)
print(matches)  # ['x =']
import re
# Named groups for clarity
pattern = r'(?P<name>\w+):(?P<value>\d+)'
text = "age:30 score:95"
for match in re.finditer(pattern, text):
    print(f"{match.group('name')} = {match.group('value')}")

# Backreferences - match repeated groups
# Find repeated words
text = "hello hello world world test"
pattern = r'\b(\w+)\s+\1\b'  # \1 must repeat whatever group 1 captured
matches = re.findall(pattern, text)
print(matches)  # ['hello', 'world']

# HTML tag matching with backreference
html = "<div>content</div>"
pattern = r'<(\w+)>.*?</\1>'  # closing tag name must equal the opening one
if re.search(pattern, html):
    print("Valid HTML tag")

# Replace with captured groups
text = "John Smith"
result = re.sub(r'(\w+)\s+(\w+)', r'\2, \1', text)
print(result)  # Smith, John

# Named group backreference
# (?P=quote) requires the same quote character that opened the string
pattern = r'(?P<quote>["\']).*?(?P=quote)'
texts = ['He said "hello"', "She said 'goodbye'", 'Mismatched "quote\'']
for text in texts:
    if re.search(pattern, text):
        print(f"Valid: {text}")
    else:
        print(f"Invalid: {text}")
import re
# Capturing group (memory cost)
# With groups in the pattern, findall returns one tuple of groups per match
text = "color colorless"
matches = re.findall(r'(color)(less)?', text)
print(matches)  # [('color', ''), ('color', 'less')]

# Non-capturing group (?:...) - no memory cost
# With no capturing groups, findall returns the whole match instead
matches = re.findall(r'(?:color)(?:less)?', text)
print(matches)  # ['color', 'colorless']

# Useful for grouping without capturing
pattern = r'(?:https?|ftp)://\S+'  # Match URLs without capturing protocol
text = "Visit https://example.com or ftp://files.example.com"
matches = re.findall(pattern, text)
print(matches)  # ['https://example.com', 'ftp://files.example.com']

# Alternation with non-capturing
email_or_phone = r'(?:\w+@\w+\.\w+|\d{3}-\d{3}-\d{4})'
contacts = re.findall(email_or_phone, "Email: john@example.com Phone: 555-123-4567")
print(contacts)  # ['john@example.com', '555-123-4567']
import re
# Greedy vs Non-greedy
text = "<div>content1</div><div>content2</div>"

# Greedy (matches as much as possible)
greedy = re.findall(r'<div>.*</div>', text)
print(greedy)  # ['<div>content1</div><div>content2</div>']

# Non-greedy (matches as little as possible)
non_greedy = re.findall(r'<div>.*?</div>', text)
print(non_greedy)  # ['<div>content1</div>', '<div>content2</div>']

# Possessive quantifiers (a++, a*+) and atomic groups (?>...) - avoid backtracking
# (Supported by Python's re from version 3.11; on older versions use the
# third-party regex module instead)

# Optional suffix: the trailing non-capturing group may be absent, so a bare
# number matches as well as "N dollars" or "N USD"
amount_pattern = r'(\d+)\s*(?:dollars?|USD)?'
texts = ["100", "100 dollars", "50 USD"]
for text in texts:
    match = re.match(amount_pattern, text)
    if match:
        print(f"Amount: {match.group(1)}")
import re
import timeit
text = "the quick brown fox jumps over the lazy dog " * 1000


# Method 1: Uncompiled regex
def method_uncompiled():
    """Count whole-word "the" in the module-level ``text``.

    The pattern is passed as a string on every call, so each call goes
    through the re module's pattern lookup before searching.
    """
    return len(re.findall(r'\bthe\b', text))


# Method 2: Compiled regex
compiled_pattern = re.compile(r'\bthe\b')


def method_compiled():
    """Count whole-word "the" in ``text`` using the precompiled pattern."""
    return len(compiled_pattern.findall(text))


# Method 3: Optimized pattern
def method_optimized():
    """Count whole-word "the" in ``text``, ignoring case.

    NOTE: despite the section label, re.IGNORECASE is a matching option
    rather than an optimisation. On the all-lowercase ``text`` above it
    returns the same count as the other two methods.
    """
    return len(re.findall(r'\bthe\b', text, re.IGNORECASE))


# Each measurement runs 1000 searches over a ~44,000 character string, so
# only benchmark when this file is executed directly, not when imported.
if __name__ == "__main__":
    print("Uncompiled:", timeit.timeit(method_uncompiled, number=1000))
    print("Compiled:", timeit.timeit(method_compiled, number=1000))
# Avoiding catastrophic backtracking
# BAD: (a+)+b on "aaaa...c" - the nested quantifiers let the engine try every
#      way of splitting the run of a's before it can give up, so the work
#      roughly doubles with each extra "a"
# GOOD: a+b finds the same text with a single quantifier and fails fast
bad_pattern = r'(a+)+b'
good_pattern = r'a+b'

# Test with problematic input
text = "a" * 30 + "c"

# re.search() has no timeout argument (passing one raises TypeError), and the
# bad pattern on all 30 a's would run for a very long time. Time it on the
# last 18 a's instead - already slow enough to show the difference.
probe = text[-19:]  # 18 a's followed by "c"
started = timeit.default_timer()
re.search(bad_pattern, probe)
bad_seconds = timeit.default_timer() - started

started = timeit.default_timer()
result = re.search(good_pattern, text)  # Instant, even on the full input
good_seconds = timeit.default_timer() - started

print(f"Bad pattern:  {bad_seconds:.6f}s on {len(probe)} characters")
print(f"Good pattern: {good_seconds:.6f}s on {len(text)} characters")
import re
text = "The theater theater"

# Word boundary \b
matches = re.findall(r'\btheat\w+', text)
print(matches)  # ['theater', 'theater']

# \w* rather than \w+: the bare word "The" has no further word characters,
# so \w+ would skip it
matches = re.findall(r'\bthe\w*', text, re.IGNORECASE)
print(matches)  # ['The', 'theater', 'theater']

# Negated word boundary \B
# In the text above "the" only ever starts a word, so \Bthe finds nothing
# there; use a sentence where it sits inside a word.
matches = re.findall(r'\Bthe\w*', "The other theater", re.IGNORECASE)
print(matches)  # ['ther'] (matches "the" inside "other")
# String anchors
# ^ and $ pin the match to the start and end of the string
pattern = r'^start.*end$'
lines = [
    "start middle end",
    "begin start middle end finish",
    "start of line",
]
for line in lines:
    if re.match(pattern, line):
        print(f"Match: {line}")  # only "start middle end" qualifies

# | Concept | Remember |
# |---|---|
# | Lookahead/behind | (?=...), (?!...), (?<=...), (?<!...) |
# | Named groups | Use (?P<name>...) for clarity |
# | Non-capturing | Use (?:...) to avoid overhead |
# | Greedy vs lazy | Use `*?`, `+?`, `??` for non-greedy matching |
# | Compile patterns | Use re.compile() for reused patterns |
Learn advanced text splitting and joining.
Ready to practice? Challenges | Quiz
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward