
Python
Master regex-based splitting, custom delimiters, and handling edge cases in text parsing.
import re
# Split by regex pattern
text = "apple123banana456cherry"
parts = re.split(r'\d+', text)
print(parts)  # ['apple', 'banana', 'cherry']
# (a trailing '' would only appear if the text itself ended with digits)

# Split on any of several delimiters (character class, nothing is captured)
text = "apple, banana; orange: grape"
parts = re.split(r'[,;:]', text)
print(parts)  # ['apple', ' banana', ' orange', ' grape']

# Split keeping delimiters: a capturing group adds the matched delimiters
# to the result list
parts = re.split(r'([,;:])', text)
print(parts)  # ['apple', ',', ' banana', ';', ' orange', ':', ' grape']

# Split by whitespace variations
text = "word1 word2\t word3\nword4"
parts = re.split(r'\s+', text)
print(parts)  # ['word1', 'word2', 'word3', 'word4']

# Split with maxsplit (the remainder is returned unsplit as the last item)
text = "a:b:c:d:e"
parts = re.split(r':', text, maxsplit=2)
print(parts)  # ['a', 'b', 'c:d:e']

# Lookaround split: the lookbehind/lookahead only restrict WHERE the split
# happens; the whitespace between a digit and "item" is still consumed
text = "1 item, 2 items, 3 items"
parts = re.split(r'(?<=\d)\s+(?=item)', text)
print(parts)  # ['1', 'item, 2', 'items, 3', 'items']
import re
from csv import reader
from io import StringIO
# CSV parsing with quoted fields
csv_line = 'John,"123 Main St, Apt 4",NYC,30'
# Simple approach (fails with commas in quotes)
# str.split knows nothing about quoting, so the address is cut in two
simple = csv_line.split(',')
print(simple) # ['John', '"123 Main St', ' Apt 4"', 'NYC', '30']
# Using CSV reader
# StringIO wraps the string as a file-like object for csv.reader;
# next() pulls the first (and only) parsed row
csv_reader = reader(StringIO(csv_line))
proper = next(csv_reader)
print(proper) # ['John', '123 Main St, Apt 4', 'NYC', '30']
# Regex approach for quoted values
# First alternative: a double-quoted run (a doubled "" is allowed inside);
# second alternative: a run of non-comma characters.
# Limitations: the surrounding quotes stay in the match, and empty fields
# (two adjacent commas) produce no match at all.
pattern = r'"(?:[^"]|"")*"|[^,]+'
values = re.findall(pattern, csv_line)
print(values) # ['John', '"123 Main St, Apt 4"', 'NYC', '30']
# Custom field separator class
class SmartSplitter:
    """Split text on a separator character, ignoring separators in quotes."""

    def __init__(self, text, sep=",", quote='"'):
        self.text = text
        self.sep = sep
        self.quote = quote

    def split(self):
        """Return the fields of ``self.text`` with outer whitespace removed.

        Quote characters toggle the "inside quotes" state and are kept in
        the returned fields; a separator only ends a field outside quotes.
        """
        pieces = []
        buffer = []
        quoted = False
        for ch in self.text:
            # The quote check takes precedence over the separator check.
            if ch != self.quote and ch == self.sep and not quoted:
                pieces.append("".join(buffer).strip())
                buffer = []
                continue
            if ch == self.quote:
                quoted = not quoted
            buffer.append(ch)
        # The last field has no trailing separator, so flush it explicitly.
        pieces.append("".join(buffer).strip())
        return pieces
# Unlike csv.reader, SmartSplitter keeps the quote characters in the field
splitter = SmartSplitter(csv_line)
print(splitter.split())  # ['John', '"123 Main St, Apt 4"', 'NYC', '30']
import re
# Basic join: the separator is placed between items (never after the last);
# no filtering condition is applied here
words = ["apple", "banana", "orange", "grape"]
result = ", ".join(words)
print(result) # apple, banana, orange, grape
# Oxford comma
def oxford_join(items, sep=", ", last_sep=" and "):
    """Join items into a readable list that uses an Oxford (serial) comma.

    Args:
        items: Iterable of values; each one is converted with ``str()``.
        sep: Separator placed between all items before the last one.
        last_sep: Conjunction placed in front of the final item.

    Returns:
        ``""`` for no items, the single item as a string for one item,
        ``"a and b"`` for two items, and ``"a, b, and c"`` for three or more.
    """
    # Materialise once so generators and other one-shot iterables work too.
    parts = [str(item) for item in items]
    if len(parts) <= 1:
        return "".join(parts)
    if len(parts) == 2:
        return f"{parts[0]}{last_sep}{parts[1]}"
    # Strip only trailing whitespace from the separator: last_sep brings its
    # own leading space. Slicing off the final character instead would drop
    # the serial comma entirely for a separator such as ",".
    return f"{sep.join(parts[:-1])}{sep.rstrip()}{last_sep}{parts[-1]}"


print(oxford_join(["red", "green", "blue"]))
# red, green, and blue
# Join with transformation
# str.join only accepts strings, so each number is formatted by the
# generator expression before it is joined
numbers = [1, 2, 3, 4, 5]
result = ", ".join(f"#{n}" for n in numbers)
print(result) # #1, #2, #3, #4, #5
# Join by groups
def join_by_groups(items, group_size=2, sep="; "):
    """Join items into space-separated groups, then join the groups.

    Args:
        items: Iterable of values; each one is converted with ``str()``.
        group_size: Number of items per group; the last group may be shorter.
        sep: Separator placed between groups.

    Returns:
        The joined string, or ``""`` when there are no items.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently discard every
    # item, so reject non-positive sizes explicitly.
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")
    values = [str(item) for item in items]
    return sep.join(
        " ".join(values[start:start + group_size])
        for start in range(0, len(values), group_size)
    )


print(join_by_groups(["a", "b", "c", "d", "e", "f"]))
# a b; c d; e f
import re
from urllib.parse import urlparse, parse_qs
# File path splitting
path = "/home/user/documents/file.txt"
parts = path.split("/")
print(parts)  # ['', 'home', 'user', 'documents', 'file.txt']
# (the leading '' comes from the text before the first "/")

# Using pathlib (better)
from pathlib import Path
path_obj = Path(path)
# Output shown for POSIX systems; the root component differs on Windows
print(path_obj.parts)  # ('/', 'home', 'user', 'documents', 'file.txt')

# URL parsing
url = "https://user:pass@example.com:8080/path?key=value#section"
parsed = urlparse(url)
print(parsed.scheme)  # https
print(parsed.netloc)  # user:pass@example.com:8080
print(parsed.path)  # /path
print(parsed.query)  # key=value
print(parsed.fragment)  # section

# Query string parsing (every value is a list because keys may repeat)
query = "name=John&age=30&city=NYC"
params = parse_qs(query)
print(params)  # {'name': ['John'], 'age': ['30'], 'city': ['NYC']}
import re
# Empty strings and whitespace
def smart_split(text, delimiter=","):
    """Return the non-blank pieces of ``text``, each trimmed of whitespace."""
    trimmed = (piece.strip() for piece in text.split(delimiter))
    # Pieces that were empty or whitespace-only are now "" and get dropped.
    return [piece for piece in trimmed if piece]


print(smart_split("apple, , banana,,orange,"))
# ['apple', 'banana', 'orange']
# Nested structures
def split_nested(text, open_char="(", close_char=")", delimiter=","):
    """Split ``text`` on top-level delimiters, leaving nested content intact.

    Args:
        text: String to split.
        open_char: Character that opens a nested section.
        close_char: Character that closes a nested section.
        delimiter: Character to split on when outside every nested section.

    Returns:
        List of pieces. Whitespace is not stripped and the bracket characters
        are kept; an empty ``text`` yields ``['']``.
    """
    parts = []
    current = []
    depth = 0
    for char in text:
        if char == open_char:
            depth += 1
        elif char == close_char:
            # A stray closer must not push depth below zero, otherwise every
            # later top-level delimiter would be ignored.
            depth = max(depth - 1, 0)
        elif char == delimiter and depth == 0:
            parts.append("".join(current))
            current = []
            continue  # the delimiter itself is not part of any piece
        current.append(char)
    parts.append("".join(current))
    return parts


text = "func(a,b), func(x,y), value"
print(split_nested(text))
# ['func(a,b)', ' func(x,y)', ' value']
# Multi-line splitting with preservation
text = """line1
line2
line3"""
lines = text.splitlines()
print(lines)  # ['line1', 'line2', 'line3']

# split('\n') gives the same result for this text. The two differ when the
# text ends with a newline (split keeps a trailing '', splitlines does not)
# and for other line endings such as '\r\n', which only splitlines handles.
lines = text.split('\n')
print(lines)  # ['line1', 'line2', 'line3']
import timeit
import re
# Sample input: 5000 comma-terminated single-letter fields
text = "a,b,c,d,e," * 1000


# Method 1: str.split
def method_split():
    """Split the sample text with the built-in ``str.split``."""
    return text.split(",")


# Method 2: regex split
def method_regex():
    """Split the sample text with ``re.split`` on the same delimiter."""
    return re.split(r",", text)


# Method 3: list comprehension
def method_listcomp():
    """Split with ``str.split`` and drop the empty pieces."""
    return [s for s in text.split(",") if s]


# Time 1000 calls of each variant and print them in the same order.
for label, func in (
    ("str.split:", method_split),
    ("regex:", method_regex),
    ("listcomp:", method_listcomp),
):
    print(label, timeit.timeit(func, number=1000))
# str.split is fastest for simple delimiters
| Concept | Remember |
|---|---|
| Regex split | Use for complex patterns and capture |
| CSV reader | Use for quoted fields and escaping |
| Custom splitter | Build for domain-specific parsing |
| Join techniques | Chain conditions for complex formatting |
| Edge cases | Handle empty strings, whitespace, nesting |
Learn advanced practical string projects.
Ready to practice? Challenges | Quiz
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward