Ojasa Mirai

Python

Learning Level

String Basics & Creation String Indexing & Slicing Common String Methods String Formatting Essentials Finding & Replacing Text String Testing & Validation Regular Expression Basics Text Splitting & Joining Practical String Projects

Python/String Manipulation/Text Splitting Joining

✂️ Text Splitting & Joining — Advanced Parsing

Master regex-based splitting, custom delimiters, and handling edge cases in text parsing.

🎯 Regular Expression Splitting

import re

# Split by regex pattern
text = "apple123banana456cherry"
parts = re.split(r'\d+', text)
print(parts)  # ['apple', 'banana', 'cherry']  (no trailing '' because the text does not end with digits)

# Split on any one of several delimiter characters (a character class; nothing is captured)
text = "apple, banana; orange: grape"
parts = re.split(r'[,;:]', text)
print(parts)  # ['apple', ' banana', ' orange', ' grape']

# Split keeping delimiters: the capturing group puts each matched delimiter into the result
parts = re.split(r'([,;:])', text)
print(parts)  # ['apple', ',', ' banana', ';', ' orange', ':', ' grape']

# Split by whitespace variations
text = "word1  word2\t word3\nword4"
parts = re.split(r'\s+', text)
print(parts)  # ['word1', 'word2', 'word3', 'word4']

# Split with maxsplit
text = "a:b:c:d:e"
parts = re.split(r':', text, maxsplit=2)
print(parts)  # ['a', 'b', 'c:d:e']

# Lookaround split: split on whitespace only where a digit precedes it and "item" follows.
# The lookbehind and lookahead consume nothing, so only the whitespace itself is removed.
text = "1 item, 2 items, 3 items"
parts = re.split(r'(?<=\d)\s+(?=item)', text)
print(parts)  # ['1', 'item, 2', 'items, 3', 'items']

💡 Custom Delimiter Handling

import re
from csv import reader
from io import StringIO

# CSV parsing with quoted fields
csv_line = 'John,"123 Main St, Apt 4",NYC,30'

# Simple approach (fails with commas in quotes)
simple = csv_line.split(',')
print(simple)  # ['John', '"123 Main St', ' Apt 4"', 'NYC', '30']

# Using CSV reader
csv_reader = reader(StringIO(csv_line))
proper = next(csv_reader)
print(proper)  # ['John', '123 Main St, Apt 4', 'NYC', '30']

# Regex approach for quoted values: match either a whole quoted field
# (with "" allowed inside as an escaped quote) or a run of non-comma characters.
# The surrounding quotes stay in each match, and empty fields produce no match.
pattern = r'"(?:[^"]|"")*"|[^,]+'
values = re.findall(pattern, csv_line)
print(values)  # ['John', '"123 Main St, Apt 4"', 'NYC', '30']

# Custom field separator class
class SmartSplitter:
    """Quote-aware splitter for one line of delimited text.

    The separator only counts as a field boundary outside quoted sections.
    Quote characters themselves are kept in the returned fields.
    """

    def __init__(self, text, sep=",", quote='"'):
        self.text, self.sep, self.quote = text, sep, quote

    def split(self):
        """Return the whitespace-stripped fields of ``self.text``."""
        results = []
        buffer = []
        quoted = False

        for ch in self.text:
            # The quote test comes first, so a character equal to both the
            # quote and the separator is always treated as a quote.
            if ch == self.quote:
                quoted = not quoted
            elif ch == self.sep and not quoted:
                # Field boundary: flush the buffer and drop the separator.
                results.append("".join(buffer).strip())
                buffer = []
                continue
            buffer.append(ch)

        # Whatever is left after the last separator is the final field.
        results.append("".join(buffer).strip())
        return results

splitter = SmartSplitter(csv_line)
print(splitter.split())  # ['John', '"123 Main St, Apt 4"', 'NYC', '30']  (quotes are kept)

🎨 Advanced Joining Strategies

import re

# Basic join with a separator
words = ["apple", "banana", "orange", "grape"]
result = ", ".join(words)
print(result)  # apple, banana, orange, grape

# Oxford comma
def oxford_join(items, sep=", ", last_sep=" and "):
    """Join items into a list phrase with an Oxford (serial) comma.

    Args:
        items: Iterable of values; each one is converted with ``str()``.
        sep: Separator placed between all items before the last one.
        last_sep: Conjunction placed in front of the last item.

    Returns:
        ``""`` for no items, the item itself for one item, ``"a and b"``
        for two, and ``"a, b, and c"`` for three or more (with defaults).
    """
    words = [str(item) for item in items]
    if not words:
        return ""
    if len(words) == 1:
        return words[0]
    if len(words) == 2:
        return f"{words[0]}{last_sep}{words[1]}"
    # The serial comma is ``sep`` without its trailing whitespace; the
    # default ``last_sep`` supplies the space that follows it. Stripping
    # whitespace (rather than slicing off one character) keeps the comma
    # when ``sep`` has no trailing space, e.g. ``sep=","``.
    serial = sep.rstrip()
    return f"{sep.join(words[:-1])}{serial}{last_sep}{words[-1]}"

print(oxford_join(["red", "green", "blue"]))
# red, green, and blue

# Join with transformation: a generator expression formats each number before joining
numbers = [1, 2, 3, 4, 5]
result = ", ".join(f"#{n}" for n in numbers)
print(result)  # #1, #2, #3, #4, #5

# Join by groups
def join_by_groups(items, group_size=2, sep="; "):
    """Space-join each run of ``group_size`` items, then join the runs with ``sep``.

    The last run is shorter when ``len(items)`` is not a multiple of
    ``group_size``.
    """
    starts = range(0, len(items), group_size)
    return sep.join(
        " ".join(items[start:start + group_size]) for start in starts
    )

print(join_by_groups(["a", "b", "c", "d", "e", "f"]))
# a b; c d; e f  (pairs joined with a space, pairs separated by "; ")

📊 Path and URL Splitting

import re
from urllib.parse import urlparse, parse_qs

# File path splitting
path = "/home/user/documents/file.txt"
parts = path.split("/")
print(parts)  # ['', 'home', 'user', 'documents', 'file.txt']  (leading '' comes from the leading "/")

# Using pathlib (better)
from pathlib import Path
path_obj = Path(path)
# Output shown is for POSIX; on Windows the root component is rendered as '\\'.
print(path_obj.parts)  # ('/', 'home', 'user', 'documents', 'file.txt')

# URL parsing
url = "https://user:pass@example.com:8080/path?key=value#section"
parsed = urlparse(url)
print(parsed.scheme)    # https
print(parsed.netloc)    # user:pass@example.com:8080
print(parsed.path)      # /path
print(parsed.query)     # key=value
print(parsed.fragment)  # section

# Query string parsing: every value is a list, since a key may occur more than once
query = "name=John&age=30&city=NYC"
params = parse_qs(query)
print(params)  # {'name': ['John'], 'age': ['30'], 'city': ['NYC']}

💡 Edge Case Handling

import re

# Empty strings and whitespace
def smart_split(text, delimiter=","):
    """Split ``text`` on ``delimiter`` and return the non-empty, stripped pieces.

    Pieces that are empty or contain only whitespace are discarded.
    """
    trimmed = (piece.strip() for piece in text.split(delimiter))
    return [piece for piece in trimmed if piece]

print(smart_split("apple, , banana,,orange,"))
# ['apple', 'banana', 'orange']  (empty and whitespace-only fields are dropped)

# Nested structures
def split_nested(text, open_char="(", close_char=")", sep=","):
    """Split ``text`` on ``sep`` at nesting depth zero only.

    Separators that appear between ``open_char`` and its matching
    ``close_char`` are kept as part of the current piece.

    Args:
        text: String to split.
        open_char: Single character that opens a nested section.
        close_char: Single character that closes a nested section.
        sep: Single-character separator to split on (default ``","``).

    Returns:
        List of pieces, not stripped. Always contains at least one element
        (``[""]`` for empty input).
    """
    parts = []
    current = []
    depth = 0

    for char in text:
        if char == open_char:
            depth += 1
        elif char == close_char:
            # Clamp at zero: a stray closing character must not push the
            # depth negative, which would suppress every later split.
            depth = max(depth - 1, 0)
        elif char == sep and depth == 0:
            # Top-level separator: finish the piece and drop the separator.
            parts.append("".join(current))
            current = []
            continue

        current.append(char)

    parts.append("".join(current))
    return parts

text = "func(a,b), func(x,y), value"
print(split_nested(text))
# ['func(a,b)', ' func(x,y)', ' value']  (commas inside parentheses are not split points; pieces are not stripped)

# Multi-line splitting
text = """line1
line2
line3"""

lines = text.splitlines()
print(lines)  # ['line1', 'line2', 'line3']

# split('\n') gives the same list here. The two differ when the text ends with
# a newline: split('\n') then yields a trailing '' while splitlines() does not.
lines = text.split('\n')
print(lines)  # ['line1', 'line2', 'line3']

🔑 Performance Considerations

import timeit
import re

# The input ends with a comma, so splitting on "," yields a trailing empty string.
text = "a,b,c,d,e," * 1000

# Method 1: str.split
def method_split():
    return text.split(",")

# Method 2: regex split
def method_regex():
    return re.split(r",", text)

# Method 3: str.split plus a list comprehension that drops empty strings
# (it does extra filtering work, so its result differs from methods 1 and 2)
def method_listcomp():
    return [s for s in text.split(",") if s]

# Each figure printed is the total time in seconds for 1000 calls.
print("str.split:", timeit.timeit(method_split, number=1000))
print("regex:", timeit.timeit(method_regex, number=1000))
print("listcomp:", timeit.timeit(method_listcomp, number=1000))
# Expect str.split to be fastest for simple delimiters; run this to confirm on your machine

🔑 Key Takeaways

Concept	Remember
Regex split	Use for complex delimiter patterns; add a capturing group to keep the delimiters
CSV reader	Use for quoted fields and escaping
Custom splitter	Build for domain-specific parsing
Join techniques	Combine str.join with generator expressions and small helper functions for complex formatting
Edge cases	Handle empty strings, whitespace, nesting

🔗 What's Next?

Next, apply these techniques in the practical string projects.

Ready to practice? Challenges | Quiz

Resources

Python Docs

Ojasa Mirai

Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.

Learn Deep • Build Real • Verify Skills • Launch Forward

Courses

Python Fastapi ReactJS Cloud

Resources

Blog & Articles GitHub Projects Video Tutorials

Ecosystem

Ojasa Mirai Site My Growth Learning Portal Community Discord

Twitter GitHub LinkedIn