
Python
While APIs provide structured data, not all websites offer them. Web scraping lets you extract data directly from HTML pages. It's a powerful technique for gathering information from websites, but always check the site's terms of service first.
Web scraping is the process of automatically extracting data from websites. You fetch HTML content and parse it to find and extract the information you need.
import requests
from bs4 import BeautifulSoup
# Fetch website
response = requests.get('https://example.com')
# Parse HTML
soup = BeautifulSoup(response.content, 'html.parser')
# Extract data
titles = soup.find_all('h1')
for title in titles:
print(title.text)
BeautifulSoup is the most popular Python library for parsing HTML:
# Installation
pip install beautifulsoup4
from bs4 import BeautifulSoup
# HTML content as string
html_content = '''
<html>
<body>
<h1>Welcome to My Site</h1>
<p>This is a paragraph</p>
<div class="content">
<span id="title">Important Title</span>
</div>
</body>
</html>
'''
# Parse HTML
soup = BeautifulSoup(html_content, 'html.parser')
# Find elements
title = soup.find('h1')
print(title.text) # Welcome to My Site
paragraph = soup.find('p')
print(paragraph.text) # This is a paragraph
# Find by id
important = soup.find(id='title')
print(important.text) # Important Title
# Find by class
content = soup.find(class_='content')
print(content.text) # Important Title
from bs4 import BeautifulSoup
import requests
html = '''
<html>
<body>
<article class="post">
<h2>Article 1</h2>
<p>Content 1</p>
</article>
<article class="post">
<h2>Article 2</h2>
<p>Content 2</p>
</article>
</body>
</html>
'''
soup = BeautifulSoup(html, 'html.parser')
# Find first matching element
first_article = soup.find('article')
print(first_article.h2.text) # Article 1
# Find by class
first_post = soup.find(class_='post')
print(first_post.p.text) # Content 1
# Find by attribute
link = soup.find('a', {'href': 'https://example.com'})
from bs4 import BeautifulSoup
html = '''
<html>
<body>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</body>
</html>
'''
soup = BeautifulSoup(html, 'html.parser')
# Find all items
items = soup.find_all('li')
print(len(items)) # 3
# Get text from each
for item in items:
print(item.text)
# Item 1
# Item 2
# Item 3
from bs4 import BeautifulSoup
html = '''
<html>
<body>
<a href="https://example.com" class="link" id="main-link">
Click Here
</a>
</body>
</html>
'''
soup = BeautifulSoup(html, 'html.parser')
link = soup.find('a')
# Get text
print(link.text) # Click Here
# Get attributes
print(link['href']) # https://example.com
print(link['class']) # ['link']
print(link.get('id')) # main-link
# Check if attribute exists
if link.get('target'):
print('Has target')
else:
print('No target')
# All attributes as dict
print(link.attrs) # {'href': 'https://example.com', 'class': ['link'], 'id': 'main-link'}
Scraping product information from a website:
import requests
from bs4 import BeautifulSoup
# Fetch the page
response = requests.get('https://example-shop.com/products')
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
# Find all products
products = soup.find_all('div', class_='product')
for product in products:
# Extract information
name = product.find('h3', class_='product-name')
price = product.find('span', class_='price')
rating = product.find('div', class_='rating')
if name and price:
print(f"Name: {name.text}")
print(f"Price: {price.text}")
if rating:
print(f"Rating: {rating.text}")
print("---")
import requests
from bs4 import BeautifulSoup
import json
response = requests.get('https://example.com/articles')
soup = BeautifulSoup(response.content, 'html.parser')
# Extract articles into list of dicts
articles = []
for article_elem in soup.find_all('article'):
article = {
'title': article_elem.find('h2').text.strip(),
'author': article_elem.find('span', class_='author').text.strip(),
'date': article_elem.find('time')['datetime'],
'url': article_elem.find('a')['href'],
'summary': article_elem.find('p').text.strip()
}
articles.append(article)
# Save as JSON
with open('articles.json', 'w') as f:
json.dump(articles, f, indent=2)
# Print results
for article in articles:
print(f"Title: {article['title']}")
print(f"Author: {article['author']}")
print(f"Date: {article['date']}")
import requests
from bs4 import BeautifulSoup
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def scrape_website(url, timeout=5):
    """Fetch ``url`` and return the text of its non-empty ``<h1>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list with the stripped text of every ``<h1>`` whose text is not
        empty (the list itself may be empty), or ``None`` if the request
        or the parsing failed. Failures are logged rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError so they are handled below.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        logger.info("Successfully fetched %s", url)
        headings = (h1.text.strip() for h1 in soup.find_all('h1'))
        # Skip headings that are empty after stripping whitespace.
        return [title for title in headings if title]
    except requests.exceptions.Timeout:
        logger.error("Request to %s timed out", url)
        return None
    except requests.exceptions.ConnectionError:
        logger.error("Failed to connect to %s", url)
        return None
    except requests.exceptions.HTTPError as e:
        logger.error("HTTP error: %s", e.response.status_code)
        return None
    except Exception:
        # Last-resort boundary: record the traceback and report failure
        # to the caller through the None return value.
        logger.exception("Unexpected error scraping %s", url)
        return None
# Usage
titles = scrape_website('https://example.com')
if titles:
for title in titles:
print(title)
Always be a good citizen when scraping:
import requests
from bs4 import BeautifulSoup
import time
def scrape_multiple_pages(urls, delay=2):
    """Scrape each URL in turn, sleeping between consecutive requests.

    Args:
        urls: Sequence of page URLs to fetch; its length is used for the
            progress output.
        delay: Seconds to sleep between one request and the next.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts, one per page that
        was fetched successfully. ``title`` is the text of the first
        ``<h1>`` on the page, or ``'N/A'`` when the page has none. Pages
        that fail are reported on stdout and left out of the result.
    """
    results = []
    total = len(urls)
    for i, url in enumerate(urls):
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Look the heading up once instead of once per use.
            heading = soup.find('h1')
            results.append({
                'url': url,
                'title': heading.text if heading else 'N/A',
            })
            print(f"Scraped {i+1}/{total}: {url}")
        except Exception as e:
            # Best-effort batch: one failing page must not abort the rest.
            print(f"Error scraping {url}: {e}")
        # Delay between requests (be respectful!); no sleep after the last URL.
        if i < total - 1:
            time.sleep(delay)
    return results
# Usage
urls = [
'https://example.com/page1',
'https://example.com/page2',
'https://example.com/page3'
]
results = scrape_multiple_pages(urls, delay=3)
import requests
from bs4 import BeautifulSoup
# 1. Use appropriate headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get('https://example.com', headers=headers, timeout=5)
# 2. Check robots.txt
# Every request gets a timeout: without one, requests can wait indefinitely.
robots_response = requests.get('https://example.com/robots.txt', timeout=5)
print(robots_response.text)
# 3. Check terms of service
# Always review the website's ToS before scraping
# 4. Use session for multiple requests
session = requests.Session()
session.headers.update(headers)
page1 = session.get('https://example.com/page1', timeout=5)
page2 = session.get('https://example.com/page2', timeout=5) # Reuses connection
# 5. Implement backoff for errors
def scrape_with_backoff(url, max_retries=3, base_wait=1):
    """Fetch and parse ``url``, retrying failed requests with exponential backoff.

    Args:
        url: Address of the page to fetch.
        max_retries: Maximum number of attempts.
        base_wait: Seconds to wait after the first failed attempt; the wait
            doubles after each further failure.

    Returns:
        A ``BeautifulSoup`` object for the response body, or ``None`` when
        every attempt failed (or when ``max_retries`` is less than 1).
    """
    # Imported locally because the imports accompanying this example only
    # bring in requests and BeautifulSoup, while the retry wait needs time.
    import time

    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException:
            # Only wait if another attempt will follow.
            if attempt < max_retries - 1:
                wait_time = base_wait * (2 ** attempt)
                print(f"Retrying in {wait_time}s...")
                time.sleep(wait_time)
    return None
# 6. Store data responsibly
import json
data = [
{'url': 'example.com', 'title': 'Example'},
{'url': 'example2.com', 'title': 'Example 2'}
]
with open('scraped_data.json', 'w') as f:
json.dump(data, f, indent=2)
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
class WebScraper:
    """Session-based scraper that fetches pages, extracts articles and saves JSON.

    All requests go through one ``requests.Session`` that carries the
    configured headers. Fetch errors are printed and reported to the caller
    as ``None`` instead of being raised.
    """

    def __init__(self, headers=None):
        """Create the session and install the request headers.

        Args:
            headers: Optional dict of HTTP headers. When omitted (or empty),
                a default browser-like ``User-Agent`` header is used.
        """
        self.session = requests.Session()
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.session.headers.update(self.headers)

    def fetch_page(self, url, timeout=5):
        """Fetch a single page and parse it.

        Args:
            url: Address of the page to fetch.
            timeout: Timeout in seconds passed to the session request.

        Returns:
            A ``BeautifulSoup`` object for the response body, or ``None``
            if the request failed or returned an error status.
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_text(self, element, selector):
        """Safely extract stripped text using a CSS selector.

        Args:
            element: Object to search with ``select_one``; may be ``None``.
            selector: CSS selector for the wanted descendant.

        Returns:
            The stripped text of the first match, or ``None`` when
            ``element`` is falsy or nothing matches.
        """
        if not element:
            return None
        found = element.select_one(selector)
        return found.text.strip() if found else None

    def scrape_articles(self, url):
        """Scrape articles from a news site.

        Args:
            url: Address of the page listing the articles.

        Returns:
            A list of dicts with ``title``, ``author``, ``date`` and
            ``link`` keys, one per ``<article>`` that has a title. Returns
            an empty list when the page could not be fetched.
        """
        soup = self.fetch_page(url)
        if not soup:
            return []
        articles = []
        for item in soup.find_all('article'):
            # Look the anchor up once; .get() yields None instead of raising
            # KeyError when the anchor has no href attribute.
            anchor = item.find('a')
            article = {
                'title': self.extract_text(item, 'h2'),
                'author': self.extract_text(item, '.author'),
                'date': self.extract_text(item, 'time'),
                'link': anchor.get('href') if anchor is not None else None,
            }
            # Only add valid articles
            if article['title']:
                articles.append(article)
        return articles

    def save_data(self, data, filename):
        """Save data to a JSON file and print how many items were written.

        Args:
            data: JSON-serializable collection; its length is reported.
            filename: Path of the file to write (overwritten if present).
        """
        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"Saved {len(data)} items to {filename}")
# Usage
scraper = WebScraper()
articles = scraper.scrape_articles('https://news.example.com')
scraper.save_data(articles, 'articles.json')

| Task | Method | Example |
|---|---|---|
| Find first element | `soup.find()` | `soup.find('div', class_='item')` |
| Find all elements | `soup.find_all()` | `soup.find_all('p')` |
| Get text | `.text` | `element.text` |
| Get attribute | `['attr']` or `.get()` | `link['href']` or `link.get('id')` |
| Navigate children | `.` operator | `article.h1.text` |
| CSS selector | `.select_one()` | `soup.select_one('div > p')` |
| Pagination | Loop with URL params | `for page in range(1, 10):` |
| Rate limiting | `time.sleep()` | `time.sleep(2)` |
You've now learned the complete journey of APIs and data extraction:
1. REST API Basics
2. HTTP Requests
3. Status Codes
4. JSON Serialization
5. Error Handling
6. API Authentication
7. Rate Limiting
8. Building APIs
9. Web Scraping
Next steps: Combine these skills to build real-world applications that consume and produce data!
Ready to practice? Try challenges or explore resources
Resources
Ojasa Mirai
Master AI-powered development skills through structured learning, real projects, and verified credentials. Whether you're upskilling your team or launching your career, we deliver the skills companies actually need.
Learn Deep • Build Real • Verify Skills • Launch Forward