Table Of Contents
Quick Fix: Basic HTML Parsing
from bs4 import BeautifulSoup
import requests

# Fetch the page. A timeout is mandatory in practice: requests has no
# default and a hung server would block forever. raise_for_status() fails
# fast on 4xx/5xx instead of silently parsing an error page as content.
response = requests.get('https://example.com', timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find single elements (first match) and all matches by tag name.
title = soup.find('title').text
first_paragraph = soup.find('p').text
all_links = soup.find_all('a')

# CSS selectors — select() returns a list, like find_all().
articles = soup.select('article.post')
nav_links = soup.select('nav > ul > li > a')

# Read attributes with .get(): returns None when the attribute is absent,
# whereas link['href'] would raise KeyError.
for link in all_links:
    print(link.get('href'))
Advanced Selection Techniques
from bs4 import BeautifulSoup
import re

# Multiple search criteria: passing a list matches any of the values.
soup.find_all(['h1', 'h2', 'h3'])            # any of several tags
soup.find_all(class_=['class1', 'class2'])   # any of several classes
soup.find_all(id=['id1', 'id2'])             # any of several IDs


# A callable works as a filter: find_all() passes each Tag to it and
# keeps the tags for which it returns True.
def has_data_attribute(tag):
    """Return True when *tag* carries a data-id attribute."""
    return tag.has_attr('data-id')


elements = soup.find_all(has_data_attribute)

# Regular expressions as match values.
# NOTE: the `text=` keyword was renamed `string=` in BeautifulSoup 4.4;
# `text=` is deprecated and emits a warning in recent releases.
soup.find_all(string=re.compile(r'\$\d+'))
soup.find_all('a', href=re.compile(r'^https://'))

# Stop after the first N matches.
first_5_divs = soup.find_all('div', limit=5)

# Navigating the tree from an element already in hand.
parent = element.parent
siblings = element.find_next_siblings()
children = element.children        # direct children only (generator)
descendants = element.descendants  # all nested elements (generator)
Data Extraction Patterns
# Extract structured data
def extract_product_data(soup):
    """Return a list of product dicts scraped from ``.product-item`` cards.

    Each dict has the keys ``name``, ``price``, ``image``, ``link`` and
    ``rating``. Every field is extracted defensively: the original code
    guarded only ``rating``, so one card missing a name/price/img/a
    sub-element aborted the whole extraction with AttributeError or
    KeyError. A missing sub-element now yields ``None`` for that field.
    """
    def _text(item, selector):
        # Stripped text of the first match, or None when absent.
        found = item.select_one(selector)
        return found.text.strip() if found else None

    def _attr(item, selector, attribute):
        # Attribute value of the first match, or None when absent.
        found = item.select_one(selector)
        return found.get(attribute) if found else None

    products = []
    for item in soup.select('.product-item'):
        products.append({
            'name': _text(item, '.product-name'),
            'price': _text(item, '.price'),
            'image': _attr(item, 'img', 'src'),
            'link': _attr(item, 'a', 'href'),
            'rating': _attr(item, '.rating', 'data-rating'),
        })
    return products
# Handle missing elements safely
def safe_extract(element, selector, attribute=None):
    """Return data for the first *selector* match under *element*.

    With *attribute* set, that attribute's value is returned; otherwise
    the match's stripped text. A failed lookup yields None rather than
    raising, so callers never need try/except around extraction.
    """
    node = element.select_one(selector)
    if not node:
        return None
    return node.get(attribute) if attribute else node.text.strip()
# Parse tables into a list of {header: cell} dicts.
table = soup.find('table')
# Strip header text so dict keys don't carry stray whitespace — the cell
# values below are stripped, so unstripped keys would be inconsistent.
headers = [th.text.strip() for th in table.find_all('th')]
rows = []
for tr in table.find_all('tr')[1:]:  # skip the header row
    row = [td.text.strip() for td in tr.find_all('td')]
    # Skip rows with no <td> (e.g. extra header/separator rows), which
    # would otherwise append useless empty dicts.
    if row:
        rows.append(dict(zip(headers, row)))
# Clean text extraction
def clean_text(element):
    """Return the element's visible text with scripts/styles removed
    and runs of blank space and blank lines collapsed to single spaces."""
    # Drop non-visible content before reading the text.
    for tag in element(['script', 'style']):
        tag.decompose()
    raw = element.get_text()
    words = []
    for line in raw.splitlines():
        for piece in line.strip().split(' '):
            word = piece.strip()
            if word:
                words.append(word)
    return ' '.join(words)
# Performance optimization
# Use the lxml parser — a C-based parser, typically much faster than the
# pure-Python 'html.parser' (requires the third-party lxml package).
soup = BeautifulSoup(html, 'lxml')
# Use SoupStrainer for partial parsing: only elements matching the
# strainer are built into the tree, cutting time and memory on large
# documents.
# NOTE(review): `html` is assumed to hold raw markup from an earlier
# fetch — it is not defined in this snippet.
from bs4 import SoupStrainer
parse_only = SoupStrainer("div", class_="content")
soup = BeautifulSoup(html, "lxml", parse_only=parse_only)
Works with BeautifulSoup 4.12+ and handles malformed HTML gracefully.
Share this article
Add Comment
No comments yet. Be the first to comment!