Table Of Contents
Quick Fix: Basic HTML Parsing
from bs4 import BeautifulSoup
import requests

# Fetch the page. A timeout is mandatory in practice: requests has no
# default and a hung server would block forever. raise_for_status() fails
# fast on 4xx/5xx instead of silently parsing an error page as content.
response = requests.get('https://example.com', timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find single elements (first match) and all matches by tag name.
title = soup.find('title').text
first_paragraph = soup.find('p').text
all_links = soup.find_all('a')

# CSS selectors — select() returns a list, like find_all().
articles = soup.select('article.post')
nav_links = soup.select('nav > ul > li > a')

# Read attributes with .get(): returns None when the attribute is absent,
# whereas link['href'] would raise KeyError.
for link in all_links:
    print(link.get('href'))
Advanced Selection Techniques
from bs4 import BeautifulSoup
import re

# Multiple search criteria: passing a list matches any of the values.
soup.find_all(['h1', 'h2', 'h3'])            # any of several tags
soup.find_all(class_=['class1', 'class2'])   # any of several classes
soup.find_all(id=['id1', 'id2'])             # any of several IDs


# A callable works as a filter: find_all() passes each Tag to it and
# keeps the tags for which it returns True.
def has_data_attribute(tag):
    """Return True when *tag* carries a data-id attribute."""
    return tag.has_attr('data-id')


elements = soup.find_all(has_data_attribute)

# Regular expressions as match values.
# NOTE: the `text=` keyword was renamed `string=` in BeautifulSoup 4.4;
# `text=` is deprecated and emits a warning in recent releases.
soup.find_all(string=re.compile(r'\$\d+'))
soup.find_all('a', href=re.compile(r'^https://'))

# Stop after the first N matches.
first_5_divs = soup.find_all('div', limit=5)

# Navigating the tree from an element already in hand.
parent = element.parent
siblings = element.find_next_siblings()
children = element.children        # direct children only (generator)
descendants = element.descendants  # all nested elements (generator)
Data Extraction Patterns
# Extract structured data
def extract_product_data(soup):
    """Return a list of product dicts scraped from ``.product-item`` cards.

    Each dict has the keys ``name``, ``price``, ``image``, ``link`` and
    ``rating``. Every field is extracted defensively: the original code
    guarded only ``rating``, so one card missing a name/price/img/a
    sub-element aborted the whole extraction with AttributeError or
    KeyError. A missing sub-element now yields ``None`` for that field.
    """
    def _text(item, selector):
        # Stripped text of the first match, or None when absent.
        found = item.select_one(selector)
        return found.text.strip() if found else None

    def _attr(item, selector, attribute):
        # Attribute value of the first match, or None when absent.
        found = item.select_one(selector)
        return found.get(attribute) if found else None

    products = []
    for item in soup.select('.product-item'):
        products.append({
            'name': _text(item, '.product-name'),
            'price': _text(item, '.price'),
            'image': _attr(item, 'img', 'src'),
            'link': _attr(item, 'a', 'href'),
            'rating': _attr(item, '.rating', 'data-rating'),
        })
    return products
# Handle missing elements safely
def safe_extract(element, selector, attribute=None):
    """Return data for the first *selector* match under *element*.

    With *attribute* set, that attribute's value is returned; otherwise
    the match's stripped text. A failed lookup yields None rather than
    raising, so callers never need try/except around extraction.
    """
    node = element.select_one(selector)
    if not node:
        return None
    return node.get(attribute) if attribute else node.text.strip()
# Parse tables into a list of {header: cell} dicts.
table = soup.find('table')
# Strip header text so dict keys don't carry stray whitespace — the cell
# values below are stripped, so unstripped keys would be inconsistent.
headers = [th.text.strip() for th in table.find_all('th')]
rows = []
for tr in table.find_all('tr')[1:]:  # skip the header row
    row = [td.text.strip() for td in tr.find_all('td')]
    # Skip rows with no <td> (e.g. extra header/separator rows), which
    # would otherwise append useless empty dicts.
    if row:
        rows.append(dict(zip(headers, row)))
# Clean text extraction
def clean_text(element):
    """Return the element's visible text with scripts/styles removed
    and runs of blank space and blank lines collapsed to single spaces."""
    # Drop non-visible content before reading the text.
    for tag in element(['script', 'style']):
        tag.decompose()
    raw = element.get_text()
    words = []
    for line in raw.splitlines():
        for piece in line.strip().split(' '):
            word = piece.strip()
            if word:
                words.append(word)
    return ' '.join(words)
# Performance optimization
# Use the lxml parser — a C-based parser, typically much faster than the
# pure-Python 'html.parser' (requires the third-party lxml package).
soup = BeautifulSoup(html, 'lxml')
# Use SoupStrainer for partial parsing: only elements matching the
# strainer are built into the tree, cutting time and memory on large
# documents.
# NOTE(review): `html` is assumed to hold raw markup from an earlier
# fetch — it is not defined in this snippet.
from bs4 import SoupStrainer
parse_only = SoupStrainer("div", class_="content")
soup = BeautifulSoup(html, "lxml", parse_only=parse_only)
Works with BeautifulSoup 4.12+ and handles malformed HTML gracefully.
Share this article
Add Comment
No comments yet. Be the first to comment!