How to Handle Anti-Scraping Measures

Handle common anti-scraping defenses ethically in 2025: rate limiting, realistic headers, proxy rotation, and browser fingerprinting

Quick Fix: Basic Anti-Detection Setup

import requests
from time import sleep
import random

# Rotate user agents
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]

headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

url = 'https://example.com/page'  # placeholder target URL

# Randomized delay to mimic human pacing
sleep(random.uniform(1, 3))

response = requests.get(url, headers=headers, timeout=10)
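
The same ingredients can be wrapped into a small helper so every request gets a freshly rotated User-Agent and its own randomized pause. A minimal sketch building on the setup above (the helper name and URLs are placeholders):

def polite_get(url, session=None):
    """Fetch a URL with a rotated User-Agent and a randomized delay."""
    h = dict(headers, **{'User-Agent': random.choice(user_agents)})
    sleep(random.uniform(1, 3))
    client = session or requests
    return client.get(url, headers=h, timeout=10)

for page in ['https://example.com/p1', 'https://example.com/p2']:
    print(polite_get(page).status_code)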

Advanced Evasion Techniques

# Use sessions for cookies
session = requests.Session()
session.headers.update(headers)

# Handle rate limiting: allow at most `calls` calls every `period` seconds
from functools import wraps
import time

def rate_limit(calls=1, period=1):
    min_interval = period / calls  # minimum spacing between calls

    def decorator(func):
        last_called = [0.0]

        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            left_to_wait = min_interval - elapsed
            if left_to_wait > 0:
                time.sleep(left_to_wait)
            ret = func(*args, **kwargs)
            last_called[0] = time.time()
            return ret
        return wrapper
    return decorator

@rate_limit(calls=1, period=2)
def fetch_page(url):
    return session.get(url)
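
# Usage sketch (URLs are placeholders): with calls=1, period=2,
# successive fetches are automatically spaced about 2 seconds apart
for page in ['https://example.com/a', 'https://example.com/b']:
    resp = fetch_page(page)  # blocks until the rate limit allows the call
    print(resp.status_code)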

# Proxy rotation
proxies = [
    'http://proxy1.com:8000',
    'http://proxy2.com:8000'
]

choice = random.choice(proxies)
proxy = {'http': choice, 'https': choice}  # route both schemes through the proxy
response = requests.get(url, proxies=proxy, timeout=10)
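
# Sketch: try proxies in random order and skip dead ones
# (the proxy URLs above are placeholders; real pools need auth and health checks)
def fetch_via_proxy(url, pool):
    for candidate in random.sample(pool, len(pool)):
        try:
            p = {'http': candidate, 'https': candidate}
            return requests.get(url, headers=headers, proxies=p, timeout=10)
        except requests.RequestException:
            continue  # proxy unreachable or blocked; try the next one
    raise RuntimeError("all proxies failed")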

# Selenium with stealth
from selenium import webdriver
from selenium_stealth import stealth

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=options)

stealth(driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)
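
A quick sanity check for the stealth patches is to read navigator.webdriver from the page context: unpatched automation reports true, while the patched driver should report None or False. A short illustrative check (the URL is a placeholder):

driver.get('https://example.com')
print(driver.execute_script("return navigator.webdriver"))  # expect None/False after stealth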

Handle Specific Protections

# Cloudflare bypass
import cloudscraper

scraper = cloudscraper.create_scraper()
response = scraper.get(url)
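
# cloudscraper can also be given a browser profile to shape its fingerprint;
# this uses the library's documented `browser` option with illustrative values
scraper = cloudscraper.create_scraper(
    browser={'browser': 'chrome', 'platform': 'windows', 'mobile': False}
)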

# Handle JavaScript challenges
from requests_html import HTMLSession

html_session = HTMLSession()  # distinct name so it doesn't shadow the requests session above
r = html_session.get(url)
r.html.render()  # downloads Chromium on first use, then executes the page's JavaScript
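
# Once rendered, the parsed DOM includes JS-generated content
# (the selector here is a placeholder for whatever the page actually uses)
title = r.html.find('title', first=True)
print(title.text if title else 'no title found')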

# Captcha handling (manual intervention)
def handle_captcha(driver):
    if "captcha" in driver.page_source.lower():
        print("CAPTCHA detected. Please solve manually.")
        input("Press Enter after solving...")
        return driver.page_source
    return None
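
# Usage sketch: pair with a visible (non-headless) driver so a human can
# actually solve the challenge; `url` is a placeholder
driver.get(url)
page = handle_captcha(driver)
if page is None:
    page = driver.page_source  # no CAPTCHA detected; proceed normally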

# Respect robots.txt
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

if rp.can_fetch("*", url):
    # Allowed by robots.txt; safe to scrape
    response = requests.get(url, headers=headers, timeout=10)
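
# robots.txt can also declare a crawl delay; honoring it is part of polite scraping
# (crawl_delay is part of urllib.robotparser)
delay = rp.crawl_delay("*")
if delay:
    time.sleep(delay)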

# Retry with backoff (handles HTTP 429 throttling and transient failures)
def get_with_retry(url, max_retries=3):
    for i in range(max_retries):
        try:
            response = session.get(url, timeout=10)
            if response.status_code == 200:
                return response
            elif response.status_code == 429:
                # Honor the server's Retry-After header; default to 60s
                wait_time = int(response.headers.get('Retry-After', 60))
                time.sleep(wait_time)
            else:
                time.sleep(2 ** i)  # other error status: back off and retry
        except requests.RequestException:
            if i == max_retries - 1:
                raise
            time.sleep(2 ** i)  # exponential backoff
    raise RuntimeError(f"Failed to fetch {url} after {max_retries} attempts")
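
The retry helper composes with the session configured earlier; a short usage sketch (the URL is a placeholder):

resp = get_with_retry('https://example.com/data')
print(resp.status_code, len(resp.text))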

These techniques work against many modern anti-bot systems, but none is guaranteed and defenses change constantly. Always respect a site's robots.txt and terms of service.
