Quick Fix: Basic Anti-Detection Setup
import requests
from time import sleep
import random

url = 'https://example.com'  # placeholder target URL

# Rotate user agents so every request doesn't carry the same browser signature
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]

# Send the headers a real browser would send
headers = {
    'User-Agent': random.choice(user_agents),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# Add a random delay so request timing looks human
sleep(random.uniform(1, 3))
response = requests.get(url, headers=headers)
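For repeated fetches it helps to wrap this pattern in a single helper. A minimal sketch building on the names defined above (`polite_get` is our own helper name, not part of any library):

def polite_get(target_url):
    # Pick a fresh user agent per request and pause before fetching
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(user_agents)
    sleep(random.uniform(1, 3))
    return requests.get(target_url, headers=request_headers, timeout=10)

page = polite_get(url)
print(page.status_code)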
Advanced Evasion Techniques
# Use a session so cookies persist across requests
session = requests.Session()
session.headers.update(headers)

# Handle rate limiting: enforce a minimum gap between calls
from functools import wraps
import time

def rate_limit(period=1.0):
    """Allow at most one call every `period` seconds."""
    def decorator(func):
        last_called = [0.0]

        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            left_to_wait = period - elapsed
            if left_to_wait > 0:
                time.sleep(left_to_wait)
            ret = func(*args, **kwargs)
            last_called[0] = time.time()
            return ret
        return wrapper
    return decorator

@rate_limit(period=2)
def fetch_page(url):
    return session.get(url)
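With the decorator in place, even a tight loop is throttled to one request every two seconds. A quick usage sketch (the page URLs are placeholders):

pages = [
    'https://example.com/page/1',
    'https://example.com/page/2',
    'https://example.com/page/3',
]
for page_url in pages:
    resp = fetch_page(page_url)  # sleeps as needed before each call
    print(page_url, resp.status_code)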
# Proxy rotation (replace with real proxy endpoints)
proxies = [
    'http://proxy1.com:8000',
    'http://proxy2.com:8000',
]

# Route both HTTP and HTTPS traffic through the chosen proxy
proxy_url = random.choice(proxies)
proxy = {'http': proxy_url, 'https': proxy_url}
response = requests.get(url, proxies=proxy, timeout=10)
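Proxies in a pool fail often, so rotation works best with a retry on connection errors. A minimal sketch assuming the `proxies` list above; `fetch_via_proxy` is our own helper name:

def fetch_via_proxy(target_url, attempts=3):
    """Try the request through up to `attempts` randomly chosen proxies."""
    for _ in range(attempts):
        chosen = random.choice(proxies)
        try:
            return requests.get(
                target_url,
                proxies={'http': chosen, 'https': chosen},
                timeout=10,
            )
        except requests.RequestException:
            continue  # dead proxy, try another
    raise RuntimeError('All proxy attempts failed')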
# Selenium with stealth patches to mask automation fingerprints
from selenium import webdriver
from selenium_stealth import stealth

options = webdriver.ChromeOptions()
options.add_argument("--headless")
# Hide the "Chrome is being controlled by automated software" signals
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=options)
stealth(
    driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)
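After the stealth patches, the driver behaves like a normal Selenium session. A short check that fetches a page and inspects navigator.webdriver, one of the flags anti-bot scripts commonly probe:

driver.get('https://example.com')
# After stealth patching this should print None/False rather than True
print('navigator.webdriver =', driver.execute_script('return navigator.webdriver'))
print(driver.title)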
Handle Specific Protections
# Cloudflare bypass: cloudscraper solves the JS challenge pages transparently
import cloudscraper

scraper = cloudscraper.create_scraper()
response = scraper.get(url)
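cloudscraper can also be asked to mimic a specific browser profile; a sketch, assuming the `browser` keyword accepts the fields documented in cloudscraper's README:

scraper = cloudscraper.create_scraper(
    browser={'browser': 'chrome', 'platform': 'windows', 'mobile': False}
)
response = scraper.get(url)
print(response.status_code)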
# Handle JavaScript challenges by actually rendering the page
from requests_html import HTMLSession

html_session = HTMLSession()  # kept separate from the requests session above
r = html_session.get(url)
r.html.render()  # downloads Chromium on first use, then executes JavaScript
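Once rendered, the DOM produced by JavaScript can be queried like static HTML:

# Query the rendered DOM rather than the raw response body
title = r.html.find('title', first=True)
print(title.text if title else 'no title found')
print(len(r.html.absolute_links), 'links on the page')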
# Captcha handling (manual intervention)
def handle_captcha(driver):
    if "captcha" in driver.page_source.lower():
        print("CAPTCHA detected. Please solve manually.")
        input("Press Enter after solving...")
        return driver.page_source
    return None
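In use, the check wraps each navigation so the scrape pauses whenever a challenge appears (the URL is a placeholder):

driver.get('https://example.com/protected')
page = handle_captcha(driver)
if page is None:
    page = driver.page_source  # no CAPTCHA was present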
# Respect robots.txt
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

if rp.can_fetch("*", url):
    # Safe to scrape
    response = requests.get(url)
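RobotFileParser also exposes any declared crawl delay; a sketch that honors it, falling back to a one-second pause of our own choosing:

delay = rp.crawl_delay("*")  # None if robots.txt declares no Crawl-delay
time.sleep(delay if delay is not None else 1)
response = requests.get(url, headers=headers)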
# Retry with exponential backoff, honoring 429 rate-limit responses
def get_with_retry(url, max_retries=3):
    for i in range(max_retries):
        try:
            response = session.get(url, timeout=10)
            if response.status_code == 200:
                return response
            elif response.status_code == 429:
                # Server is rate limiting; wait as long as it asks
                wait_time = int(response.headers.get('Retry-After', 60))
                time.sleep(wait_time)
        except requests.RequestException:
            if i == max_retries - 1:
                raise
            time.sleep(2 ** i)  # Exponential backoff
    return None
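Tying the pieces together, a crawl loop that checks robots.txt before each fetch and retries politely (URLs are placeholders):

for page_url in ['https://example.com/a', 'https://example.com/b']:
    if not rp.can_fetch("*", page_url):
        continue  # disallowed by robots.txt
    resp = get_with_retry(page_url)
    if resp is not None:
        print(page_url, len(resp.text), 'bytes')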
These techniques handle many modern anti-bot systems, but none is guaranteed to keep working as defenses evolve. Always respect website terms of service and robots.txt.