Navigation

Python

How to Handle CAPTCHA in Web Scraping

Approach CAPTCHA challenges ethically in 2025: detection, avoidance strategies, and manual intervention workflows

Table Of Contents

Quick Fix: CAPTCHA Detection and Manual Handling

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def detect_captcha(driver, extra_indicators=()):
    """Heuristically report whether the current page shows a CAPTCHA.

    The lower-cased page source is searched for CAPTCHA-specific markers,
    then the DOM is queried for elements whose ``class``/``id`` mention a
    CAPTCHA widget.

    Bare words such as "robot", "verify" or "challenge" are deliberately
    NOT used as markers: they appear on ordinary pages (for example
    ``<meta name="robots">`` or "verify your email"), which made every such
    page look like a CAPTCHA and caused manual-solve prompts to repeat
    forever.

    Args:
        driver: Selenium WebDriver (anything exposing ``page_source`` and
            ``find_elements``).
        extra_indicators: Optional iterable of additional site-specific
            substrings to treat as CAPTCHA markers (matched
            case-insensitively). Empty strings are ignored.

    Returns:
        True if a CAPTCHA marker or element was found, otherwise False.
    """
    # Heuristic markers; all are matched against lower-cased HTML.
    captcha_indicators = (
        "captcha",               # also matches "recaptcha", "hcaptcha", "g-recaptcha"
        "cf-turnstile",          # Cloudflare Turnstile widget container class
        "challenge-form",        # interstitial challenge form (heuristic; verify per site)
        "verify you are human",  # phrase-level replacement for bare "verify"
        "not a robot",           # phrase-level replacement for bare "robot"
    )
    custom_markers = tuple(
        marker.lower() for marker in extra_indicators if marker
    )

    page_source = driver.page_source.lower()

    if any(marker in page_source
           for marker in captcha_indicators + custom_markers):
        return True

    # Check for common CAPTCHA elements
    captcha_elements = driver.find_elements(
        By.CSS_SELECTOR,
        '[class*="captcha"], [id*="captcha"], [class*="recaptcha"]'
    )

    return len(captcha_elements) > 0

def handle_captcha_manually(driver):
    """Block until a human has cleared any CAPTCHA on the current page.

    While ``detect_captcha(driver)`` reports a CAPTCHA, the operator is
    prompted on the console to solve it in the browser and press Enter.
    After a short settle delay the page is checked again; the prompt is
    repeated for as long as a CAPTCHA is still detected.

    Args:
        driver: Selenium WebDriver showing the page to check.

    Returns:
        Always True, once no CAPTCHA is detected (immediately if none was
        present to begin with).
    """
    while detect_captcha(driver):
        print("CAPTCHA detected! Please solve it manually.")
        print("Current URL:", driver.current_url)

        # Hand control to the operator until they confirm.
        input("Press Enter after solving the CAPTCHA...")

        # Give the page a moment to update before re-checking.
        time.sleep(2)
        if not detect_captcha(driver):
            break

        print("CAPTCHA still present. Please try again.")

    return True

CAPTCHA Avoidance Strategies

import random
import time
from selenium.webdriver.common.action_chains import ActionChains

class CaptchaAvoider:
    """Pacing and browsing-behaviour helpers wrapped around a WebDriver.

    Attributes:
        driver: The Selenium WebDriver being controlled.
        session_requests: Requests counted since the last long break.
        last_request_time: ``time.time()`` of the most recent paced request.
    """

    def __init__(self, driver):
        self.driver = driver
        self.session_requests = 0
        self.last_request_time = time.time()

    def human_like_behavior(self):
        """Move the pointer to three random spots, then scroll down a bit.

        Each move targets an absolute position inside the current viewport.
        The previous implementation chained relative ``move_by_offset``
        calls of 100-800 px on one reused ``ActionChains``; the offsets
        accumulate, so the pointer soon left the viewport and WebDriver
        raised a move-target-out-of-bounds error.
        """
        # Local import keeps this block self-contained within the file.
        from selenium.webdriver.common.actions.action_builder import (
            ActionBuilder,
        )

        width, height = self.driver.execute_script(
            "return [window.innerWidth, window.innerHeight];"
        )
        # Largest valid viewport coordinates (never negative).
        max_x = max(int(width) - 1, 0)
        max_y = max(int(height) - 1, 0)

        # Move mouse randomly, keeping the original 100-800 / 100-600
        # ranges but clamped to what the viewport can actually hold.
        for _ in range(3):
            x = random.randint(min(100, max_x), min(800, max_x))
            y = random.randint(min(100, max_y), min(600, max_y))

            # A fresh builder per move: nothing queued earlier is replayed.
            pointer = ActionBuilder(self.driver)
            pointer.pointer_action.move_to_location(x, y)
            pointer.perform()
            time.sleep(random.uniform(0.1, 0.3))

        # Random scrolling
        scroll_amount = random.randint(200, 800)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.5, 1.5))

    def pace_requests(self):
        """Throttle the caller; call once before each request.

        Counts the request, sleeps 5-10 minutes once more than 50 requests
        have been made since the last long break, and otherwise ensures a
        random 2-5 second gap since the previous paced request.
        """
        # Limit requests per session
        self.session_requests += 1

        if self.session_requests > 50:
            print("Taking a longer break to avoid detection...")
            time.sleep(random.uniform(300, 600))  # 5-10 minutes
            self.session_requests = 0

        # Random delays between requests; only the remaining part of the
        # delay is slept, so time already spent elsewhere counts.
        elapsed = time.time() - self.last_request_time
        min_delay = random.uniform(2, 5)

        if elapsed < min_delay:
            time.sleep(min_delay - elapsed)

        self.last_request_time = time.time()

    def rotate_session(self):
        """Delete all cookies and override ``navigator.userAgent`` via JS.

        NOTE(review): the override is installed with ``execute_script`` in
        the currently loaded document only. Nothing here changes the
        User-Agent header the browser sends, and the override presumably
        does not survive navigation -- confirm before relying on it.
        """
        # Clear cookies and restart session
        self.driver.delete_all_cookies()

        # Change user agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        ]

        self.driver.execute_script(
            f"Object.defineProperty(navigator, 'userAgent', {{get: () => '{random.choice(user_agents)}'}});"
        )

# Polite scraping to avoid CAPTCHAs
class PoliteScraper:
    """Tracks request volume and session age to decide when to pause.

    ``request_count`` is not incremented by this class; callers are
    expected to maintain it.
    """

    def __init__(self):
        self.request_count = 0
        self.start_time = time.time()

    def should_take_break(self):
        """Return True after more than 100 requests or over an hour of work."""
        too_many_requests = self.request_count > 100
        if too_many_requests:
            return True

        session_age = time.time() - self.start_time
        return session_age > 3600  # 1 hour

    def take_break(self):
        """Sleep for a random 10-30 minutes, then reset both counters."""
        break_time = random.uniform(600, 1800)  # seconds
        minutes = break_time / 60
        print(f"Taking a {minutes:.1f} minute break...")
        time.sleep(break_time)

        # Start a fresh session window.
        self.start_time = time.time()
        self.request_count = 0

CAPTCHA Service Integration (Educational)

# Note: Use CAPTCHA solving services ethically and legally
import requests
import base64

class CaptchaSolver:
    """Minimal client for the 2captcha ``in.php`` / ``res.php`` endpoints.

    This is for educational purposes only. Always respect website terms
    of service.

    Args:
        api_key: Account key sent with every request.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error (default 30). Without a timeout a
            stalled connection would block the caller indefinitely.
    """

    def __init__(self, api_key, timeout=30):
        self.api_key = api_key
        # HTTPS: the API key travels in every request and must not be
        # sent in clear text.
        self.base_url = "https://2captcha.com"
        self.timeout = timeout

    @staticmethod
    def _ok_payload(text):
        """Return everything after ``OK|`` in *text*, or None otherwise.

        Only the first ``|`` is treated as the separator so that a payload
        which itself contains ``|`` is returned intact.
        """
        status, separator, payload = text.partition('|')
        if status == 'OK' and separator:
            return payload
        return None

    def solve_image_captcha(self, image_path):
        """Upload the image at *image_path* and wait for its solution.

        Args:
            image_path: Path of the CAPTCHA image file to submit.

        Returns:
            The solution text, or None if the submission was rejected or
            no solution arrived in time.

        Raises:
            OSError: If the image file cannot be read.
            requests.RequestException: On network failure or timeout.
        """
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode()

        # Submit CAPTCHA
        submit_data = {
            'key': self.api_key,
            'method': 'base64',
            'body': image_data
        }

        response = requests.post(
            f"{self.base_url}/in.php",
            data=submit_data,
            timeout=self.timeout,
        )

        captcha_id = self._ok_payload(response.text)
        if captcha_id is None:
            return None

        return self.get_captcha_result(captcha_id)

    def get_captcha_result(self, captcha_id):
        """Poll for the solution of a previously submitted CAPTCHA.

        Polls every 5 seconds, at most 30 times (about 150 seconds of
        waiting, plus request time).

        Args:
            captcha_id: Identifier returned when the CAPTCHA was submitted.

        Returns:
            The solution text, or None on an error reply or when the
            polling budget is exhausted.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        for _ in range(30):  # Wait up to 150 seconds
            time.sleep(5)

            response = requests.get(
                f"{self.base_url}/res.php",
                params={'key': self.api_key, 'action': 'get', 'id': captcha_id},
                timeout=self.timeout,
            )

            # "CAPCHA" (sic) is the literal status string being compared
            # against; do not "correct" the spelling.
            if response.text == 'CAPCHA_NOT_READY':
                continue

            # Either "OK|<answer>" or an error code; errors map to None.
            return self._ok_payload(response.text)

        return None

# Ethical CAPTCHA handling workflow
class EthicalCaptchaHandler:
    """Console workflow that caps how many CAPTCHAs a run will tolerate.

    Attributes:
        captcha_encounters: Number of CAPTCHAs reported so far.
        max_captcha_attempts: Encounters allowed before the handler
            refuses to continue.
    """

    def __init__(self):
        self.captcha_encounters = 0
        self.max_captcha_attempts = 3

    def handle_captcha_encounter(self, driver):
        """Record one CAPTCHA encounter and ask the operator what to do.

        Args:
            driver: Selenium WebDriver showing the CAPTCHA page.

        Returns:
            True if the operator solved the CAPTCHA manually; False if the
            encounter limit was exceeded or the operator chose to skip.
        """
        self.captcha_encounters += 1

        limit_exceeded = self.captcha_encounters > self.max_captcha_attempts
        if limit_exceeded:
            advice = (
                "Too many CAPTCHAs encountered. Stopping scraping.",
                "Consider:",
                "1. Reducing request frequency",
                "2. Using different IP addresses",
                "3. Contacting website for API access",
            )
            for line in advice:
                print(line)
            return False

        print(f"CAPTCHA encountered (#{self.captcha_encounters})")

        # Anything other than "m" (case-insensitive) counts as skip.
        answer = input("Solve manually (m) or skip (s)? ").lower()
        if answer != 'm':
            print("Skipping this request...")
            return False

        return self.manual_solve(driver)

    def manual_solve(self, driver):
        """Prompt until ``detect_captcha(driver)`` no longer finds a CAPTCHA.

        Args:
            driver: Selenium WebDriver showing the CAPTCHA page.

        Returns:
            True once the CAPTCHA is no longer detected.
        """
        while True:
            print("Please solve the CAPTCHA in the browser window.")
            input("Press Enter when completed...")

            # Verify solution
            still_present = detect_captcha(driver)
            if not still_present:
                print("CAPTCHA solved successfully!")
                return True

            print("CAPTCHA still present. Please try again.")

Always respect each website's terms of service, and prefer an official API over scraping whenever one is available.

Share this article

Add Comment

No comments yet. Be the first to comment!

More from Python