Table of Contents
- Quick Fix: CAPTCHA Detection and Manual Handling
- CAPTCHA Avoidance Strategies
- CAPTCHA Service Integration (Educational)
Quick Fix: CAPTCHA Detection and Manual Handling
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
def detect_captcha(driver):
    """Heuristically report whether the current page shows a CAPTCHA.

    The page source is searched (case-insensitively) for CAPTCHA widget
    names and for phrases typical of a human-verification prompt; if none
    is found, the DOM is queried for elements whose class or id mentions
    a CAPTCHA.

    Args:
        driver: Selenium WebDriver (anything exposing ``page_source`` and
            ``find_elements``) positioned on the page to inspect.

    Returns:
        bool: True if any marker or matching element is found, else False.
    """
    # Widget names are specific enough to match as plain substrings.
    # ("captcha" already covers the other two; they stay listed for clarity.)
    captcha_markers = ("captcha", "recaptcha", "hcaptcha")
    # Bare words such as "robot", "verify" or "challenge" also occur on
    # ordinary pages (e.g. <meta name="robots">), which made every such page
    # look like a CAPTCHA. Only whole verification phrases are accepted.
    challenge_phrases = (
        "not a robot",
        "are you a robot",
        "verify you are human",
        "verify you are a human",
        "verify that you are human",
        "human verification",
    )

    page_source = (driver.page_source or "").lower()
    if any(marker in page_source
           for marker in captcha_markers + challenge_phrases):
        return True

    # Check for common CAPTCHA elements
    captcha_elements = driver.find_elements(
        By.CSS_SELECTOR,
        '[class*="captcha"], [id*="captcha"], [class*="recaptcha"]',
    )
    return bool(captcha_elements)
def handle_captcha_manually(driver, max_attempts=None):
    """Block until a person has solved the CAPTCHA on the current page.

    While ``detect_captcha(driver)`` reports a CAPTCHA, the user is asked
    on stdin to solve it in the browser and press Enter; the page is then
    re-checked after a short pause.

    Args:
        driver: Selenium WebDriver showing the page.
        max_attempts: Optional cap on the number of prompts. ``None``
            (the default) keeps prompting until the CAPTCHA is gone.

    Returns:
        bool: True once no CAPTCHA is detected (including when none was
        present to begin with); False only if ``max_attempts`` prompts
        were used up and a CAPTCHA is still detected.
    """
    attempts = 0
    # A loop instead of self-recursion: retries no longer grow the call stack.
    while detect_captcha(driver):
        if max_attempts is not None and attempts >= max_attempts:
            return False
        attempts += 1

        print("CAPTCHA detected! Please solve it manually.")
        print("Current URL:", driver.current_url)
        # Wait for user to solve CAPTCHA
        input("Press Enter after solving the CAPTCHA...")

        # Short pause before verifying that the CAPTCHA is gone.
        time.sleep(2)
        if not detect_captcha(driver):
            break
        print("CAPTCHA still present. Please try again.")
    return True
CAPTCHA Avoidance Strategies
import random
import time
from selenium.webdriver.common.action_chains import ActionChains
class CaptchaAvoider:
    """Adds pointer/scroll activity and request pacing to a Selenium session.

    Attributes:
        driver: The Selenium WebDriver being driven.
        session_requests: Requests counted since the last long break.
        last_request_time: ``time.time()`` of the most recent paced request.
    """

    def __init__(self, driver):
        """Store the driver and start the request counters."""
        self.driver = driver
        self.session_requests = 0
        self.last_request_time = time.time()

    def human_like_behavior(self):
        """Move the pointer to three random spots, then scroll down once."""
        # Targets are absolute viewport coordinates clamped to the viewport
        # size. The previous relative offsets (100-800 px each, cumulative)
        # walked the pointer off-screen after a move or two.
        viewport = self.driver.execute_script(
            "return [window.innerWidth, window.innerHeight];"
        )
        max_x = max(int(viewport[0]) - 1, 0)
        max_y = max(int(viewport[1]) - 1, 0)

        for _ in range(3):
            x = random.randint(min(100, max_x), min(800, max_x))
            y = random.randint(min(100, max_y), min(600, max_y))
            # Fresh chain per move: a reused ActionChains can keep its queued
            # actions after perform(), replaying earlier moves.
            actions = ActionChains(self.driver)
            actions.w3c_actions.pointer_action.move_to_location(x, y)
            actions.perform()
            time.sleep(random.uniform(0.1, 0.3))

        # Random scrolling
        scroll_amount = random.randint(200, 800)
        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.5, 1.5))

    def pace_requests(self):
        """Count one request and sleep as needed to keep requests spaced out.

        After more than 50 requests a 5-10 minute pause is taken and the
        counter resets. Independently, a random 2-5 second minimum gap since
        the previous call is enforced.
        """
        # Limit requests per session
        self.session_requests += 1
        if self.session_requests > 50:
            print("Taking a longer break to avoid detection...")
            time.sleep(random.uniform(300, 600))  # 5-10 minutes
            self.session_requests = 0

        # Random delays between requests
        elapsed = time.time() - self.last_request_time
        min_delay = random.uniform(2, 5)
        if elapsed < min_delay:
            time.sleep(min_delay - elapsed)
        self.last_request_time = time.time()

    def rotate_session(self):
        """Delete all cookies and override ``navigator.userAgent`` via script.

        The browser itself is not restarted.
        """
        self.driver.delete_all_cookies()

        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        ]
        # NOTE(review): this redefines the property through injected script in
        # the current document only; it presumably does not change the
        # User-Agent request header or survive navigation -- confirm.
        self.driver.execute_script(
            f"Object.defineProperty(navigator, 'userAgent', {{get: () => '{random.choice(user_agents)}'}});"
        )
# Polite scraping to avoid CAPTCHAs
class PoliteScraper:
    """Tracks scraping volume and session length to decide when to pause.

    ``request_count`` is only reset here; callers are expected to increment
    it themselves for each request they make.
    """

    def __init__(self):
        """Start with zero requests and the session clock at now."""
        self.request_count = 0
        self.start_time = time.time()

    def should_take_break(self):
        """Return True after more than 100 requests or over an hour of work."""
        return (
            self.request_count > 100
            or time.time() - self.start_time > 3600  # 1 hour
        )

    def take_break(self):
        """Sleep for a random 10-30 minutes, then reset both trackers."""
        pause_seconds = random.uniform(600, 1800)
        print(f"Taking a {pause_seconds/60:.1f} minute break...")
        time.sleep(pause_seconds)
        self.request_count = 0
        self.start_time = time.time()
CAPTCHA Service Integration (Educational)
# Note: Use CAPTCHA solving services ethically and legally
import requests
import base64
class CaptchaSolver:
    """Small client for the 2captcha image-CAPTCHA HTTP API.

    For educational purposes only; always respect website terms of service.

    Attributes:
        api_key: Account key sent with every request.
        base_url: Root URL of the service.
        timeout: Per-request timeout in seconds passed to ``requests``.
    """

    def __init__(self, api_key, timeout=30):
        """Store the API key and the per-request timeout (seconds)."""
        self.api_key = api_key
        # HTTPS so the API key and image are not sent in clear text.
        self.base_url = "https://2captcha.com"
        self.timeout = timeout

    @staticmethod
    def _ok_payload(text):
        """Return the payload of an ``OK|<payload>`` reply, or None.

        Everything after the first '|' is kept, so a payload that itself
        contains '|' is not truncated.
        """
        status, separator, payload = text.partition('|')
        if status == 'OK' and separator:
            return payload
        return None

    def solve_image_captcha(self, image_path):
        """Upload an image CAPTCHA and wait for its solution.

        Args:
            image_path: Path of the image file to submit.

        Returns:
            The solution text, or None if the upload was rejected or no
            solution arrived in time.

        Raises:
            OSError: If the image file cannot be read.
            requests.RequestException: On network failure or timeout.
        """
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode()

        # Submit CAPTCHA
        submit_data = {
            'key': self.api_key,
            'method': 'base64',
            'body': image_data
        }
        response = requests.post(
            f"{self.base_url}/in.php",
            data=submit_data,
            timeout=self.timeout,  # without it a stalled connection hangs forever
        )
        captcha_id = self._ok_payload(response.text)
        if captcha_id is None:
            return None
        return self.get_captcha_result(captcha_id)

    def get_captcha_result(self, captcha_id):
        """Poll for the solution of a submitted CAPTCHA.

        Polls every 5 seconds, up to 30 times (about 150 seconds of waiting).

        Args:
            captcha_id: Identifier returned by the submit call.

        Returns:
            The solution text, or None on an error reply or when polling
            runs out.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        for _ in range(30):  # Wait up to 150 seconds
            time.sleep(5)
            response = requests.get(
                f"{self.base_url}/res.php",
                params={'key': self.api_key, 'action': 'get', 'id': captcha_id},
                timeout=self.timeout,
            )
            # NOTE(review): 'CAPCHA' (no T) appears to be the service's literal
            # status string -- check the API docs before "correcting" it.
            if response.text == 'CAPCHA_NOT_READY':
                continue
            # Any other reply is final: a solution, or an error (-> None).
            return self._ok_payload(response.text)
        return None
# Ethical CAPTCHA handling workflow
class EthicalCaptchaHandler:
    """Interactive CAPTCHA workflow that gives up after repeated encounters.

    Attributes:
        captcha_encounters: Number of CAPTCHAs reported so far.
        max_captcha_attempts: Encounters tolerated before scraping stops.
    """

    def __init__(self):
        """Start with no encounters and a limit of three."""
        self.captcha_encounters = 0
        self.max_captcha_attempts = 3

    def handle_captcha_encounter(self, driver):
        """Record a CAPTCHA encounter and let the user solve or skip it.

        Args:
            driver: Selenium WebDriver showing the CAPTCHA page.

        Returns:
            bool: True if the user solved the CAPTCHA; False if the request
            was skipped or the encounter limit has been exceeded.
        """
        self.captcha_encounters += 1

        if self.captcha_encounters > self.max_captcha_attempts:
            print("Too many CAPTCHAs encountered. Stopping scraping.")
            print("Consider:")
            print("1. Reducing request frequency")
            print("2. Using different IP addresses")
            print("3. Contacting website for API access")
            return False

        print(f"CAPTCHA encountered (#{self.captcha_encounters})")

        # Manual solving; strip() so stray spaces around "m" are not a skip.
        choice = input("Solve manually (m) or skip (s)? ").strip().lower()
        if choice == 'm':
            return self.manual_solve(driver)

        print("Skipping this request...")
        return False

    def manual_solve(self, driver):
        """Prompt until ``detect_captcha`` no longer reports a CAPTCHA.

        Args:
            driver: Selenium WebDriver showing the CAPTCHA page.

        Returns:
            bool: Always True; the method only returns once the check passes.
        """
        # A loop instead of self-recursion: retries no longer grow the call stack.
        while True:
            print("Please solve the CAPTCHA in the browser window.")
            input("Press Enter when completed...")

            # Verify solution
            if not detect_captcha(driver):
                print("CAPTCHA solved successfully!")
                return True
            print("CAPTCHA still present. Please try again.")
Always respect a website's terms of service, and consider requesting API access instead of scraping.
Share this article
Add Comment
No comments yet. Be the first to comment!