Quick Fix: Basic Debugging Setup
import logging
import traceback

import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Configure logging to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraping.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def debug_request(url):
    """Fetch a URL and log status, headers, and error details."""
    try:
        response = requests.get(url, timeout=10)
        logger.info(f"Request to {url}: {response.status_code}")
        logger.info(f"Response headers: {dict(response.headers)}")
        if response.status_code != 200:
            logger.error(f"HTTP Error: {response.status_code}")
            logger.error(f"Response text: {response.text[:500]}")
        return response
    except Exception as e:
        logger.error(f"Request failed: {e}")
        logger.error(traceback.format_exc())
        return None
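To use the helper, point it at the URL that is misbehaving and read scraping.log. A quick sketch; the httpbin.org endpoint is used here purely as a stand-in for a blocked page:

# Example usage: /status/403 simulates a blocked request
response = debug_request("https://httpbin.org/status/403")
if response is not None and response.ok:
    print(f"Fetched {len(response.text)} bytes")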
Advanced Error Handling
from functools import wraps
import time
import random

def retry_on_failure(max_retries=3, delay=1):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    logger.warning(f"Attempt {attempt + 1} failed: {e}")
                    if attempt == max_retries - 1:
                        logger.error(f"All {max_retries} attempts failed")
                        raise
                    # Exponential backoff with jitter
                    sleep_time = delay * (2 ** attempt) + random.uniform(0, 1)
                    time.sleep(sleep_time)
        return wrapper
    return decorator
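Applying the decorator to any fetch function is then a one-liner. A sketch, assuming the requests import and logger from the first block:

@retry_on_failure(max_retries=3, delay=2)
def fetch_page(url):
    # raise_for_status() turns 4xx/5xx responses into exceptions,
    # which is what triggers the decorator's retry logic
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text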
import os

from selenium.webdriver.common.by import By

class ScrapingDebugger:
    def __init__(self, driver):
        self.driver = driver
        self.screenshot_counter = 0

    def take_debug_screenshot(self, name=None):
        if not name:
            name = f"debug_{self.screenshot_counter}"
        filename = f"screenshots/{name}_{int(time.time())}.png"
        os.makedirs('screenshots', exist_ok=True)
        self.driver.save_screenshot(filename)
        logger.info(f"Screenshot saved: {filename}")
        self.screenshot_counter += 1
        return filename

    def log_page_info(self):
        logger.info(f"Current URL: {self.driver.current_url}")
        logger.info(f"Page title: {self.driver.title}")
        logger.info(f"Page source length: {len(self.driver.page_source)}")
        # Check for errors in the browser console (Chrome only)
        logs = self.driver.get_log('browser')
        if logs:
            logger.warning("Browser console errors:")
            for log in logs:
                logger.warning(f"  {log['level']}: {log['message']}")

    def debug_element_selection(self, selector, by=By.CSS_SELECTOR):
        try:
            elements = self.driver.find_elements(by, selector)
            logger.info(f"Selector '{selector}' found {len(elements)} elements")
            if not elements:
                # Try to find similar elements
                similar_selectors = self.suggest_similar_selectors(selector)
                logger.info(f"Similar selectors: {similar_selectors}")
            return elements
        except Exception as e:
            logger.error(f"Element selection failed: {e}")
            self.take_debug_screenshot(f"selector_error_{selector.replace(' ', '_')}")
            return []

    def suggest_similar_selectors(self, original_selector):
        # Suggest looser alternatives for simple class/ID selectors
        suggestions = []
        if '.' in original_selector:
            class_name = original_selector.replace('.', '')
            suggestions.append(f"[class*='{class_name}']")
            suggestions.append(f"*[class='{class_name}']")
        if '#' in original_selector:
            id_name = original_selector.replace('#', '')
            suggestions.append(f"[id*='{id_name}']")
        return suggestions
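A typical debugging session wires these methods together. A minimal sketch, assuming a local Chrome install; the URL and selector below are placeholders:

driver = webdriver.Chrome()
debugger = ScrapingDebugger(driver)
try:
    driver.get("https://example.com")  # placeholder URL
    debugger.log_page_info()
    items = debugger.debug_element_selection(".product-card")  # hypothetical selector
    if not items:
        debugger.take_debug_screenshot("no_products_found")
finally:
    driver.quit()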
Network and Performance Debugging
import json

from selenium.webdriver.support.ui import WebDriverWait

class NetworkDebugger:
    def __init__(self):
        self.setup_driver_with_logging()

    def setup_driver_with_logging(self):
        # Selenium 4: capabilities are set on the Options object; the old
        # desired_capabilities argument was removed from webdriver.Chrome
        options = webdriver.ChromeOptions()
        options.set_capability('goog:loggingPrefs', {
            'performance': 'ALL',
            'browser': 'ALL'
        })
        options.add_argument('--enable-logging')
        options.add_argument('--log-level=0')
        self.driver = webdriver.Chrome(options=options)
    def analyze_network_requests(self):
        logs = self.driver.get_log('performance')
        sent_requests = []  # named to avoid shadowing the requests module
        responses = []
        for entry in logs:
            log = json.loads(entry['message'])['message']
            if log['method'] == 'Network.requestWillBeSent':
                sent_requests.append({
                    'url': log['params']['request']['url'],
                    'method': log['params']['request']['method'],
                    'timestamp': log['params']['timestamp']
                })
            elif log['method'] == 'Network.responseReceived':
                response = log['params']['response']
                responses.append({
                    'url': response['url'],
                    'status': response['status'],
                    'mimeType': response['mimeType'],
                    'timestamp': log['params']['timestamp']
                })
        # Find failed requests
        failed_requests = [r for r in responses if r['status'] >= 400]
        if failed_requests:
            logger.warning(f"Found {len(failed_requests)} failed requests:")
            for req in failed_requests:
                logger.warning(f"  {req['status']}: {req['url']}")
        return {'requests': sent_requests, 'responses': responses}
    def measure_page_load_time(self, url):
        start_time = time.time()
        self.driver.get(url)
        # Wait for the document to finish loading
        WebDriverWait(self.driver, 30).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )
        load_time = time.time() - start_time
        logger.info(f"Page load time: {load_time:.2f} seconds")
        return load_time
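Putting the network debugger to work looks like this; a sketch with a placeholder URL:

net = NetworkDebugger()
try:
    net.measure_page_load_time("https://example.com")  # placeholder URL
    traffic = net.analyze_network_requests()
    print(f"{len(traffic['requests'])} requests, {len(traffic['responses'])} responses")
finally:
    net.driver.quit()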
Data Validation and Testing
class ScrapingValidator:
    def __init__(self):
        self.validation_errors = []

    def validate_extracted_data(self, data, schema):
        """Validate scraped data against an expected schema."""
        if not data:
            self.validation_errors.append("No data extracted")
            return False
        for item in data:
            for field, requirements in schema.items():
                if requirements.get('required', False) and field not in item:
                    self.validation_errors.append(f"Missing required field: {field}")
                if field in item:
                    value = item[field]
                    # Type validation
                    expected_type = requirements.get('type')
                    if expected_type and not isinstance(value, expected_type):
                        self.validation_errors.append(
                            f"Field {field} has wrong type: {type(value)} vs {expected_type}"
                        )
                    # Format validation (only meaningful for strings)
                    if isinstance(value, str):
                        if requirements.get('format') == 'url' and not value.startswith('http'):
                            self.validation_errors.append(f"Invalid URL format: {value}")
                        if requirements.get('format') == 'email' and '@' not in value:
                            self.validation_errors.append(f"Invalid email format: {value}")
        return len(self.validation_errors) == 0

    def compare_with_expected(self, scraped_data, expected_sample):
        """Compare scraped data structure with an expected sample."""
        if len(scraped_data) == 0:
            logger.error("No data scraped")
            return False
        sample_item = scraped_data[0]
        expected_keys = set(expected_sample.keys())
        actual_keys = set(sample_item.keys())
        missing_keys = expected_keys - actual_keys
        extra_keys = actual_keys - expected_keys
        if missing_keys:
            logger.warning(f"Missing expected keys: {missing_keys}")
        if extra_keys:
            logger.info(f"Extra keys found: {extra_keys}")
        return len(missing_keys) == 0
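Driving the validator looks like this; the schema shape ('required', 'type', 'format' keys) follows the method above, and the sample record is made up:

validator = ScrapingValidator()
schema = {
    'title': {'required': True, 'type': str},
    'url': {'required': True, 'type': str, 'format': 'url'},
    'price': {'required': False, 'type': str},
}
sample = [{'title': 'Widget', 'url': 'https://example.com/widget'}]  # made-up record
if not validator.validate_extracted_data(sample, schema):
    for error in validator.validation_errors:
        logger.error(error)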
Unit Testing for Scrapers
import unittest

class TestScraper(unittest.TestCase):
    def setUp(self):
        self.scraper = MyScraper()

    def test_basic_extraction(self):
        test_html = """
        <div class="item">
            <h2 class="title">Test Product</h2>
            <span class="price">$19.99</span>
        </div>
        """
        result = self.scraper.parse_html(test_html)
        self.assertEqual(result['title'], 'Test Product')
        self.assertEqual(result['price'], '$19.99')

    def test_missing_elements(self):
        incomplete_html = "<div class='item'><h2>Title Only</h2></div>"
        result = self.scraper.parse_html(incomplete_html)
        self.assertIsNone(result.get('price'))

if __name__ == '__main__':
    unittest.main()
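The tests assume a MyScraper class with a parse_html method, which isn't defined in this article. Here is one hedged sketch using BeautifulSoup (pip install beautifulsoup4), just so the tests have something to run against; the .title and .price selectors are assumptions taken from the test HTML:

from bs4 import BeautifulSoup

class MyScraper:
    """Hypothetical scraper matching the tests above."""
    def parse_html(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.select_one('.title')
        price = soup.select_one('.price')
        return {
            'title': title.get_text(strip=True) if title else None,
            'price': price.get_text(strip=True) if price else None,
        }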
These patterns work with current Selenium 4 and Requests releases, and together they give layered error tracking: structured logs, retries with backoff, screenshots, console and network traces, schema validation, and unit tests.