How to Debug Web Scraping Scripts

Troubleshoot scraping issues efficiently in 2025: logging, error handling, browser dev tools, and testing strategies

Quick Fix: Basic Debugging Setup

import logging
import traceback
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraping.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

def debug_request(url):
    try:
        response = requests.get(url, timeout=10)
        logger.info(f"Request to {url}: {response.status_code}")
        logger.info(f"Response headers: {dict(response.headers)}")
        
        if response.status_code != 200:
            logger.error(f"HTTP Error: {response.status_code}")
            logger.error(f"Response text: {response.text[:500]}")
        
        return response
        
    except Exception as e:
        logger.error(f"Request failed: {e}")
        logger.error(traceback.format_exc())
        return None
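
A quick sketch of how this helper might be wired into a scraper; the URL below is just a placeholder:

response = debug_request("https://example.com/products")

if response is not None and response.ok:
    logger.info(f"Got {len(response.text)} bytes of HTML")
else:
    logger.error("Request failed or returned an error status; see scraping.log for details")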

Advanced Error Handling

from functools import wraps
import os
import time
import random
from selenium.webdriver.common.by import By

def retry_on_failure(max_retries=3, delay=1):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    logger.warning(f"Attempt {attempt + 1} failed: {e}")
                    
                    if attempt == max_retries - 1:
                        logger.error(f"All {max_retries} attempts failed")
                        raise
                    
                    # Exponential backoff with jitter
                    sleep_time = delay * (2 ** attempt) + random.uniform(0, 1)
                    time.sleep(sleep_time)
            
        return wrapper
    return decorator

class ScrapingDebugger:
    def __init__(self, driver):
        self.driver = driver
        self.screenshot_counter = 0
    
    def take_debug_screenshot(self, name=None):
        if not name:
            name = f"debug_{self.screenshot_counter}"
        
        filename = f"screenshots/{name}_{int(time.time())}.png"
        os.makedirs('screenshots', exist_ok=True)
        
        self.driver.save_screenshot(filename)
        logger.info(f"Screenshot saved: {filename}")
        self.screenshot_counter += 1
        
        return filename
    
    def log_page_info(self):
        logger.info(f"Current URL: {self.driver.current_url}")
        logger.info(f"Page title: {self.driver.title}")
        logger.info(f"Page source length: {len(self.driver.page_source)}")
        
        # Check for errors in console
        logs = self.driver.get_log('browser')
        if logs:
            logger.warning("Browser console errors:")
            for log in logs:
                logger.warning(f"  {log['level']}: {log['message']}")
    
    def debug_element_selection(self, selector, by=By.CSS_SELECTOR):
        try:
            elements = self.driver.find_elements(by, selector)
            logger.info(f"Selector '{selector}' found {len(elements)} elements")
            
            if not elements:
                # Try to find similar elements
                similar_selectors = self.suggest_similar_selectors(selector)
                logger.info(f"Similar selectors: {similar_selectors}")
            
            return elements
            
        except Exception as e:
            logger.error(f"Element selection failed: {e}")
            self.take_debug_screenshot(f"selector_error_{selector.replace(' ', '_')}")
            return []
    
    def suggest_similar_selectors(self, original_selector):
        # Suggest alternative selectors
        suggestions = []
        
        if '.' in original_selector:
            class_name = original_selector.replace('.', '')
            suggestions.append(f"[class*='{class_name}']")
            suggestions.append(f"*[class='{class_name}']")
        
        if '#' in original_selector:
            id_name = original_selector.replace('#', '')
            suggestions.append(f"[id*='{id_name}']")
        
        return suggestions
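
As a rough illustration, the retry decorator and the debugger can be combined in one scraping step; scrape_listing and the ".item .title" selector below are made up for the example:

@retry_on_failure(max_retries=3, delay=2)
def scrape_listing(driver, url):
    # Hypothetical scraping step: load a page and pull listing titles
    driver.get(url)
    debugger = ScrapingDebugger(driver)
    debugger.log_page_info()
    
    elements = debugger.debug_element_selection(".item .title")
    if not elements:
        debugger.take_debug_screenshot("empty_listing")
        raise NoSuchElementException("No listing titles found")
    
    return [el.text for el in elements]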

Network and Performance Debugging

from selenium.webdriver.support.ui import WebDriverWait
import json

class NetworkDebugger:
    def __init__(self):
        self.setup_driver_with_logging()
    
    def setup_driver_with_logging(self):
        # Selenium 4 removed desired_capabilities; set logging prefs on the Options object instead
        options = webdriver.ChromeOptions()
        options.set_capability('goog:loggingPrefs', {
            'performance': 'ALL',
            'browser': 'ALL'
        })
        options.add_argument('--enable-logging')
        options.add_argument('--log-level=0')
        
        self.driver = webdriver.Chrome(options=options)
    
    def analyze_network_requests(self):
        logs = self.driver.get_log('performance')
        
        requests = []
        responses = []
        
        for entry in logs:
            log = json.loads(entry['message'])['message']
            
            if log['method'] == 'Network.requestWillBeSent':
                requests.append({
                    'url': log['params']['request']['url'],
                    'method': log['params']['request']['method'],
                    'timestamp': log['params']['timestamp']
                })
            
            elif log['method'] == 'Network.responseReceived':
                response = log['params']['response']
                responses.append({
                    'url': response['url'],
                    'status': response['status'],
                    'mimeType': response['mimeType'],
                    'timestamp': log['params']['timestamp']
                })
        
        # Find failed requests
        failed_requests = [r for r in responses if r['status'] >= 400]
        if failed_requests:
            logger.warning(f"Found {len(failed_requests)} failed requests:")
            for req in failed_requests:
                logger.warning(f"  {req['status']}: {req['url']}")
        
        return {'requests': requests, 'responses': responses}
    
    def measure_page_load_time(self, url):
        start_time = time.time()
        
        self.driver.get(url)
        
        # Wait for page to be ready
        WebDriverWait(self.driver, 30).until(
            lambda driver: driver.execute_script("return document.readyState") == "complete"
        )
        
        load_time = time.time() - start_time
        logger.info(f"Page load time: {load_time:.2f} seconds")
        
        return load_time
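
# Illustrative usage of NetworkDebugger (a sketch: the URL is a placeholder
# and the 10-second threshold is arbitrary)
network_debugger = NetworkDebugger()

load_time = network_debugger.measure_page_load_time("https://example.com")
traffic = network_debugger.analyze_network_requests()

logger.info(f"Captured {len(traffic['requests'])} requests and {len(traffic['responses'])} responses")

if load_time > 10:
    logger.warning("Slow page load; prefer explicit waits over fixed sleeps")

network_debugger.driver.quit()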

# Data validation and testing
class ScrapingValidator:
    def __init__(self):
        self.validation_errors = []
    
    def validate_extracted_data(self, data, schema):
        """Validate scraped data against expected schema"""
        
        if not data:
            self.validation_errors.append("No data extracted")
            return False
        
        for item in data:
            for field, requirements in schema.items():
                if requirements.get('required', False) and field not in item:
                    self.validation_errors.append(f"Missing required field: {field}")
                
                if field in item:
                    value = item[field]
                    
                    # Type validation
                    expected_type = requirements.get('type')
                    if expected_type and not isinstance(value, expected_type):
                        self.validation_errors.append(
                            f"Field {field} has wrong type: {type(value)} vs {expected_type}"
                        )
                    
                    # Format validation (only meaningful for string values)
                    if isinstance(value, str):
                        if requirements.get('format') == 'url' and not value.startswith('http'):
                            self.validation_errors.append(f"Invalid URL format: {value}")
                        
                        if requirements.get('format') == 'email' and '@' not in value:
                            self.validation_errors.append(f"Invalid email format: {value}")
        
        return len(self.validation_errors) == 0
    
    def compare_with_expected(self, scraped_data, expected_sample):
        """Compare scraped data structure with expected sample"""
        
        if len(scraped_data) == 0:
            logger.error("No data scraped")
            return False
        
        sample_item = scraped_data[0]
        expected_keys = set(expected_sample.keys())
        actual_keys = set(sample_item.keys())
        
        missing_keys = expected_keys - actual_keys
        extra_keys = actual_keys - expected_keys
        
        if missing_keys:
            logger.warning(f"Missing expected keys: {missing_keys}")
        
        if extra_keys:
            logger.info(f"Extra keys found: {extra_keys}")
        
        return len(missing_keys) == 0
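
# Example check of scraped product data against a simple schema
# (the field names and sample record below are illustrative only)
validator = ScrapingValidator()

product_schema = {
    'title': {'required': True, 'type': str},
    'price': {'required': True, 'type': str},
    'url': {'required': False, 'type': str, 'format': 'url'},
}

scraped_items = [
    {'title': 'Test Product', 'price': '$19.99', 'url': 'https://example.com/p/1'},
]

if not validator.validate_extracted_data(scraped_items, product_schema):
    for error in validator.validation_errors:
        logger.error(f"Validation error: {error}")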

# Unit testing for scrapers
import unittest

# MyScraper is a placeholder for your own scraper class; it is assumed to
# expose a parse_html(html) method that returns a dict of extracted fields.
class TestScraper(unittest.TestCase):
    def setUp(self):
        self.scraper = MyScraper()
    
    def test_basic_extraction(self):
        test_html = """
        <div class="item">
            <h2 class="title">Test Product</h2>
            <span class="price">$19.99</span>
        </div>
        """
        
        result = self.scraper.parse_html(test_html)
        
        self.assertEqual(result['title'], 'Test Product')
        self.assertEqual(result['price'], '$19.99')
    
    def test_missing_elements(self):
        incomplete_html = "<div class='item'><h2>Title Only</h2></div>"
        
        result = self.scraper.parse_html(incomplete_html)
        
        self.assertIsNone(result.get('price'))
        
if __name__ == '__main__':
    unittest.main()
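
If the scraper also makes live HTTP calls, tests can patch the request layer so they run offline. A minimal sketch using unittest.mock; the canned HTML and assertions are illustrative:

from unittest.mock import patch, MagicMock

class TestScraperOffline(unittest.TestCase):
    @patch('requests.get')
    def test_debug_request_with_mocked_response(self, mock_get):
        # Serve canned HTML instead of hitting the live site
        mock_response = MagicMock(status_code=200)
        mock_response.text = "<div class='item'><h2 class='title'>Mocked</h2></div>"
        mock_response.headers = {'Content-Type': 'text/html'}
        mock_get.return_value = mock_response
        
        response = debug_request("https://example.com/products")
        
        self.assertEqual(response.status_code, 200)
        self.assertIn("Mocked", response.text)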

These patterns work with current Selenium and requests releases and combine logging, screenshots, console and network traces, data validation, and unit tests so you can track down scraping failures quickly.
