
How to Scrape APIs vs HTML Pages

Choose the right approach in 2025: API endpoints vs HTML scraping, performance comparison, and implementation strategies


Quick Fix: API vs HTML Decision Framework

import requests
from bs4 import BeautifulSoup
import time
import json

# Check if API is available
def check_for_api(base_url):
    api_endpoints = [
        f"{base_url}/api",
        f"{base_url}/api/v1", 
        f"{base_url}/api/v2",
        f"{base_url}/graphql",
        f"{base_url}/rest"
    ]
    
    for endpoint in api_endpoints:
        try:
            response = requests.get(endpoint, timeout=5)
            if response.status_code in (200, 401, 403):  # endpoint exists (401/403 just mean auth is required)
                return endpoint
        except requests.RequestException:
            continue
    
    return None

# Performance comparison
def compare_methods(url):
    # API approach
    start_time = time.time()
    api_data = requests.get(f"{url}/api/data").json()
    api_time = time.time() - start_time
    
    # HTML scraping approach  
    start_time = time.time()
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    html_data = extract_data_from_html(soup)  # your site-specific parsing function
    html_time = time.time() - start_time
    
    print(f"API: {api_time:.2f}s, HTML: {html_time:.2f}s")
    return api_time < html_time
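
Putting the decision framework to work is just a matter of probing for an API first and only benchmarking the two approaches when one is found. Here is a minimal usage sketch; the https://example.com base URL is a placeholder, and compare_methods assumes the site serves JSON at /api/data as in the snippet above.

# Usage sketch -- the base URL is a placeholder for the site you are targeting
base_url = "https://example.com"

api_endpoint = check_for_api(base_url)
if api_endpoint:
    print(f"API detected at {api_endpoint}")
    # Benchmarking only makes sense when both options exist
    prefer_api = compare_methods(base_url)
    print("Use the API" if prefer_api else "Use HTML scraping")
else:
    print("No API detected -- scrape the HTML instead")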

API-First Approach

class APIDataExtractor:
    def __init__(self, base_url, api_key=None):
        self.base_url = base_url
        self.session = requests.Session()
        
        if api_key:
            self.session.headers.update({
                'Authorization': f'Bearer {api_key}',
                'X-API-Key': api_key
            })
    
    def discover_endpoints(self):
        # Try common API discovery methods
        endpoints = []
        
        # OpenAPI/Swagger
        try:
            swagger = self.session.get(f"{self.base_url}/swagger.json")
            if swagger.status_code == 200:
                spec = swagger.json()
                endpoints.extend(spec.get('paths', {}).keys())
        except (requests.RequestException, ValueError):
            pass
        
        # GraphQL introspection
        try:
            introspection_query = {
                "query": "{ __schema { types { name } } }"
            }
            response = self.session.post(
                f"{self.base_url}/graphql", 
                json=introspection_query
            )
            if response.status_code == 200:
                endpoints.append('/graphql')
        except requests.RequestException:
            pass
        
        return endpoints
    
    def extract_data(self, endpoint, params=None):
        response = self.session.get(f"{self.base_url}/{endpoint}", params=params)
        
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:  # Rate limited -- back off and retry
            retry_after = response.headers.get('Retry-After', '60')
            time.sleep(int(retry_after) if retry_after.isdigit() else 60)
            return self.extract_data(endpoint, params)
        else:
            print(f"API error: {response.status_code}")
            return None
    
    def paginated_extraction(self, endpoint, page_param='page'):
        all_data = []
        page = 1
        
        while True:
            params = {page_param: page}
            data = self.extract_data(endpoint, params)
            
            if not data or not data.get('results'):
                break
            
            all_data.extend(data['results'])
            
            if not data.get('has_next', False):
                break
            
            page += 1
            time.sleep(0.5)  # Be polite
        
        return all_data

# GraphQL client
class GraphQLClient:
    def __init__(self, endpoint):
        self.endpoint = endpoint
        self.session = requests.Session()
    
    def query(self, query, variables=None):
        payload = {'query': query}
        if variables:
            payload['variables'] = variables
        
        response = self.session.post(self.endpoint, json=payload)
        response.raise_for_status()
        return response.json()
    
    def get_schema(self):
        introspection = """
        query IntrospectionQuery {
            __schema {
                types {
                    name
                    fields {
                        name
                        type {
                            name
                        }
                    }
                }
            }
        }
        """
        return self.query(introspection)
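
A short usage sketch for the two clients above; the base URL, API key, resource path, and GraphQL fields are all hypothetical and would need to match the target service's actual schema.

# Usage sketch -- URL, API key, endpoint names, and GraphQL fields are hypothetical
extractor = APIDataExtractor("https://example.com", api_key="YOUR_API_KEY")
print(extractor.discover_endpoints())

# Pull every page of a hypothetical /api/products endpoint
products = extractor.paginated_extraction("api/products")

# Query a GraphQL endpoint directly
client = GraphQLClient("https://example.com/graphql")
result = client.query(
    """
    query Products($limit: Int) {
        products(limit: $limit) {
            name
            price
        }
    }
    """,
    variables={"limit": 10},
)
print(result.get("data"))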

Hybrid Approach: API + HTML

class HybridDataExtractor:
    def __init__(self, base_url):
        self.base_url = base_url
        self.session = requests.Session()
        self.api_available = self.check_api_availability()
    
    def check_api_availability(self):
        try:
            response = self.session.get(f"{self.base_url}/api", timeout=5)
            return response.status_code in [200, 401, 403]
        except requests.RequestException:
            return False
    
    def extract_data(self, resource_type):
        if self.api_available:
            try:
                return self.extract_via_api(resource_type)
            except Exception as e:
                print(f"API failed: {e}, falling back to HTML")
                return self.extract_via_html(resource_type)
        else:
            return self.extract_via_html(resource_type)
    
    def extract_via_api(self, resource_type):
        response = self.session.get(f"{self.base_url}/api/{resource_type}")
        response.raise_for_status()
        return response.json()
    
    def extract_via_html(self, resource_type):
        response = self.session.get(f"{self.base_url}/{resource_type}")
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract structured data from HTML
        return self.parse_html_data(soup)
    
    def parse_html_data(self, soup):
        # Look for JSON-LD structured data
        json_ld = soup.find('script', {'type': 'application/ld+json'})
        if json_ld and json_ld.string:
            return json.loads(json_ld.string)
        
        # Look for data attributes
        data_elements = soup.find_all(attrs={'data-json': True})
        if data_elements:
            return [json.loads(elem['data-json']) for elem in data_elements]
        
        # Manual parsing as fallback
        return self.manual_parse(soup)
    
    def manual_parse(self, soup):
        # Implement specific parsing logic
        items = []
        for element in soup.find_all(class_='item'):
            item = {
                'title': element.find(class_='title').text if element.find(class_='title') else None,
                'price': element.find(class_='price').text if element.find(class_='price') else None,
                'url': element.find('a')['href'] if element.find('a') else None
            }
            items.append(item)
        return items

# Network analysis to find hidden APIs
def discover_api_endpoints(url):
    from selenium import webdriver

    # Enable Chrome network logging (Selenium 4: capabilities are set through Options)
    options = webdriver.ChromeOptions()
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=options)
    driver.get(url)

    # Scan the performance log for responses that look like API calls
    logs = driver.get_log('performance')
    api_calls = []

    for entry in logs:
        log = json.loads(entry['message'])['message']
        if log['method'] == 'Network.responseReceived':
            request_url = log['params']['response']['url']
            if '/api/' in request_url or request_url.endswith('.json'):
                api_calls.append(request_url)
    
    driver.quit()
    return list(set(api_calls))

# Choose best method based on data requirements
def choose_extraction_method(requirements):
    if requirements.get('real_time', False):
        return 'api'  # APIs typically provide real-time data
    elif requirements.get('large_volume', False):
        return 'api'  # APIs are more efficient for bulk data
    elif requirements.get('structured_data', False):
        return 'api'  # APIs provide structured data
    elif requirements.get('visual_content', False):
        return 'html'  # HTML needed for images, styling info
    else:
        return 'hybrid'  # Use both as needed
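
Tying it together, one reasonable flow is to pick a method from the requirements, let HybridDataExtractor handle the API-or-HTML decision, and fall back to the Selenium network sniffer only when nothing obvious turns up. The base URL, resource name, and requirement flags below are placeholders.

# Usage sketch -- base URL, resource name, and requirements are placeholders
requirements = {'structured_data': True, 'large_volume': True}
print(f"Suggested method: {choose_extraction_method(requirements)}")

extractor = HybridDataExtractor("https://example.com")
data = extractor.extract_data("products")  # API if available, HTML fallback otherwise
print(data)

# If no documented API exists, sniff the browser's network traffic for hidden ones
# (requires Chrome and a matching chromedriver)
print(discover_api_endpoints("https://example.com"))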

These approaches work with REST APIs, GraphQL, and modern web applications.
