Quick Fix: API vs HTML Decision Framework
import requests
from bs4 import BeautifulSoup
import time
import json

# Check whether the site exposes an API at a common path
def check_for_api(base_url):
    api_endpoints = [
        f"{base_url}/api",
        f"{base_url}/api/v1",
        f"{base_url}/api/v2",
        f"{base_url}/graphql",
        f"{base_url}/rest"
    ]
    for endpoint in api_endpoints:
        try:
            response = requests.get(endpoint, timeout=5)
            if response.status_code in [200, 401, 403]:  # API exists (possibly behind auth)
                return endpoint
        except requests.RequestException:
            continue
    return None

# Performance comparison
def compare_methods(url):
    # API approach (assumes the site serves JSON at /api/data)
    start_time = time.time()
    api_data = requests.get(f"{url}/api/data").json()
    api_time = time.time() - start_time

    # HTML scraping approach
    start_time = time.time()
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    html_data = extract_data_from_html(soup)  # site-specific parser, not defined here
    html_time = time.time() - start_time

    print(f"API: {api_time:.2f}s, HTML: {html_time:.2f}s")
    return api_time < html_time
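To see which path a site supports, run the probe first. This is a minimal sketch: the base URL is a placeholder, and compare_methods additionally assumes a JSON endpoint at /api/data plus a site-specific extract_data_from_html() parser, so it is left commented out.

base_url = "https://example.com"  # placeholder target

endpoint = check_for_api(base_url)
if endpoint:
    print(f"API detected at {endpoint}")
    # Enable only if the site really serves JSON at /api/data and you have
    # implemented extract_data_from_html() for its markup:
    # api_wins = compare_methods(base_url)
else:
    print("No obvious API endpoint; consider HTML scraping or network analysis")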
API-First Approach
class APIDataExtractor:
    def __init__(self, base_url, api_key=None):
        self.base_url = base_url
        self.session = requests.Session()
        if api_key:
            self.session.headers.update({
                'Authorization': f'Bearer {api_key}',
                'X-API-Key': api_key
            })

    def discover_endpoints(self):
        # Try common API discovery methods
        endpoints = []

        # OpenAPI/Swagger
        try:
            swagger = self.session.get(f"{self.base_url}/swagger.json")
            if swagger.status_code == 200:
                spec = swagger.json()
                endpoints.extend(spec.get('paths', {}).keys())
        except (requests.RequestException, ValueError):
            pass

        # GraphQL introspection
        try:
            introspection_query = {
                "query": "{ __schema { types { name } } }"
            }
            response = self.session.post(
                f"{self.base_url}/graphql",
                json=introspection_query
            )
            if response.status_code == 200:
                endpoints.append('/graphql')
        except requests.RequestException:
            pass

        return endpoints

    def extract_data(self, endpoint, params=None):
        response = self.session.get(f"{self.base_url}/{endpoint}", params=params)
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 429:  # Rate limited: back off, then retry
            time.sleep(60)
            return self.extract_data(endpoint, params)
        else:
            print(f"API error: {response.status_code}")
            return None

    def paginated_extraction(self, endpoint, page_param='page'):
        all_data = []
        page = 1
        while True:
            params = {page_param: page}
            data = self.extract_data(endpoint, params)
            if not data or not data.get('results'):
                break
            all_data.extend(data['results'])
            if not data.get('has_next', False):
                break
            page += 1
            time.sleep(0.5)  # Be polite
        return all_data

# GraphQL client
class GraphQLClient:
    def __init__(self, endpoint):
        self.endpoint = endpoint
        self.session = requests.Session()

    def query(self, query, variables=None):
        payload = {'query': query}
        if variables:
            payload['variables'] = variables
        response = self.session.post(self.endpoint, json=payload)
        return response.json()

    def get_schema(self):
        introspection = """
        query IntrospectionQuery {
          __schema {
            types {
              name
              fields {
                name
                type {
                  name
                }
              }
            }
          }
        }
        """
        return self.query(introspection)
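A brief usage sketch for the two clients above. The 'products' resource path, the API key, and the GraphQL field names are assumptions for illustration, not any real site's schema.

extractor = APIDataExtractor("https://example.com", api_key="YOUR_API_KEY")
print(extractor.discover_endpoints())
products = extractor.paginated_extraction("api/products")  # assumed resource path
print(f"Fetched {len(products)} records")

client = GraphQLClient("https://example.com/graphql")
result = client.query(
    "query($limit: Int) { products(first: $limit) { name price } }",  # assumed schema
    variables={"limit": 10}
)
print(result)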
Hybrid Approach: API + HTML
class HybridDataExtractor:
    def __init__(self, base_url):
        self.base_url = base_url
        self.session = requests.Session()
        self.api_available = self.check_api_availability()

    def check_api_availability(self):
        try:
            response = self.session.get(f"{self.base_url}/api", timeout=5)
            return response.status_code in [200, 401, 403]
        except requests.RequestException:
            return False

    def extract_data(self, resource_type):
        if self.api_available:
            try:
                return self.extract_via_api(resource_type)
            except Exception as e:
                print(f"API failed: {e}, falling back to HTML")
                return self.extract_via_html(resource_type)
        else:
            return self.extract_via_html(resource_type)

    def extract_via_api(self, resource_type):
        response = self.session.get(f"{self.base_url}/api/{resource_type}")
        response.raise_for_status()
        return response.json()

    def extract_via_html(self, resource_type):
        response = self.session.get(f"{self.base_url}/{resource_type}")
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract structured data from HTML
        return self.parse_html_data(soup)

    def parse_html_data(self, soup):
        # Look for JSON-LD structured data
        json_ld = soup.find('script', {'type': 'application/ld+json'})
        if json_ld and json_ld.string:
            return json.loads(json_ld.string)

        # Look for data attributes
        data_elements = soup.find_all(attrs={'data-json': True})
        if data_elements:
            return [json.loads(elem['data-json']) for elem in data_elements]

        # Manual parsing as fallback
        return self.manual_parse(soup)

    def manual_parse(self, soup):
        # Site-specific parsing; the .item/.title/.price classes must match the target markup
        items = []
        for element in soup.find_all(class_='item'):
            item = {
                'title': element.find(class_='title').text if element.find(class_='title') else None,
                'price': element.find(class_='price').text if element.find(class_='price') else None,
                'url': element.find('a')['href'] if element.find('a') else None
            }
            items.append(item)
        return items
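Using the hybrid extractor looks like the sketch below; 'products' is an assumed resource type, and the CSS classes in manual_parse need to be adapted to the target site's markup.

extractor = HybridDataExtractor("https://example.com")
items = extractor.extract_data("products")  # tries the API first, falls back to HTML parsing
for item in items or []:
    print(item)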
# Network analysis to find hidden APIs
def discover_api_endpoints(url):
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # Enable Chrome performance logging (Selenium 4 style)
    options = Options()
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    driver = webdriver.Chrome(options=options)
    driver.get(url)

    # Get network logs
    logs = driver.get_log('performance')
    api_calls = []
    for entry in logs:
        log = json.loads(entry['message'])['message']
        if log['method'] == 'Network.responseReceived':
            response_url = log['params']['response']['url']
            if '/api/' in response_url or response_url.endswith('.json'):
                api_calls.append(response_url)

    driver.quit()
    return list(set(api_calls))
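A minimal invocation of the network-analysis helper; it requires Chrome and a matching chromedriver installed, and the URL is again a placeholder.

hidden_endpoints = discover_api_endpoints("https://example.com")
for endpoint in hidden_endpoints:
    print(endpoint)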
# Choose best method based on data requirements
def choose_extraction_method(requirements):
    if requirements.get('real_time', False):
        return 'api'  # APIs typically provide real-time data
    elif requirements.get('large_volume', False):
        return 'api'  # APIs are more efficient for bulk data
    elif requirements.get('structured_data', False):
        return 'api'  # APIs provide structured data
    elif requirements.get('visual_content', False):
        return 'html'  # HTML needed for images, styling info
    else:
        return 'hybrid'  # Use both as needed
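For example, a bulk export of structured records with no need for visual content resolves to the API path:

requirements = {
    'real_time': False,
    'large_volume': True,
    'structured_data': True,
    'visual_content': False,
}
print(choose_extraction_method(requirements))  # -> 'api'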
These patterns work with REST APIs, GraphQL endpoints, and modern web applications.