Web Scraping for Business Intelligence
Web scraping automates the collection of publicly available data from websites. When done ethically and legally, it's a powerful tool for competitive intelligence, market research, and data enrichment.
Ethical Considerations
- Respect robots.txt directives (a quick automated check is sketched after this list)
- Don't overload servers
- Comply with terms of service
- Don't scrape personal data without consent
- Check legal requirements in your jurisdiction
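The robots.txt rule is easy to automate before any request goes out. A minimal sketch using the standard library's urllib.robotparser; the user agent string is a placeholder and should match whatever your scraper actually sends:

from urllib import robotparser
from urllib.parse import urljoin

def is_allowed(url, user_agent='DataBot'):
    """Return True if the site's robots.txt permits fetching this URL."""
    parser = robotparser.RobotFileParser()
    # Resolve the robots.txt location from the target URL's root
    parser.set_url(urljoin(url, '/robots.txt'))
    parser.read()
    return parser.can_fetch(user_agent, url)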
The Python Stack
For static HTML, requests plus BeautifulSoup covers most needs; wrapping them in a small base class keeps rate limiting and headers consistent across a project.
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

class Scraper:
    def __init__(self, base_url, delay=1):
        self.base_url = base_url
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)'
        })

    def get_page(self, url):
        """Fetch page with rate limiting."""
        time.sleep(self.delay)
        response = self.session.get(url, timeout=30)  # avoid hanging on slow hosts
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def extract_data(self, soup):
        """Override in subclass."""
        raise NotImplementedError
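To use the base class, override extract_data for the target site. The pipeline further down refers to a ProductScraper with a scrape_all_pages method, neither of which is defined above; the sketch below is one possible shape for them, assuming a hypothetical catalogue whose listing pages live at /products?page=N and mark items with .product, .name, and .price classes:

class ProductScraper(Scraper):
    """Example subclass for a hypothetical product catalogue."""

    def extract_data(self, soup):
        # Selectors are placeholders; adjust them to the real markup.
        return [
            {
                'name': item.select_one('.name').get_text(strip=True),
                'price': item.select_one('.price').get_text(strip=True),
            }
            for item in soup.select('.product')
        ]

    def scrape_all_pages(self, max_pages=10):
        """Walk numbered listing pages until one comes back empty."""
        all_rows = []
        for page_num in range(1, max_pages + 1):
            url = urljoin(self.base_url, f'/products?page={page_num}')
            rows = self.extract_data(self.get_page(url))
            if not rows:
                break
            all_rows.extend(rows)
        return all_rows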
Handling Dynamic Content
Many sites render their content with JavaScript, so a plain HTTP request returns little useful markup. A headless browser such as Playwright executes the page before you extract anything.
from playwright.sync_api import sync_playwright

def scrape_dynamic_page(url):
    """For JavaScript-rendered content."""
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)

        # Wait for content to load
        page.wait_for_selector('.product-list')

        # Extract data
        products = page.query_selector_all('.product')
        data = []
        for product in products:
            data.append({
                'name': product.query_selector('.name').text_content(),
                'price': product.query_selector('.price').text_content()
            })

        browser.close()
    return data
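The selectors here (.product-list, .product, .name, .price) are placeholders for whatever the target site actually uses. Chromium launches headless by default; pass headless=False to p.chromium.launch() while debugging, and reach for a real browser only when the plain requests stack can't see the content, since it is far slower per page.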
Data Pipeline Integration
Scraped data only earns its keep once it flows into the rest of your analytics stack: clean it, timestamp it, and store it in a queryable format.
import pandas as pd
from datetime import datetime

def scraping_pipeline():
    """Full scraping workflow."""
    # 1. Collect data
    scraper = ProductScraper(BASE_URL)
    raw_data = scraper.scrape_all_pages()

    # 2. Clean and transform
    df = pd.DataFrame(raw_data)
    df['price'] = df['price'].str.replace('$', '', regex=False).astype(float)
    df['scrape_date'] = datetime.now()

    # 3. Store
    df.to_parquet(f'data/products_{datetime.now():%Y%m%d}.parquet')

    # 4. Incremental updates
    update_database(df)
    return df
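update_database is left undefined above. One possible implementation, sketched here under the assumption that an append-only SQLite history table is enough for incremental updates:

import sqlite3

def update_database(df, db_path='data/products.db'):
    """Append today's snapshot to a local SQLite history table (assumed schema)."""
    with sqlite3.connect(db_path) as conn:
        df.to_sql('products', conn, if_exists='append', index=False)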
Error Handling
Networks fail and page structures change, so a scraper needs retries with backoff plus a way to record failures without halting the whole run.
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def fetch_with_retry(url):
    """Retry with exponential backoff."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response

def robust_scrape(urls):
    """Handle failures gracefully."""
    results = []
    failures = []
    for url in urls:
        try:
            data = fetch_with_retry(url)
            results.append(parse_data(data))
        except Exception as e:
            failures.append({'url': url, 'error': str(e)})
    log_failures(failures)
    return results
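parse_data and log_failures are assumed helpers. A minimal log_failures sketch that writes each run's failures to a timestamped JSON file for later review:

import json
from datetime import datetime
from pathlib import Path

def log_failures(failures, log_dir='logs'):
    """Write failed URLs and their errors to a timestamped JSON file."""
    if not failures:
        return
    Path(log_dir).mkdir(exist_ok=True)
    path = Path(log_dir) / f'failures_{datetime.now():%Y%m%d_%H%M%S}.json'
    path.write_text(json.dumps(failures, indent=2))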
Common Use Cases
- Price monitoring: Track competitor pricing (a snapshot comparison is sketched after this list)
- Lead generation: Collect business directories
- Market research: Product availability and reviews
- News aggregation: Industry news monitoring
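Price monitoring, for instance, falls out of the daily parquet snapshots almost for free. A sketch that compares two snapshots and flags changed prices; the file paths follow the naming pattern used in scraping_pipeline and are otherwise hypothetical:

import pandas as pd

def price_changes(today_path, yesterday_path):
    """Flag products whose price moved between two daily snapshots."""
    today = pd.read_parquet(today_path)
    yesterday = pd.read_parquet(yesterday_path)
    merged = today.merge(yesterday, on='name', suffixes=('_today', '_yesterday'))
    merged['delta'] = merged['price_today'] - merged['price_yesterday']
    return merged.loc[merged['delta'] != 0,
                      ['name', 'price_yesterday', 'price_today', 'delta']]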
Web scraping is a means to an end. Focus on the business value of the data you're collecting.