
Web Scraping for Business Intelligence

Web scraping automates the collection of publicly available data from websites. When done ethically and legally, it's a powerful tool for competitive intelligence, market research, and data enrichment.

Ethical Considerations

  • Respect robots.txt directives (see the check after this list)
  • Don't overload servers
  • Comply with terms of service
  • Don't scrape personal data without consent
  • Check legal requirements in your jurisdiction
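
A quick way to honor the first point is Python's standard-library robots.txt parser. A minimal sketch, assuming a hypothetical target site and the same DataBot user agent used below:

from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

def is_allowed(base_url, path, user_agent='DataBot/1.0'):
    """Check the site's robots.txt before fetching a path."""
    rp = RobotFileParser()
    rp.set_url(urljoin(base_url, '/robots.txt'))
    rp.read()
    return rp.can_fetch(user_agent, urljoin(base_url, path))

# Hypothetical usage: skip anything the site owner has disallowed
if is_allowed('https://example.com', '/products'):
    pass  # safe to fetch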

The Python Stack

import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

class Scraper:
    def __init__(self, base_url, delay=1):
        self.base_url = base_url
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; DataBot/1.0)'
        })
    
    def get_page(self, url):
        """Fetch a page with rate limiting; relative URLs resolve against base_url."""
        time.sleep(self.delay)  # be polite: pause between requests
        response = self.session.get(urljoin(self.base_url, url))
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    
    def extract_data(self, soup):
        """Override in subclass."""
        raise NotImplementedError
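
The extract_data hook is meant to be overridden once per target site. A hypothetical ProductScraper sketch (the CSS selectors, the /products?page=N URL scheme, and the page limit are assumptions, not any real site's markup) that the pipeline below builds on:

class ProductScraper(Scraper):
    """Hypothetical subclass; selectors must match the target site's markup."""

    def extract_data(self, soup):
        # Assumed CSS classes, for illustration only
        return [{
            'name': item.select_one('.name').get_text(strip=True),
            'price': item.select_one('.price').get_text(strip=True),
        } for item in soup.select('.product')]

    def scrape_all_pages(self, max_pages=10):
        """Walk numbered pages until one comes back empty."""
        rows = []
        for page_num in range(1, max_pages + 1):
            soup = self.get_page(f'/products?page={page_num}')  # resolved against base_url
            page_rows = self.extract_data(soup)
            if not page_rows:
                break
            rows.extend(page_rows)
        return rows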

Handling Dynamic Content

Some sites render their listings with JavaScript, so requests and BeautifulSoup only see an empty shell. A headless browser such as Playwright loads the page, runs its scripts, and exposes the rendered DOM:

from playwright.sync_api import sync_playwright

def scrape_dynamic_page(url):
    """For JavaScript-rendered content."""
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        
        # Wait for content to load
        page.wait_for_selector('.product-list')
        
        # Extract data
        products = page.query_selector_all('.product')
        data = []
        for product in products:
            data.append({
                'name': product.query_selector('.name').text_content(),
                'price': product.query_selector('.price').text_content()
            })
        
        browser.close()
        return data
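
Usage is a single call per URL; the URL below is a placeholder, and the .product-list and .product selectors above must match whatever the real page uses:

# Hypothetical usage; run `playwright install chromium` once beforehand
products = scrape_dynamic_page('https://example.com/products')
print(f'Scraped {len(products)} products')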

Data Pipeline Integration

import pandas as pd
from datetime import datetime

def scraping_pipeline():
    """Full scraping workflow."""
    
    # 1. Collect data
    scraper = ProductScraper(BASE_URL)
    raw_data = scraper.scrape_all_pages()
    
    # 2. Clean and transform
    df = pd.DataFrame(raw_data)
    df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)
    df['scrape_date'] = datetime.now()
    
    # 3. Store
    df.to_parquet(f'data/products_{datetime.now():%Y%m%d}.parquet')
    
    # 4. Incremental updates
    update_database(df)
    
    return df
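
Step 4 assumes an update_database helper. A minimal sketch using SQLite (the table name, key column, and upsert strategy are assumptions) that keeps only the latest price per product:

import sqlite3

def update_database(df, db_path='data/products.db'):
    """Hypothetical incremental update: upsert rows keyed by product name."""
    conn = sqlite3.connect(db_path)
    with conn:  # commits on success, rolls back on error
        conn.execute("""
            CREATE TABLE IF NOT EXISTS products (
                name TEXT PRIMARY KEY,
                price REAL,
                scrape_date TEXT
            )
        """)
        conn.executemany(
            """INSERT INTO products (name, price, scrape_date)
               VALUES (?, ?, ?)
               ON CONFLICT(name) DO UPDATE SET
                   price = excluded.price,
                   scrape_date = excluded.scrape_date""",
            [(row['name'], row['price'], str(row['scrape_date']))
             for _, row in df.iterrows()],
        )
    conn.close()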

Error Handling

import requests
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def fetch_with_retry(url):
    """Retry with exponential backoff."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response

def robust_scrape(urls):
    """Handle failures gracefully."""
    results = []
    failures = []
    
    for url in urls:
        try:
            data = fetch_with_retry(url)
            results.append(parse_data(data))
        except Exception as e:
            failures.append({'url': url, 'error': str(e)})
            continue
    
    log_failures(failures)
    return results
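
robust_scrape leans on two helpers not shown here: parse_data, the site-specific parsing step, and log_failures. A minimal log_failures sketch (the logs directory and JSON format are assumptions) that persists failed URLs so a later run can retry them:

import json
import os
from datetime import datetime

def log_failures(failures, log_dir='logs'):
    """Write failed URLs and their errors to a timestamped JSON file."""
    if not failures:
        return
    os.makedirs(log_dir, exist_ok=True)
    path = os.path.join(log_dir, f'failures_{datetime.now():%Y%m%d_%H%M%S}.json')
    with open(path, 'w') as f:
        json.dump(failures, f, indent=2)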

Common Use Cases

  • Price monitoring: Track competitor pricing (see the sketch after this list)
  • Lead generation: Collect business directories
  • Market research: Product availability and reviews
  • News aggregation: Industry news monitoring
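
For price monitoring, the daily Parquet snapshots written by the pipeline above make day-over-day comparison a simple join. A sketch under that assumption (the file paths and the 5% threshold are illustrative):

import pandas as pd

def price_changes(today_path, yesterday_path, threshold=0.05):
    """Flag products whose price moved more than the threshold between snapshots."""
    today = pd.read_parquet(today_path)
    yesterday = pd.read_parquet(yesterday_path)
    merged = today.merge(yesterday, on='name', suffixes=('_new', '_old'))
    merged['change'] = (merged['price_new'] - merged['price_old']) / merged['price_old']
    return merged[merged['change'].abs() > threshold][['name', 'price_old', 'price_new', 'change']]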

Web scraping is a means to an end. Focus on the business value of the data you're collecting.
