Web Scraping and API Integration

Most interesting data lives on the web, but it’s not always available in convenient CSV files. Learning to extract data from websites and APIs opens up vast sources of information for your analyses. The key is doing this responsibly and efficiently while respecting website terms of service and rate limits.

Web scraping and API integration require different approaches depending on the data source, but both follow similar patterns: make requests, parse responses, handle errors, and respect the server’s resources.

API Integration Fundamentals

APIs provide structured access to data and are generally preferred over web scraping when available. They’re more reliable, faster, and less likely to break when websites change their design.

import requests
import pandas as pd
import json
import time
from typing import Dict, List, Optional

class APIClient:
    """Generic API client with rate limiting and error handling."""
    
    def __init__(self, base_url: str, api_key: Optional[str] = None, 
                 rate_limit: float = 1.0):
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.rate_limit = rate_limit  # Minimum number of seconds between requests
        self.last_request_time = 0
        
    def _wait_for_rate_limit(self):
        """Ensure we don't exceed rate limits."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.rate_limit:
            time.sleep(self.rate_limit - elapsed)
        self.last_request_time = time.time()
    
    def make_request(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """Make API request with error handling."""
        self._wait_for_rate_limit()
        
        url = f"{self.base_url}/{endpoint.lstrip('/')}"
        headers = {}
        
        if self.api_key:
            headers['Authorization'] = f'Bearer {self.api_key}'
        
        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            return response.json()
            
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            return {}

# Example: Weather data API (using a hypothetical service)
def fetch_weather_data(cities: List[str]) -> pd.DataFrame:
    """Fetch weather data for multiple cities."""
    
    # This is a mock example - replace with actual API
    weather_data = []
    
    for city in cities:
        # Simulate API response
        mock_data = {
            'city': city,
            'temperature': 20 + sum(ord(c) for c in city) % 20,  # Deterministic mock temperature
            'humidity': 40 + sum(ord(c) for c in city) % 40,     # Deterministic mock humidity
            'timestamp': pd.Timestamp.now()
        }
        weather_data.append(mock_data)
        
        # Simulate rate limiting
        time.sleep(0.1)
    
    return pd.DataFrame(weather_data)

# Fetch data for multiple cities
cities = ['New York', 'London', 'Tokyo', 'Sydney']
weather_df = fetch_weather_data(cities)
print("Weather Data:")
print(weather_df)
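
The APIClient class above isn't exercised by the mock example, so here is a sketch of how it could be used against a hypothetical JSON API. The base URL, endpoint name, and the 'results' field are placeholders, not a real service.

# Hypothetical usage of APIClient -- URL, endpoint, and response fields are placeholders
client = APIClient('https://api.example.com/v1', api_key='YOUR_API_KEY', rate_limit=1.0)
response = client.make_request('measurements', params={'city': 'London', 'units': 'metric'})

# If the (hypothetical) response wraps records in a 'results' list, tabulate them
if response.get('results'):
    api_df = pd.DataFrame(response['results'])
    print(api_df.head())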

Web Scraping with BeautifulSoup

When APIs aren’t available, web scraping extracts data directly from HTML pages. BeautifulSoup makes parsing HTML straightforward, but you need to handle dynamic content and respect website policies.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import Optional
from urllib.parse import urljoin, urlparse

class WebScraper:
    """Web scraper with polite crawling practices."""
    
    def __init__(self, delay: float = 1.0):
        self.delay = delay
        self.session = requests.Session()
        # Set a user agent to identify your scraper
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; DataScienceScraper/1.0)'
        })
    
    def get_page(self, url: str) -> Optional[BeautifulSoup]:
        """Fetch and parse a web page."""
        try:
            time.sleep(self.delay)  # Be polite
            response = self.session.get(url)
            response.raise_for_status()
            
            return BeautifulSoup(response.content, 'html.parser')
            
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return None
    
    def extract_table_data(self, soup: BeautifulSoup, 
                          table_selector: str) -> pd.DataFrame:
        """Extract data from HTML tables."""
        table = soup.select_one(table_selector)
        if not table:
            return pd.DataFrame()
        
        # Extract headers
        headers = []
        header_row = table.select_one('thead tr, tr:first-child')
        if header_row:
            headers = [th.get_text(strip=True) for th in 
                      header_row.select('th, td')]
        
        # Extract data rows (skip the first row when it supplied the headers)
        rows = []
        data_rows = table.select('tr')[1:] if headers else table.select('tr')
        
        for row in data_rows:
            cells = [td.get_text(strip=True) for td in row.select('td, th')]
            if cells:  # Skip empty rows
                rows.append(cells)
        
        # Create DataFrame
        if headers and rows:
            return pd.DataFrame(rows, columns=headers)
        elif rows:
            return pd.DataFrame(rows)
        else:
            return pd.DataFrame()

# Example: Scraping a hypothetical data table
def scrape_sample_data():
    """Demonstrate web scraping techniques."""
    
    # Create mock HTML content for demonstration
    mock_html = """
    <html>
    <body>
        <table id="data-table">
            <thead>
                <tr>
                    <th>Product</th>
                    <th>Price</th>
                    <th>Rating</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Laptop A</td>
                    <td>$999</td>
                    <td>4.5</td>
                </tr>
                <tr>
                    <td>Laptop B</td>
                    <td>$1299</td>
                    <td>4.2</td>
                </tr>
            </tbody>
        </table>
    </body>
    </html>
    """
    
    soup = BeautifulSoup(mock_html, 'html.parser')
    scraper = WebScraper()
    
    # Extract table data
    df = scraper.extract_table_data(soup, '#data-table')
    print("Scraped Data:")
    print(df)
    
    return df

scraped_df = scrape_sample_data()
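
For simple static tables, pandas can also parse HTML directly with read_html, which returns one DataFrame per table it finds (it requires lxml or html5lib to be installed). A minimal sketch on a small inline table, wrapped in StringIO as recent pandas versions expect for literal HTML:

from io import StringIO

sample_html = """
<table>
  <tr><th>Product</th><th>Price</th><th>Rating</th></tr>
  <tr><td>Laptop A</td><td>$999</td><td>4.5</td></tr>
  <tr><td>Laptop B</td><td>$1299</td><td>4.2</td></tr>
</table>
"""

# read_html parses every <table> in the document and returns a list of DataFrames
tables = pd.read_html(StringIO(sample_html))
print(tables[0])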

Handling Dynamic Content with Selenium

Modern websites often load content dynamically with JavaScript. Selenium automates a real browser to handle these scenarios, though it’s slower than direct HTTP requests.

# Note: Requires 'pip install selenium' and appropriate webdriver
try:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    
    def setup_driver(headless: bool = True):
        """Set up Chrome driver with options."""
        options = Options()
        if headless:
            options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        
        # Note: Requires a matching chromedriver on PATH (Selenium 4.6+ can manage this automatically)
        return webdriver.Chrome(options=options)
    
    def scrape_dynamic_content(url: str) -> List[Dict]:
        """Scrape content that loads dynamically."""
        driver = setup_driver()
        
        try:
            driver.get(url)
            
            # Wait for content to load
            wait = WebDriverWait(driver, 10)
            
            # Example: Wait for specific elements to appear
            elements = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "data-item"))
            )
            
            # Extract data from elements
            data = []
            for element in elements:
                item_data = {
                    'text': element.text,
                    'href': element.get_attribute('href'),
                    'class': element.get_attribute('class')
                }
                data.append(item_data)
            
            return data
            
        finally:
            driver.quit()
    
    print("Selenium setup available for dynamic content scraping")
    
except ImportError:
    print("Selenium not installed - skipping dynamic content scraping")

Data Pipeline for Web Data

Collecting web data is just the first step. Building robust pipelines ensures data quality and handles the inevitable changes in source websites or APIs.

import sqlite3
from datetime import datetime
import logging

class DataPipeline:
    """Pipeline for collecting, processing, and storing web data."""
    
    def __init__(self, db_path: str = 'web_data.db'):
        self.db_path = db_path
        self.setup_database()
        self.setup_logging()
    
    def setup_database(self):
        """Initialize database tables."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS scraped_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source TEXT NOT NULL,
                    data_type TEXT NOT NULL,
                    content TEXT NOT NULL,
                    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    processed BOOLEAN DEFAULT FALSE
                )
            ''')
    
    def setup_logging(self):
        """Configure logging for pipeline monitoring."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('scraping.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
    
    def collect_data(self, sources: List[Dict]):
        """Collect data from multiple sources."""
        for source in sources:
            try:
                self.logger.info(f"Collecting data from {source['name']}")
                
                if source['type'] == 'api':
                    data = self.collect_api_data(source)
                elif source['type'] == 'scrape':
                    data = self.collect_scraped_data(source)
                else:
                    self.logger.warning(f"Unknown source type: {source['type']}")
                    continue
                
                self.store_data(source['name'], source['type'], data)
                
            except Exception as e:
                self.logger.error(f"Failed to collect from {source['name']}: {e}")
    
    def collect_api_data(self, source: Dict) -> str:
        """Collect data from API source."""
        # Mock API data collection
        return json.dumps({
            'timestamp': datetime.now().isoformat(),
            'source': source['name'],
            'data': f"Mock API data from {source['url']}"
        })
    
    def collect_scraped_data(self, source: Dict) -> str:
        """Collect data from web scraping."""
        # Mock scraping data collection
        return json.dumps({
            'timestamp': datetime.now().isoformat(),
            'source': source['name'],
            'data': f"Mock scraped data from {source['url']}"
        })
    
    def store_data(self, source: str, data_type: str, content: str):
        """Store collected data in database."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute(
                'INSERT INTO scraped_data (source, data_type, content) VALUES (?, ?, ?)',
                (source, data_type, content)
            )
    
    def process_stored_data(self) -> pd.DataFrame:
        """Process and clean stored data."""
        with sqlite3.connect(self.db_path) as conn:
            df = pd.read_sql_query(
                'SELECT * FROM scraped_data WHERE processed = FALSE',
                conn
            )
        
        # Process data (clean, transform, validate)
        processed_data = []
        for _, row in df.iterrows():
            try:
                content = json.loads(row['content'])
                processed_item = {
                    'id': row['id'],
                    'source': row['source'],
                    'timestamp': content['timestamp'],
                    'processed_at': datetime.now().isoformat()
                }
                processed_data.append(processed_item)
                
            except json.JSONDecodeError:
                self.logger.warning(f"Invalid JSON in row {row['id']}")
        
        # Mark as processed
        if processed_data:
            with sqlite3.connect(self.db_path) as conn:
                ids = [item['id'] for item in processed_data]
                placeholders = ','.join(['?'] * len(ids))
                conn.execute(
                    f'UPDATE scraped_data SET processed = TRUE WHERE id IN ({placeholders})',
                    ids
                )
        
        return pd.DataFrame(processed_data)

# Example usage
pipeline = DataPipeline()

# Define data sources
sources = [
    {
        'name': 'weather_api',
        'type': 'api',
        'url': 'https://api.weather.com/v1/current'
    },
    {
        'name': 'news_site',
        'type': 'scrape',
        'url': 'https://example-news.com/headlines'
    }
]

# Collect and process data
pipeline.collect_data(sources)
processed_df = pipeline.process_stored_data()

print("Processed data:")
print(processed_df)

Best Practices and Ethics

Web scraping and API usage come with responsibilities. Always check robots.txt files, respect rate limits, and consider the impact of your requests on server resources. Many websites provide APIs specifically to avoid the need for scraping.
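
Checking robots.txt doesn't have to be manual: the standard library's urllib.robotparser can answer whether a given user agent may fetch a URL. A minimal sketch (the URL below is a placeholder):

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def is_allowed(url: str, user_agent: str = 'DataScienceScraper') -> bool:
    """Check whether robots.txt permits fetching this URL."""
    parsed = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    try:
        parser.read()  # Fetch and parse robots.txt
    except Exception:
        return False  # If robots.txt can't be read, err on the side of caution
    return parser.can_fetch(user_agent, url)

print(is_allowed('https://example.com/some/page'))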

Cache responses when possible to avoid repeated requests for the same data. Monitor your scrapers for failures and implement retry logic with exponential backoff. Document your data sources and collection methods for reproducibility.
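
Retry logic with exponential backoff doesn't need to be hand-rolled: a requests Session can delegate it to urllib3's Retry via an HTTPAdapter. A minimal sketch, assuming the status codes listed are the ones worth retrying and urllib3 >= 1.26 (which uses the allowed_methods spelling):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(retries: int = 3, backoff_factor: float = 1.0) -> requests.Session:
    """Build a Session that retries failed requests with exponential backoff."""
    retry = Retry(
        total=retries,
        backoff_factor=backoff_factor,               # Delay between retries grows exponentially
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on rate limits and server errors
        allowed_methods=['GET', 'HEAD']
    )
    adapter = HTTPAdapter(max_retries=retry)
    session = requests.Session()
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session

For caching, keying a local dictionary or on-disk store by request URL is often enough; libraries such as requests-cache can also patch a Session to do this transparently.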

Most importantly, respect copyright and terms of service. Just because data is publicly visible doesn’t mean it’s free to use for any purpose. When in doubt, contact the website owner or look for official data sharing agreements.

In our next part, we’ll explore database integration and SQL for data science, learning how to work with large datasets that don’t fit in memory and how to perform analysis directly in databases.