Time Series Analysis and Forecasting
Time series data has unique characteristics that standard machine learning approaches often miss. Temporal dependencies, seasonality, and trends require specialized techniques that account for the sequential nature of observations. Ignoring these patterns leads to poor forecasts and misleading insights.
The challenge with time series isn’t just predicting future values—it’s understanding the underlying patterns that drive change over time and distinguishing signal from noise in temporal data.
Understanding Time Series Components
Every time series can be decomposed into trend, seasonality, and residual components. Understanding these elements helps you choose appropriate modeling approaches and interpret results correctly.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')
# Build a synthetic daily series whose trend/seasonal/noise makeup is known,
# then recover those components with an additive seasonal decomposition.
np.random.seed(42)
dates = pd.date_range('2020-01-01', periods=365*3, freq='D')
day_idx = np.arange(len(dates))
# Linear upward trend from 100 to 200 across the full three-year span.
trend = np.linspace(100, 200, len(dates))
# Two superimposed sinusoidal seasonalities: annual and weekly.
annual_cycle = 20 * np.sin(2 * np.pi * day_idx / 365.25)
weekly_cycle = 5 * np.sin(2 * np.pi * day_idx / 7)
# Gaussian noise, sd=10.
noise = np.random.normal(0, 10, len(dates))
# Assemble the observed series and index it by date.
ts_df = pd.DataFrame(
    {'value': trend + annual_cycle + weekly_cycle + noise},
    index=dates,
)
ts_df.index.name = 'date'
# Additive decomposition with a yearly period (data is daily).
decomposition = seasonal_decompose(ts_df['value'], model='additive', period=365)
# One panel per component, stacked vertically.
fig, axes = plt.subplots(4, 1, figsize=(12, 10))
panels = [
    (ts_df['value'], 'Original Time Series'),
    (decomposition.trend, 'Trend'),
    (decomposition.seasonal, 'Seasonal'),
    (decomposition.resid, 'Residual'),
]
for ax, (series, panel_title) in zip(axes, panels):
    series.plot(ax=ax, title=panel_title)
plt.tight_layout()
plt.show()
print("Time series components identified:")
print(f"Trend range: {decomposition.trend.min():.1f} to {decomposition.trend.max():.1f}")
print(f"Seasonal range: {decomposition.seasonal.min():.1f} to {decomposition.seasonal.max():.1f}")
Stationarity Testing and Transformation
Most time series models assume stationarity—a mean, variance, and autocovariance structure that do not change over time. Testing for stationarity and applying appropriate transformations is crucial for reliable forecasting.
def check_stationarity(timeseries, title):
    """Run the Augmented Dickey-Fuller test on *timeseries* and report the result.

    Prints the ADF statistic, p-value, and critical values under the given
    *title*, then returns True when the unit-root null is rejected at the
    5% level (i.e. the series looks stationary).
    """
    # NaNs (e.g. from differencing) must be dropped before the ADF test.
    adf_result = adfuller(timeseries.dropna())
    adf_stat, p_value = adf_result[0], adf_result[1]
    print(f'\n{title}:')
    print(f'ADF Statistic: {adf_stat:.6f}')
    print(f'p-value: {p_value:.6f}')
    print(f'Critical Values:')
    # adf_result[4] maps confidence level -> critical value.
    for level, threshold in adf_result[4].items():
        print(f'\t{level}: {threshold:.3f}')
    stationary = p_value <= 0.05
    if stationary:
        print("Series is stationary (reject null hypothesis)")
    else:
        print("Series is non-stationary (fail to reject null hypothesis)")
    return stationary
# Test the raw series first; difference only when the ADF test says
# the level series is non-stationary.
is_stationary = check_stationarity(ts_df['value'], "Original Series")
if not is_stationary:
    # First and second differences to strip the trend.
    ts_df['diff1'] = ts_df['value'].diff()
    ts_df['diff2'] = ts_df['diff1'].diff()
    for column, label in (('diff1', 'First Difference'),
                          ('diff2', 'Second Difference')):
        check_stationarity(ts_df[column], label)
    # Visual comparison: level series vs. both differenced versions.
    fig, axes = plt.subplots(3, 1, figsize=(12, 8))
    for ax, (column, panel_title) in zip(axes, [('value', 'Original Series'),
                                                ('diff1', 'First Difference'),
                                                ('diff2', 'Second Difference')]):
        ts_df[column].plot(ax=ax, title=panel_title)
    plt.tight_layout()
    plt.show()
ARIMA Modeling for Forecasting
ARIMA (AutoRegressive Integrated Moving Average) models capture temporal dependencies through autoregressive terms, differencing for stationarity, and moving average terms that model dependence on past forecast errors.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Determine ARIMA parameters by inspecting ACF and PACF of the
# stationary (first-differenced) series.
fig, axes = plt.subplots(2, 1, figsize=(12, 6))
stationary_series = ts_df['diff1'].dropna()
plot_acf(stationary_series, ax=axes[0], lags=40)
plot_pacf(stationary_series, ax=axes[1], lags=40)
plt.tight_layout()
plt.show()
# Fit ARIMA model.
# Using (1,1,1) as starting point - adjust based on ACF/PACF plots.
model = ARIMA(ts_df['value'], order=(1,1,1))
fitted_model = model.fit()
print("ARIMA Model Summary:")
print(fitted_model.summary())
# Generate forecasts. Fix: a single get_forecast() call yields both the
# point forecast and its confidence interval; the original called
# forecast() AND get_forecast(), computing the prediction twice.
forecast_steps = 30
forecast_result = fitted_model.get_forecast(steps=forecast_steps)
forecast = forecast_result.predicted_mean
forecast_ci = forecast_result.conf_int()
# Build a daily date index continuing from the last observation.
last_date = ts_df.index[-1]
forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                               periods=forecast_steps, freq='D')
# Plot the last 100 observed days together with the forecast and its band.
plt.figure(figsize=(12, 6))
ts_df['value'][-100:].plot(label='Actual', color='blue')
plt.plot(forecast_dates, forecast, label='Forecast', color='red')
plt.fill_between(forecast_dates,
                 forecast_ci.iloc[:, 0],
                 forecast_ci.iloc[:, 1],
                 color='red', alpha=0.3, label='Confidence Interval')
plt.legend()
plt.title('ARIMA Forecast')
plt.show()
Seasonal ARIMA (SARIMA) for Complex Patterns
When data exhibits seasonal patterns, SARIMA models extend ARIMA to handle both non-seasonal and seasonal components explicitly.
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Fit SARIMA model with seasonal component.
# (p,d,q) x (P,D,Q,s) where s is the seasonal period.
# NOTE(review): s=365 makes the state-space representation enormous and
# fitting extremely slow/memory-hungry on daily data; a weekly period (s=7)
# or Fourier seasonal terms is usually more practical — confirm before
# running this on real data.
sarima_model = SARIMAX(ts_df['value'],
                       order=(1,1,1),
                       seasonal_order=(1,1,1,365))
fitted_sarima = sarima_model.fit(disp=False)
print("SARIMA Model Summary:")
print(fitted_sarima.summary())
# Generate SARIMA forecasts. Fix: one get_forecast() call provides both the
# point forecast and the CI; the original also called forecast(), doing the
# same prediction work twice.
sarima_result = fitted_sarima.get_forecast(steps=forecast_steps)
sarima_forecast = sarima_result.predicted_mean
sarima_ci = sarima_result.conf_int()
# Overlay both models' forecasts on the recent actuals for comparison.
plt.figure(figsize=(12, 6))
ts_df['value'][-100:].plot(label='Actual', color='blue')
plt.plot(forecast_dates, forecast, label='ARIMA Forecast', color='red')
plt.plot(forecast_dates, sarima_forecast, label='SARIMA Forecast', color='green')
plt.legend()
plt.title('ARIMA vs SARIMA Forecasts')
plt.show()
Model Evaluation and Validation
Time series model evaluation requires special consideration for temporal dependencies. Use time-aware cross-validation and appropriate metrics for forecast accuracy.
from sklearn.metrics import mean_absolute_error, mean_squared_error
def time_series_cv_score(data, model_func, n_splits=5, test_size=30):
    """Score a forecasting model with expanding-window cross-validation.

    Parameters
    ----------
    data : 1-D sequence (list, numpy array, or pandas Series)
        The full time series in temporal order.
    model_func : callable
        Takes a training slice of ``data`` and returns a fitted model
        exposing ``forecast(steps=...)``.
    n_splits : int
        Number of train/test splits; each successive split trains on a
        longer prefix of ``data`` (expanding window).
    test_size : int
        Length of each held-out test window.

    Returns
    -------
    list of dict
        One ``{'mae': ..., 'rmse': ...}`` entry per split actually run;
        splits whose training prefix would be shorter than 100 points
        are skipped.
    """
    scores = []
    total_size = len(data)
    for i in range(n_splits):
        # Expanding window: later splits see more training history.
        train_size = total_size - (n_splits - i) * test_size
        if train_size < 100:  # Minimum training size.
            continue
        train_data = data[:train_size]
        # Slicing clamps at the end of the sequence, so no extra
        # short-window handling is needed (the original had a dead branch
        # reassigning the identical slice here).
        test_data = data[train_size:train_size + test_size]
        # Fit the candidate model and forecast over the test window.
        model = model_func(train_data)
        prediction = model.forecast(steps=len(test_data))
        # Compare positionally, not by pandas index alignment — the
        # forecast's index rarely matches the held-out slice's.
        actual = np.asarray(test_data, dtype=float)
        predicted = np.asarray(prediction, dtype=float)
        errors = actual - predicted
        mae = float(np.mean(np.abs(errors)))
        rmse = float(np.sqrt(np.mean(errors ** 2)))
        scores.append({'mae': mae, 'rmse': rmse})
    return scores
# Define model fitting functions
def fit_arima(data):
    """Fit an ARIMA(1,1,1) model to *data* and return the fitted results.

    Fix: ``statsmodels.tsa.arima.model.ARIMA.fit()`` accepts no ``disp``
    argument (that belonged to the removed ``arima_model`` API), so passing
    ``disp=False`` raises TypeError on modern statsmodels.
    """
    return ARIMA(data, order=(1,1,1)).fit()
def fit_sarima(data):
    """Fit a SARIMA(1,1,1)x(1,1,1,365) model to *data* and return the fitted results.

    NOTE(review): the 365-step seasonal period makes fitting extremely slow
    and memory-hungry on daily data — confirm this is intentional before
    running at scale.
    """
    return SARIMAX(data, order=(1,1,1), seasonal_order=(1,1,1,365)).fit(disp=False)
# Evaluate both models with the same expanding-window cross-validation,
# then print their average errors side by side.
def _summarize_cv(label, cv_scores):
    """Print the average MAE/RMSE for one model's CV scores and return them."""
    print(label)
    avg_mae = np.mean([entry['mae'] for entry in cv_scores])
    avg_rmse = np.mean([entry['rmse'] for entry in cv_scores])
    print(f"Average MAE: {avg_mae:.3f}")
    print(f"Average RMSE: {avg_rmse:.3f}")
    return avg_mae, avg_rmse

arima_scores = time_series_cv_score(ts_df['value'], fit_arima)
sarima_scores = time_series_cv_score(ts_df['value'], fit_sarima)
print("Cross-Validation Results:")
arima_mae, arima_rmse = _summarize_cv("\nARIMA Model:", arima_scores)
sarima_mae, sarima_rmse = _summarize_cv("\nSARIMA Model:", sarima_scores)
Advanced Time Series Techniques
Modern time series analysis includes machine learning approaches that can capture complex non-linear patterns while handling multiple variables and external factors.
# Prophet fits trend plus multiple seasonalities automatically; it is an
# optional third-party dependency, so the whole analysis is wrapped in a
# try block and skipped when the package is missing.
try:
    from prophet import Prophet

    # Prophet requires a two-column frame named 'ds' (timestamps) and 'y' (values).
    history = ts_df.reset_index()
    history.columns = ['ds', 'y']
    # Daily data: yearly and weekly seasonality on, sub-daily off.
    fb_model = Prophet(yearly_seasonality=True,
                       weekly_seasonality=True,
                       daily_seasonality=False)
    fb_model.fit(history)
    # Predict over the observed range plus forecast_steps future days.
    horizon = fb_model.make_future_dataframe(periods=forecast_steps)
    fb_forecast = fb_model.predict(horizon)
    # Overall forecast plot.
    fig = fb_model.plot(fb_forecast)
    plt.title('Prophet Forecast')
    plt.show()
    # Per-component (trend / seasonality) plots.
    fig = fb_model.plot_components(fb_forecast)
    plt.show()
    print("Prophet model fitted successfully")
except ImportError:
    print("Prophet not installed - skipping Prophet analysis")
Time series analysis requires understanding both statistical theory and domain knowledge about the processes generating your data. The key is matching your modeling approach to the specific characteristics of your time series—trend, seasonality, and noise patterns.
In our next part, we’ll explore web scraping and API integration, learning how to collect data from online sources and integrate external data into your analysis pipeline.