From: https://github.com/ksatola
Version: 0.1.0
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../src')
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
%matplotlib inline
from model import (
    get_pm25_data_for_modelling,
    get_best_arima_params_for_time_series,
    split_df_for_ts_modelling_offset,
    predict_ts,
    fit_model
)
from measure import (
    get_rmse,
    walk_forward_ts_model_validation2,
    get_mean_folds_rmse_for_n_prediction_points,
    prepare_data_for_visualization
)
from plot import (
    visualize_results,
    plot_ts_corr
)
from utils import (
    get_datetime_identifier
)
from logger import logger
model_name = 'MA'
The moving average (MA) model is a linear regression on the lagged residual errors.
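Formally, an MA(q) forecast is the series mean plus a weighted sum of the last q residual errors: y_t = mu + eps_t + theta_1*eps_{t-1} + ... + theta_q*eps_{t-q}. As a minimal illustrative sketch (the numbers below are made up, not taken from this dataset):
# Hypothetical MA(2) one-step forecast: mean level plus weighted recent residual errors
mu = 50.0                      # assumed mean level
theta = [0.6, 0.3]             # assumed MA coefficients theta_1, theta_2
recent_resid = [4.2, -1.5]     # most recent residual errors eps_{t-1}, eps_{t-2}
y_hat = mu + theta[0] * recent_resid[0] + theta[1] * recent_resid[1]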
dfh = get_pm25_data_for_modelling('ts', 'h')
dfh.head()
df = dfh.copy()
# Define the past/future cutoff point as an offset from the end of the data (1 year)
cut_off_offset = 365*24 # for hourly data
#cut_off_offset = 365 # for daily data
# Predict for X points
n_pred_points = 24 # for hourly data
#n_pred_points = 7 # for daily data
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
period = 'H' # for hourly data
#period = 'D' # for daily data
# Create train / test datasets (with the offset of cut_off_offset datapoints from the end)
# period=None because SARIMAX needs the index to be a DatetimeIndex, not a PeriodIndex
df_train, df_test = split_df_for_ts_modelling_offset(data=df, cut_off_offset=cut_off_offset, period=None)
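split_df_for_ts_modelling_offset is a project helper from ../src; a minimal sketch of an offset-based split, assuming a time-sorted DataFrame (this is not the project's exact implementation):
# Hedged sketch of an offset-based train/test split
def split_by_offset(data, cut_off_offset):
    train = data.iloc[:-cut_off_offset].copy()   # everything before the cutoff
    test = data.iloc[-cut_off_offset:].copy()    # last cut_off_offset observations
    return train, test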
In statistical time series models, fitting the model means estimating its parameters. In the case of an MA model, the only order to choose is q, the number of lagged residual errors included.
plot_ts_corr(df_train['pm25'])
%%time
# Find best parameters (grid-search)
best_results = get_best_arima_params_for_time_series(data=df_train,
                                                     seasonal=False,
                                                     max_param_range_p=0,
                                                     max_param_range_d=0,
                                                     max_param_range_q=5)  # MA model
SARIMAX(0, 0, 0) - AIC:950035.119919657
SARIMAX(0, 0, 1) - AIC:843344.0396508672
SARIMAX(0, 0, 2) - AIC:772685.05307077
SARIMAX(0, 0, 3) - AIC:726435.4649042541
SARIMAX(0, 0, 4) - AIC:699267.5669061132
SARIMAX(0, 0, 5) - AIC:679220.6801305263
Best model is ARIMA(0, 0, 5) with AIC of 679220.6801305263
CPU times: user 3min 39s, sys: 24.4 s, total: 4min 4s
Wall time: 1min 7s
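get_best_arima_params_for_time_series is a project helper; a rough sketch of the AIC-driven search it performs over the MA order q (an assumption about its internals, shown only for clarity):
# Hedged sketch: choose the MA order q with the lowest AIC (p and d fixed at 0)
def grid_search_ma_order(series, max_q=5):
    best_q, best_aic = None, float('inf')
    for q in range(max_q + 1):
        aic = SARIMAX(endog=series, order=(0, 0, q)).fit(disp=False).aic
        print(f'SARIMAX(0, 0, {q}) - AIC:{aic}')
        if aic < best_aic:
            best_q, best_aic = q, aic
    return best_q, best_aic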
%%time
# Fit the model using the best parameters found above
p = 0 # (AR)
d = 0 # differencing
q = 5 # (MA)
model = SARIMAX(endog=df_train, order=(p, d, q))
model_fitted = model.fit()
model_fitted
# Estimated parameters
print(model_fitted.summary())
# Fitted coefficients and training residuals
print(f'The coefficients of the model are:\n {model_fitted.params}')
print(f'The residual errors during training of the model are:\n {model_fitted.resid}')
# Evaluate model quality
import statsmodels.api as sm
res = model_fitted.resid
fig,ax = plt.subplots(2,1,figsize=(15,8))
fig = sm.graphics.tsa.plot_acf(res, lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(res, lags=50, ax=ax[1])
plt.show();
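As an optional extra diagnostic (not in the original notebook), a Ljung-Box test can quantify the autocorrelation left in the residuals:
# Optional addition: Ljung-Box test on the residuals; small p-values indicate remaining autocorrelation
from statsmodels.stats.diagnostic import acorr_ljungbox
print(acorr_ljungbox(res, lags=[24]))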
# Evaluate model quality
model_fitted.plot_diagnostics(figsize=(20, 10))
plt.show();
The MA model alone does not fit hourly data well, as it does not capture the multi-level seasonality (yearly, weekly, daily) in the data. Increasing the MA order does not help either.
Making a prediction requires retrieving the MA coefficients from the fitted model, applying them to the lagged residual errors, and calling the custom predict() function.
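A minimal sketch of that idea (an assumption about how such a predict step could look, not the project's exact implementation):
# Hedged sketch of a manual one-step MA(q) forecast; assumes no trend term, so
# model_fitted.params is ordered [ma.L1, ..., ma.Lq, sigma2] and the mean level is zero
def manual_ma_one_step(fitted, q):
    theta = np.asarray(fitted.params[:q])              # MA coefficients theta_1..theta_q
    last_resid = np.asarray(fitted.resid[-q:])[::-1]   # eps_t, eps_{t-1}, ..., eps_{t-q+1}
    return float(np.dot(theta, last_resid))            # expected value of eps_{t+1} is zero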
%%time
# Validate result on test
# Creates 365*24*24 models for hourly data, or 365*7 models for daily data
fold_results = walk_forward_ts_model_validation2(data=df,
                                                 col_name='pm25',
                                                 model_type='MA',
                                                 p=p,
                                                 d=d,
                                                 q=q,
                                                 cut_off_offset=cut_off_offset,
                                                 n_pred_points=n_pred_points,
                                                 n_folds=-1,
                                                 period=period)
print(len(fold_results))
print(fold_results[0])
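walk_forward_ts_model_validation2 is a project helper; a rough sketch of the walk-forward idea behind it (an assumption, shown only to explain the procedure): refit on all data up to each test point, forecast n_pred_points ahead, and collect the errors.
# Hedged sketch of walk-forward validation over the test period
def walk_forward_sketch(series, order, cut_off_offset, n_pred_points):
    folds = []
    start = len(series) - cut_off_offset
    for i in range(start, len(series) - n_pred_points):
        fitted = SARIMAX(endog=series[:i], order=order).fit(disp=False)
        pred = fitted.forecast(steps=n_pred_points)
        obs = series[i:i + n_pred_points]
        folds.append(pd.DataFrame({'observed': obs.values,
                                   'predicted': pred.values,
                                   'abs_error': np.abs(obs.values - pred.values)},
                                  index=obs.index))
    return folds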
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ts_{model_name}_results_h_{timestamp}.joblib'
#dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
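get_mean_folds_rmse_for_n_prediction_points is a project helper; a hedged sketch of one way to compute an error per forecast horizon across folds (assuming each fold DataFrame has 'observed' and 'predicted' columns as printed above):
# Hedged sketch: RMSE across folds for each forecast horizon 1..n_pred_points
def rmse_per_horizon(fold_results, n_pred_points):
    out = []
    for k in range(n_pred_points):
        sq_errs = [(f['observed'].iloc[k] - f['predicted'].iloc[k]) ** 2
                   for f in fold_results if len(f) > k]
        out.append(float(np.sqrt(np.mean(sq_errs))))
    return out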
# Show forecasts for n-th point in the future
show_n_points_of_forecasts = [1, 12, 24] # for hourly data
#show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
#base_file_path = f'images/pm25_obs_vs_pred_365_d_ts_{model_name}' # for daily data
fold_results[0].index
# We need to convert to datetime index for plotting
# https://stackoverflow.com/questions/29394730/converting-periodindex-to-datetimeindex
for i in range(0, len(fold_results)):
    if not isinstance(fold_results[i].index, pd.DatetimeIndex):
        fold_results[i].index = fold_results[i].index.to_timestamp()
fold_results[0].index
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
                  start_end_dates=start_end_dates,
                  plot_types=plot_types,
                  base_file_path=base_file_path,
                  fold_results=fold_results,
                  n_pred_points=n_pred_points,
                  cut_off_offset=cut_off_offset,
                  model_name=model_name,
                  timestamp=timestamp)
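visualize_results and the two plot types are project helpers from ../src; a hedged sketch in the spirit of plot type 0 (plot_observed_vs_predicted), assuming the fold DataFrames and DatetimeIndex prepared above:
# Hedged sketch: observed vs. predicted values for the n-th forecast point, zoomed to a date range
def plot_obs_vs_pred_sketch(fold_results, nth_point, start, end):
    obs = pd.Series({f.index[nth_point - 1]: f['observed'].iloc[nth_point - 1]
                     for f in fold_results if len(f) >= nth_point})
    pred = pd.Series({f.index[nth_point - 1]: f['predicted'].iloc[nth_point - 1]
                      for f in fold_results if len(f) >= nth_point})
    ax = obs[start:end].plot(figsize=(15, 5), label='observed')
    pred[start:end].plot(ax=ax, label=f'predicted ({nth_point} point(s) ahead)')
    ax.legend()
    plt.show()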
dfd = get_pm25_data_for_modelling('ts', 'd')
dfd.head()
df = dfd.copy()
# Define the past/future cutoff point as an offset from the end of the data (1 year)
#cut_off_offset = 365*24 # for hourly data
cut_off_offset = 365 # for daily data
# Predict for X points
#n_pred_points = 24 # for hourly data
n_pred_points = 7 # for daily data
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
#period = 'H' # for hourly data
period = 'D' # for daily data
# Create train / test datasets (with the offset of cut_off_offset datapoints from the end)
# period=None because SARIMAX needs the index to be a DatetimeIndex, not a PeriodIndex
df_train, df_test = split_df_for_ts_modelling_offset(data=df, cut_off_offset=cut_off_offset, period=None)
plot_ts_corr(df_train['pm25'])
%%time
# Find best parameters (grid-search)
best_results = get_best_arima_params_for_time_series(data=df_train,
                                                     seasonal=False,
                                                     max_param_range_p=0,
                                                     max_param_range_d=0,
                                                     max_param_range_q=5)  # MA model
SARIMAX(0, 0, 0) - AIC:38983.00813026588
SARIMAX(0, 0, 1) - AIC:36059.690659735024
SARIMAX(0, 0, 2) - AIC:34918.45008233713
SARIMAX(0, 0, 3) - AIC:34409.81571434598
SARIMAX(0, 0, 4) - AIC:34096.090358520945
SARIMAX(0, 0, 5) - AIC:33892.384328226864
Best model is ARIMA(0, 0, 5) with AIC of 33892.384328226864
CPU times: user 7.08 s, sys: 1.28 s, total: 8.36 s
Wall time: 2.07 s
%%time
# Fit the model using the best parameters found above
p = 0 # (AR)
d = 0 # differencing
q = 5 # (MA)
model = SARIMAX(endog=df_train['pm25'], order=(p, d, q))
model_fitted = model.fit()
model_fitted
# Estimated parameters
print(model_fitted.summary())
# Fitted coefficients and training residuals
print(f'The coefficients of the model are:\n {model_fitted.params}')
print(f'The residual errors during training of the model are:\n {model_fitted.resid}')
# Evaluate model quality
import statsmodels.api as sm
res = model_fitted.resid
fig,ax = plt.subplots(2,1,figsize=(15,8))
fig = sm.graphics.tsa.plot_acf(res, lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(res, lags=50, ax=ax[1])
plt.show();
# Evaluate model quality
model_fitted.plot_diagnostics(figsize=(20, 10))
plt.show();
%%time
# Validate result on test
# Creates 365*24*24 models for hourly data, or 365*7 models for daily data
fold_results = walk_forward_ts_model_validation2(data=df,
                                                 col_name='pm25',
                                                 model_type='MA',
                                                 p=p,
                                                 d=d,
                                                 q=q,
                                                 cut_off_offset=cut_off_offset,
                                                 n_pred_points=n_pred_points,
                                                 n_folds=-1,
                                                 period=period)
print(len(fold_results))
print(fold_results[0])
validation.py | 67 | walk_forward_ts_model_validation2 | 10-Jun-20 13:13:44 | INFO: (0, 0, 5)
validation.py | 71 | walk_forward_ts_model_validation2 | 10-Jun-20 13:13:44 | INFO: MA model validation started
Started fold 000000/000365 - 2020-06-10_13-13-44
Started fold 000010/000365 - 2020-06-10_13-14-45
Started fold 000020/000365 - 2020-06-10_13-15-50
Started fold 000030/000365 - 2020-06-10_13-16-47
Started fold 000040/000365 - 2020-06-10_13-17-49
Started fold 000050/000365 - 2020-06-10_13-18-45
Started fold 000060/000365 - 2020-06-10_13-19-47
Started fold 000070/000365 - 2020-06-10_13-20-41
Started fold 000080/000365 - 2020-06-10_13-21-38
Started fold 000090/000365 - 2020-06-10_13-22-34
Started fold 000100/000365 - 2020-06-10_13-23-33
Started fold 000110/000365 - 2020-06-10_13-24-30
Started fold 000120/000365 - 2020-06-10_13-25-41
Started fold 000130/000365 - 2020-06-10_13-26-43
Started fold 000140/000365 - 2020-06-10_13-27-43
Started fold 000150/000365 - 2020-06-10_13-28-42
Started fold 000160/000365 - 2020-06-10_13-29-40
Started fold 000170/000365 - 2020-06-10_13-30-38
Started fold 000180/000365 - 2020-06-10_13-31-30
Started fold 000190/000365 - 2020-06-10_13-32-24
Started fold 000200/000365 - 2020-06-10_13-33-22
Started fold 000210/000365 - 2020-06-10_13-34-21
Started fold 000220/000365 - 2020-06-10_13-35-16
Started fold 000230/000365 - 2020-06-10_13-36-15
Started fold 000240/000365 - 2020-06-10_13-37-10
Started fold 000250/000365 - 2020-06-10_13-38-05
Started fold 000260/000365 - 2020-06-10_13-39-02
Started fold 000270/000365 - 2020-06-10_13-40-02
Started fold 000280/000365 - 2020-06-10_13-41-05
Started fold 000290/000365 - 2020-06-10_13-42-07
Started fold 000300/000365 - 2020-06-10_13-43-06
Started fold 000310/000365 - 2020-06-10_13-43-59
Started fold 000320/000365 - 2020-06-10_13-44-55
Started fold 000330/000365 - 2020-06-10_13-45-49
Started fold 000340/000365 - 2020-06-10_13-46-46
Started fold 000350/000365 - 2020-06-10_13-47-46
Started fold 000360/000365 - 2020-06-10_13-48-41
365
observed predicted error abs_error
Datetime
2018-01-02 67.991848 54.709948 13.281900 13.281900
2018-01-03 16.026950 43.506800 27.479850 27.479850
2018-01-04 14.590020 32.143511 17.553490 17.553490
2018-01-05 22.094854 21.183389 0.911465 0.911465
2018-01-06 62.504217 10.350621 52.153596 52.153596
2018-01-07 43.929804 0.000209 43.929595 43.929595
2018-01-08 22.088192 0.000101 22.088090 22.088090
CPU times: user 2h 5min 53s, sys: 21min 12s, total: 2h 27min 6s
Wall time: 35min 10s
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ts_{model_name}_results_d_{timestamp}.joblib'
dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
[11.755884916201119, 18.54064776536313, 22.45893463687151, 26.14217932960894, 29.310016480446926, 30.748041340782123, 30.660515642458105]
# Show forecasts for n-th point in the future
#show_n_points_of_forecasts = [1, 12, 24] # for hourly data
show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2019-07-01')]
# for daily data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2019-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
#base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
base_file_path = f'images/pm25_obs_vs_pred_365_d_ts_{model_name}' # for daily data
fold_results[0].index
# We need to convert to datetime index for plotting
# https://stackoverflow.com/questions/29394730/converting-periodindex-to-datetimeindex
for i in range(0, len(fold_results)):
    if not isinstance(fold_results[i].index, pd.DatetimeIndex):
        fold_results[i].index = fold_results[i].index.to_timestamp()
fold_results[0].index
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
                  start_end_dates=start_end_dates,
                  plot_types=plot_types,
                  base_file_path=base_file_path,
                  fold_results=fold_results,
                  n_pred_points=n_pred_points,
                  cut_off_offset=cut_off_offset,
                  model_name=model_name,
                  timestamp=timestamp)