From: https://github.com/ksatola
Version: 0.1.0
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../src')
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARMA
import matplotlib.pyplot as plt
%matplotlib inline
from model import (
get_pm25_data_for_modelling,
get_best_arima_params_for_time_series,
get_df_for_lags_columns,
split_df_for_ts_modelling_offset,
predict_ar
)
from measure import (
get_rmse,
walk_forward_ts_model_validation,
get_mean_folds_rmse_for_n_prediction_points,
prepare_data_for_visualization
)
from plot import (
visualize_results
)
from utils import (
get_datetime_identifier
)
from logger import logger
model_name = 'AR'
Autoregression is a modeling technique for time series data that assumes a linear continuation of the series, so that previous values in the series can be used to predict future values.
The autoregression technique is similar to linear regression: you take previous data points and use them, through a simple linear model, to predict future data points. In an autoregressive model the predictors are multiple lagged copies of the series itself.
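To make the idea concrete, the sketch below (not part of the project's code) builds lag features by hand and estimates AR coefficients with ordinary least squares; the lag count of 3 and the synthetic series are illustrative assumptions.
# Minimal sketch: an AR(p) model is a linear regression on the series' own lags.
# The lag count (3) and synthetic series below are illustrative assumptions.
import numpy as np
import pandas as pd

def make_lag_features(series: pd.Series, n_lags: int) -> pd.DataFrame:
    """Return a frame with the target column 'y' and its first n_lags lagged copies."""
    frame = pd.DataFrame({'y': series})
    for lag in range(1, n_lags + 1):
        frame[f'lag_{lag}'] = series.shift(lag)
    return frame.dropna()

rng = np.random.default_rng(0)
y = pd.Series(rng.normal(size=100)).cumsum()  # synthetic random-walk series
lagged = make_lag_features(y, n_lags=3)

# Ordinary least squares on the lag columns gives the AR(3) coefficients
X = np.column_stack([np.ones(len(lagged)), lagged[['lag_1', 'lag_2', 'lag_3']].values])
coefs, *_ = np.linalg.lstsq(X, lagged['y'].values, rcond=None)
print(coefs)  # [intercept, phi_1, phi_2, phi_3]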
dfh = get_pm25_data_for_modelling('ts', 'h')
dfh.head()
df = dfh.copy()
# Define first past/future cutoff point in time offset (1 year of data)
cut_off_offset = 365*24 # for hourly data
#cut_off_offset = 365 # for daily data
# Predict for X points
n_pred_points = 24 # for hourly data
#n_pred_points = 7 # for daily data
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
period = 'H' # for hourly data
#period = 'D' # for daily data
# Create train / test datasets (with the offset of cut_off_offset datapoints from the end)
df_train, df_test = split_df_for_ts_modelling_offset(data=df, cut_off_offset=cut_off_offset, period=period)
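The actual split logic lives in split_df_for_ts_modelling_offset; as a hedged illustration of the offset idea, it could be as simple as keeping the last cut_off_offset rows as the test set (the helper's real behaviour, e.g. any resampling to the given period, may differ).
# Hedged sketch of an offset-based split (not the project's actual helper):
# the last `cut_off_offset` rows become the test set, the rest the training set.
def split_by_offset_sketch(data, cut_off_offset):
    train = data.iloc[:-cut_off_offset]
    test = data.iloc[-cut_off_offset:]
    return train, test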
In statistical time series models, fitting the model means estimating its parameters. In the case of the AR model, this amounts to choosing the number of autocorrelated lags and estimating a coefficient for each of them.
%%time
# Train the model
model = AR(df_train)
model_fitted = model.fit()
In the above, we create and fit our AR() model on the training dataset. The AR() function estimates the number of lags to use for the prediction. Once you've fit the model, you can look at the chosen lag and the parameters of the model using some simple print statements.
model_fitted
print(f'The lag value chosen is: {model_fitted.k_ar}')
print(f'The coefficients of the model are:\n {model_fitted.params}')
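For intuition, the hedged sketch below shows how those coefficients turn into a one-step-ahead forecast; it assumes the parameters are ordered [intercept, phi_1, ..., phi_k], which is how the (legacy) AR results object lays them out when fitted with a constant trend.
# Hedged sketch: manual one-step-ahead forecast from fitted AR coefficients.
# Assumes params are ordered [intercept, phi_1, ..., phi_k].
def predict_next_value(params, history):
    """history: 1-D array of observations, oldest first; uses the last k values."""
    params = np.asarray(params)
    k = len(params) - 1                       # number of lags
    intercept, coefs = params[0], params[1:]
    lags = np.asarray(history)[-k:][::-1]     # newest value pairs with phi_1
    return intercept + np.dot(coefs, lags)

# Illustrative usage:
# next_val = predict_next_value(model_fitted.params, df_train['pm25'].values)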
# Evaluate model quality
import statsmodels.api as sm
res = model_fitted.resid
fig,ax = plt.subplots(2,1,figsize=(15,8))
fig = sm.graphics.tsa.plot_acf(res, lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(res, lags=50, ax=ax[1])
plt.show();
%%time
# Validate result on test
# Creates 365*24 folds (each a 24-point forecast) for hourly data, or 365 folds (7-point forecasts) for daily data
fold_results = walk_forward_ts_model_validation(data=df,
col_name='pm25',
model_params=model_fitted.params[:],
cut_off_offset=cut_off_offset,
n_pred_points=n_pred_points,
n_folds=-1)
print(len(fold_results))
print(fold_results[0])
8760
observed predicted error abs_error
Datetime
2018-01-01 01:00:00 84.90085 18.256931 66.643919 66.643919
2018-01-01 02:00:00 67.44355 15.665441 51.778109 51.778109
2018-01-01 03:00:00 76.66860 15.485031 61.183569 61.183569
2018-01-01 04:00:00 64.96090 15.694880 49.266020 49.266020
2018-01-01 05:00:00 64.14875 17.793727 46.355023 46.355023
2018-01-01 06:00:00 76.06410 19.353774 56.710326 56.710326
2018-01-01 07:00:00 69.19180 20.815613 48.376187 48.376187
2018-01-01 08:00:00 48.51735 20.968488 27.548862 27.548862
2018-01-01 09:00:00 45.92715 20.423024 25.504126 25.504126
2018-01-01 10:00:00 44.19595 18.709182 25.486768 25.486768
2018-01-01 11:00:00 39.27865 17.533684 21.744966 21.744966
2018-01-01 12:00:00 32.61625 16.494254 16.121996 16.121996
2018-01-01 13:00:00 34.09440 16.915910 17.178490 17.178490
2018-01-01 14:00:00 33.51795 17.853081 15.664869 15.664869
2018-01-01 15:00:00 41.24420 19.380832 21.863368 21.863368
2018-01-01 16:00:00 49.08765 21.370328 27.717322 27.717322
2018-01-01 17:00:00 51.24645 24.365030 26.881420 26.881420
2018-01-01 18:00:00 41.64520 27.020634 14.624566 14.624566
2018-01-01 19:00:00 40.98405 29.396926 11.587124 11.587124
2018-01-01 20:00:00 45.36865 30.724681 14.643969 14.643969
2018-01-01 21:00:00 58.24830 31.317142 26.931158 26.931158
2018-01-01 22:00:00 63.21335 30.628366 32.584984 32.584984
2018-01-01 23:00:00 78.28435 29.582203 48.702147 48.702147
2018-01-02 00:00:00 91.30400 27.710736 63.593264 63.593264
CPU times: user 20min 52s, sys: 5.08 s, total: 20min 57s
Wall time: 21min 1s
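walk_forward_ts_model_validation comes from the project's measure module; as a rough, hedged sketch of the walk-forward idea (one forecast origin per test point, predicting recursively up to n_pred_points ahead with the frozen coefficients), it might look like the following. The helper's actual internals and names may differ.
# Hedged sketch of walk-forward validation with frozen AR coefficients.
# For each origin in the test window, forecast n_pred_points ahead by feeding
# every prediction back into the lag window (recursive multi-step forecasting).
def walk_forward_sketch(series, params, cut_off_offset, n_pred_points):
    params = np.asarray(params)
    k = len(params) - 1
    folds = []
    start = len(series) - cut_off_offset
    for origin in range(start, len(series) - n_pred_points):
        history = list(series[:origin])
        preds = []
        for _ in range(n_pred_points):
            lags = np.array(history[-k:][::-1])
            preds.append(params[0] + np.dot(params[1:], lags))
            history.append(preds[-1])
        observed = series[origin:origin + n_pred_points]
        folds.append(pd.DataFrame({'observed': observed, 'predicted': preds}))
    return folds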
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ts_{model_name}_results_h_{timestamp}.joblib'
dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
[4.668825755494505, 6.710075125915751, 8.288377506868132, 9.491991895604396, 10.453700629578753, 11.235358768315018, 11.870429819139193, 12.321835634157507, 12.740681295787546, 13.05485232371795, 13.327539445970695, 13.544799782509157, 13.689832623626375, 13.799327701465204, 13.90105190018315, 13.993651717032966, 14.069570936355312, 14.149946726190477, 14.224708722527474, 14.267825125915751, 14.329140716575091, 14.41586407967033, 14.508073282967032, 14.631110141941392]
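For reference, the per-horizon aggregation can be reproduced roughly from the fold frames shown earlier; the sketch below is hedged, assumes each fold has 'observed' and 'predicted' columns ordered by forecast horizon, and the project's exact aggregation may differ.
# Hedged sketch: error for the h-th forecast point aggregated across all folds.
# Assumes each fold is a DataFrame with 'observed' and 'predicted' columns
# ordered by forecast horizon, as in the fold printed above.
def rmse_per_horizon_sketch(fold_results, n_pred_points):
    rmse_per_point = []
    for h in range(n_pred_points):
        errors = [fold['observed'].iloc[h] - fold['predicted'].iloc[h]
                  for fold in fold_results if len(fold) > h]
        rmse_per_point.append(np.sqrt(np.mean(np.square(errors))))
    return rmse_per_point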
# Show forecasts for n-th point in the future
show_n_points_of_forecasts = [1, 12, 24] # for hourly data
#show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-02-16'), ('2018-06-01', '2018-06-16')]
# for daily data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
#base_file_path = f'images/pm25_obs_vs_pred_365_d_ts_{model_name}' # for daily data
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
start_end_dates=start_end_dates,
plot_types=plot_types,
base_file_path=base_file_path,
fold_results=fold_results,
n_pred_points=n_pred_points,
cut_off_offset=cut_off_offset,
model_name=model_name,
timestamp=timestamp)
dfd = get_pm25_data_for_modelling('ts', 'd')
dfd.head()
df = dfd.copy()
# Define first past/future cutoff point in time offset (1 year of data)
#cut_off_offset = 365*24 # for hourly data
cut_off_offset = 365 # for daily data
# Predict for X points
#n_pred_points = 24 # for hourly data
n_pred_points = 7 # for daily data
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
#period = 'H' # for hourly data
period = 'D' # for daily data
# Create train / test datasets (with the offset of cut_off_offset datapoints from the end)
df_train, df_test = split_df_for_ts_modelling_offset(data=df, cut_off_offset=cut_off_offset, period=period)
%%time
# Train the model
model = AR(df_train)
model_fitted = model.fit()
model_fitted
print(f'The lag value chosen is: {model_fitted.k_ar}')
print(f'The coefficients of the model are:\n {model_fitted.params}')
# Evaluate model quality
import statsmodels.api as sm
res = model_fitted.resid
fig,ax = plt.subplots(2,1,figsize=(15,8))
fig = sm.graphics.tsa.plot_acf(res, lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(res, lags=50, ax=ax[1])
plt.show();
%%time
# Validate result on test
# Creates 365*24 folds (each a 24-point forecast) for hourly data, or 365 folds (7-point forecasts) for daily data
fold_results = walk_forward_ts_model_validation(data=df,
col_name='pm25',
model_params=model_fitted.params[:],
cut_off_offset=cut_off_offset,
n_pred_points=n_pred_points,
n_folds=-1)
print(len(fold_results))
print(fold_results[0])
365
observed predicted error abs_error
Datetime
2018-01-02 67.991848 49.379772 18.612076 18.612076
2018-01-03 16.026950 46.310684 30.283734 30.283734
2018-01-04 14.590020 42.722729 28.132708 28.132708
2018-01-05 22.094854 36.818753 14.723899 14.723899
2018-01-06 62.504217 39.552410 22.951806 22.951806
2018-01-07 43.929804 44.885109 0.955304 0.955304
2018-01-08 22.088192 47.602331 25.514139 25.514139
CPU times: user 7.94 s, sys: 47 ms, total: 7.98 s
Wall time: 8 s
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ts_{model_name}_results_d_{timestamp}.joblib'
dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
[9.407572067039105, 12.302882402234635, 12.889160614525139, 13.248969832402235, 13.538120949720671, 13.71186312849162, 13.83969469273743]
# Show forecasts for n-th point in the future
#show_n_points_of_forecasts = [1, 12, 24] # for hourly data
show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-02-16'), ('2018-06-01', '2018-06-16')]
# for daily data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
#base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
base_file_path = f'images/pm25_obs_vs_pred_365_d_ts_{model_name}' # for daily data
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
start_end_dates=start_end_dates,
plot_types=plot_types,
base_file_path=base_file_path,
fold_results=fold_results,
n_pred_points=n_pred_points,
cut_off_offset=cut_off_offset,
model_name=model_name,
timestamp=timestamp)