From: https://github.com/ksatola
Version: 0.1.0
We will prepare the data for modelling. The data will be adjusted for several kinds of forecasts, and we will then save the resulting dataframes.
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../src')
import pandas as pd
import numpy as np
from model import (
load_data,
calculate_season,
build_datetime_features,
get_df_for_lags_columns
)
from plot import (
plot_ts_corr,
plot_stl
)
from stats import (
adfuller_test
)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('precision', 5)
# Paths to the raw hourly PM dataset.
data_path = 'data/'
data_file = data_path + 'dfpm2008_2018.csv'

df = load_data(data_file)
df.index

# Convert the index to datetime.
df.index = pd.to_datetime(df.index)

# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# NOTE: asfreq() returns a new object; this line only displays the minutely
# index for inspection — it does not modify df. 'min' is the current alias
# for minute frequency ('T' is deprecated in pandas >= 2.2).
df.asfreq('min').index
df.head()

# We will only need PM2.5 for modelling, so drop the PM10 column.
# (columns= already determines the axis; passing axis= as well is redundant.)
df.drop(columns=['pm10'], inplace=True)
df.head()

# Persist the hourly series to HDF5 and read it back as a sanity check.
data_file_hdf = data_path + 'dfpm25_2008-2018_hourly.hdf'
df.to_hdf(data_file_hdf, key='df', mode='w')
df = pd.read_hdf(path_or_buf=data_file_hdf, key="df")
print(f'Dataframe size: {df.shape}')
df.head()

# Resample the data to daily frequency using the mean of hourly values.
df_daily = df.resample(rule='D').mean()
df_daily.head()

# Persist the daily series to HDF5 and read it back as a sanity check.
data_file_hdf = data_path + 'dfpm25_2008-2018_daily.hdf'
df_daily.to_hdf(data_file_hdf, key='df', mode='w')
df = pd.read_hdf(path_or_buf=data_file_hdf, key="df")
print(f'Dataframe size: {df.shape}')
df.head()
Feature engineering
is the process of transforming raw data into features that better represent the underlying problem to the predictive models, resulting in improved model accuracy on unseen data. For machine learning models, instead of dealing with a datetime index, we will create additional time-based features
and include them in the analytical view. In addition, we will perform time series decomposition
(removing trend and seasonality) and check the ACF for the most correlated lags to be chosen as features. In the final prediction mechanism, the values predicted on the stationary component can be transformed back to the original values by adding back the previously removed trend and seasonality components.
# Reload the raw data and keep only PM2.5 for ML modelling.
df = load_data(data_file)
# (columns= already determines the axis; passing axis= as well is redundant.)
df.drop(columns=['pm10'], inplace=True)
df.head()

# Create 10 hours of lag values to predict the current observation
# (lags with correlation coefficient > 0.5).
df24h = get_df_for_lags_columns(data=df, col_name='pm25', n_lags=10, remove_nans=True)
df24h

corr = df24h.corr(method='pearson')
corr[0:1]

# Remove the Datetime index and derive date-related features from it.
df24h = build_datetime_features(df24h, 'Datetime')
df24h.head()

# Persist the hourly-lags analytical view and read it back as a sanity check.
data_file_hdf = data_path + 'dfpm25_2008-2018_ml_24hours_lags.hdf'
df24h.to_hdf(data_file_hdf, key='df', mode='w')
df24h = pd.read_hdf(path_or_buf=data_file_hdf, key="df")
# BUG FIX: report the shape of the dataframe just loaded (df24h), not df —
# the original printed df.shape here, mirroring L93's correct df7d pattern.
print(f'Dataframe size: {df24h.shape}')
df24h.head()

# Convert the index to datetime and resample PM2.5 to daily means.
df.index = pd.to_datetime(df.index)
df_daily = df[['pm25']].resample(rule='D').mean()
df_daily.head()

# Create 10 days of lag values to inspect the correlation coefficients.
df7d = get_df_for_lags_columns(data=df_daily, col_name='pm25', n_lags=10, remove_nans=True)
df7d

corr = df7d.corr(method='pearson')
corr[0:1]

# Keep the lags with correlation coefficient > 0.4 for the daily view.
# NOTE(review): the original comment said "3 days" but n_lags=4 is passed —
# confirm whether get_df_for_lags_columns counts the current observation.
df7d = get_df_for_lags_columns(data=df_daily, col_name='pm25', n_lags=4, remove_nans=True)
df7d

# Remove the Datetime index and derive date-related features from it.
df7d = build_datetime_features(df7d, 'Datetime')
df7d.head()

# Persist the daily-lags analytical view and read it back as a sanity check.
data_file_hdf = data_path + 'dfpm25_2008-2018_ml_7days_lags.hdf'
df7d.to_hdf(data_file_hdf, key='df', mode='w')
df7d = pd.read_hdf(path_or_buf=data_file_hdf, key="df")
print(f'Dataframe size: {df7d.shape}')
df7d.head()