From: https://github.com/ksatola
Version: 0.1.0
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../src')
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator
from model import (
get_pm25_data_for_modelling,
get_models_for_regression,
split_df_for_ml_modelling_offset,
perform_grid_search_cv,
perform_random_search_cv
)
from measure import (
get_mean_folds_rmse_for_n_prediction_points,
score_ml_models,
prepare_data_for_visualization,
walk_forward_ml_model_validation
)
from plot import (
visualize_results
)
from utils import (
get_datetime_identifier
)
from logger import logger
from datetime import datetime
TODO notes:
Write a scikit-learn introduction to regression; list the functions used (see below).
Hyper-parameter tuning is based on CV and the training data; the final check on the test data is still TODO.
In the thesis, describe the general ML approach: simple models first, then the best hyper-parameters for them, and a super learner built on top of those (see https://www.degruyter.com/view/journals/sagmb/6/1/article-sagmb.2007.6.1.1309.xml.xml).
Score the models on the test set! Run a final test of the selected ML models on the test set and compute RMSE as a prediction simulation; if linear regression turns out best, check the OLS assumptions for the dataset.
dfh = get_pm25_data_for_modelling('ml', 'h')
dfh.head()
df = dfh[:]
df.shape
df.head()
# Define first past/future cutoff point in time offset (1 year of data)
cut_off_offset = 365*24 # for hourly data
#cut_off_offset = 365 # for daily data
# For this method to work properly, observations in the data frame must be ordered by time (the greater the index, the more recent the observation)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
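split_df_for_ml_modelling_offset is this project's own helper (src/model.py). As a rough illustration only, an offset-based split of a time-ordered, lag-feature data frame could look like the sketch below; the column layout (a 't' target column plus lag columns) is an assumption taken from this notebook, not the project's implementation.
# Illustrative sketch only - not the project's split_df_for_ml_modelling_offset
def split_by_offset_sketch(data, target_col, cut_off_offset):
    # Assumes rows are ordered by time; the last cut_off_offset rows become the test set
    X = data.drop(columns=[target_col])
    y = data[target_col]
    X_train, X_test = X.iloc[:-cut_off_offset], X.iloc[-cut_off_offset:]
    y_train, y_test = y.iloc[:-cut_off_offset], y.iloc[-cut_off_offset:]
    return X_train, X_test, y_train, y_test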
# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []
for model in reg_models:
item = (type(model).__name__, model)
models.append(item)
print(item)
%%time
# Perform initial ranking - choose best performing models for validation
# We assume models will predict t+1 best; for subsequent time steps the error will grow. That is why the initial ranking uses a one-step-ahead prediction only
scores, results, names = score_ml_models(X_train=X_train,
y_train=y_train,
models=models,
n_splits = 5,
metric='neg_root_mean_squared_error',
metric_label="RMSE",
seed=123)
for score in scores:
print(score)
LinearRegression, RMSE 8.454518896700154, (std. dev. 1.0553480373018875)
ElasticNet, RMSE 8.466827600091218, (std. dev. 1.067885147340825)
SVR, RMSE 11.326321281986267, (std. dev. 3.2873896157758)
DecisionTreeRegressor, RMSE 11.935730744140105, (std. dev. 1.212603561790327)
KNeighborsRegressor, RMSE 10.866634500393763, (std. dev. 1.608640951710884)
AdaBoostRegressor, RMSE 16.822330057803956, (std. dev. 2.3545417407828704)
BaggingRegressor, RMSE 8.826528266269595, (std. dev. 1.1106023504757678)
RandomForestRegressor, RMSE 8.805528004207913, (std. dev. 1.118664501467662)
ExtraTreesRegressor, RMSE 8.818717598661783, (std. dev. 1.1022531965880644)
CPU times: user 19min 39s, sys: 4 s, total: 19min 43s
Wall time: 19min 40s
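score_ml_models comes from the project's measure module. A conceptually similar ranking can be reproduced with cross_val_score; the sketch below is an assumption about the approach (for example, whether the project uses KFold or a time-series split internally is not restated here), not the project's implementation.
# Sketch: cross-validated RMSE ranking of several models (not the project's score_ml_models)
from sklearn.model_selection import KFold, cross_val_score

def rank_models_sketch(X_train, y_train, models, n_splits=5, seed=123):
    ranking = []
    for name, estimator in models:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        neg_rmse = cross_val_score(estimator, X_train, y_train, cv=cv,
                                   scoring='neg_root_mean_squared_error')
        # scores are negative RMSE, so flip the sign for reporting
        ranking.append((name, -neg_rmse.mean(), neg_rmse.std()))
    return sorted(ranking, key=lambda r: r[1])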
# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_h.png')
plt.show();
Hyper-parameters
Hyper-parameters are parameters that are not directly learnt within estimators. It is possible and recommended to search the hyper-parameter space for the best cross-validation score. While using a grid of parameter settings is currently the most widely used method for parameter optimization (GridSearchCV), other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. Specifying how parameters should be sampled is done using a dictionary, very similar to specifying parameters for GridSearchCV. Additionally, a computation budget, being the number of sampled candidates or sampling iterations, is specified using the n_iter parameter. For each parameter, either a distribution over possible values or a list of discrete choices (which will be sampled uniformly) can be specified: loguniform(1, 100) can be used instead of [1, 10, 100].
https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624
https://scikit-learn.org/stable/modules/grid_search.html#grid-search
Hyper-parameters are parameters that are not directly learnt within estimators. In scikit-learn they are passed as arguments to the constructor of the estimator classes. Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.
Some models allow for specialized, efficient parameter search strategies, outlined below. Two generic approaches to sampling search candidates are provided in scikit-learn: for given values, GridSearchCV exhaustively considers all parameter combinations, while RandomizedSearchCV can sample a given number of candidates from a parameter space with a specified distribution.
Note that it is common that a small subset of those parameters can have a large impact on the predictive or computation performance of the model while others can be left to their default values. It is recommended to read the docstring of the estimator class to get a finer understanding of their expected behavior, possibly by reading the enclosed reference to the literature.
While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:
A budget can be chosen independent of the number of parameters and possible values.
Adding parameters that do not influence the performance does not decrease efficiency.
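As a minimal sketch of the idea (ElasticNet and the parameter ranges below are used purely for illustration, they are not the grids tuned later in this notebook), a randomized search with a distribution over alpha and a fixed n_iter budget could look like this:
# Sketch: randomized hyper-parameter search with a fixed computation budget (n_iter)
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "alpha": loguniform(1e-4, 1e0),        # sampled from a continuous log-uniform distribution
    "l1_ratio": np.arange(0.0, 1.0, 0.1),  # discrete choices, sampled uniformly
}
random_search = RandomizedSearchCV(ElasticNet(), param_distributions, n_iter=20,
                                   scoring='neg_root_mean_squared_error',
                                   cv=5, random_state=123)
# random_search.fit(X_train, y_train); random_search.best_params_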
https://scikit-learn.org/stable/modules/cross_validation.html
Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model that would just repeat the labels of the samples that it has just seen would have a perfect score but would fail to predict anything useful on yet-unseen data. This situation is called overfitting. To avoid it, it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test.
When evaluating different settings (“hyperparameters”) for estimators, such as the C setting that must be manually set for an SVM, there is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. This way, knowledge about the test set can “leak” into the model and evaluation metrics no longer report on generalization performance. To solve this problem, yet another part of the dataset can be held out as a so-called “validation set”: training proceeds on the training set, after which evaluation is done on the validation set, and when the experiment seems to be successful, final evaluation can be done on the test set.
However, by partitioning the available data into three sets, we drastically reduce the number of samples which can be used for learning the model, and the results can depend on a particular random choice for the pair of (train, validation) sets.
A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:
A model is trained using k-1 of the folds as training data;
the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).
The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary validation set), which is a major advantage in problems such as inverse inference where the number of samples is very small.
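For example, a plain k-fold cross-validation of a single estimator reduces to one call (a sketch only; the walk-forward validation used later in this notebook is a different, time-aware procedure):
# Sketch: 5-fold cross-validation of one estimator, reporting mean RMSE over the folds
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(LinearRegression(), X_train, y_train,
                            cv=5, scoring='neg_root_mean_squared_error')
print(-cv_scores.mean(), cv_scores.std())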
TODO: add the k-fold cross-validation figure from this page and cite the source: scikit-learn documentation.
https://scikit-learn.org/stable/modules/compose.html
Transformers are usually combined with classifiers, regressors or other estimators to build a composite estimator. The most common tool is a Pipeline.
Pipeline can be used to chain multiple estimators into one. This is useful as there is often a fixed sequence of steps in processing the data, for example feature selection, normalization and classification. Pipeline serves multiple purposes here:
Convenience and encapsulation: you only have to call fit and predict once on your data to fit a whole sequence of estimators.
Joint parameter selection: you can grid search over parameters of all estimators in the pipeline at once.
Safety: pipelines help avoid leaking statistics from your test data into the trained model in cross-validation, by ensuring that the same samples are used to train the transformers and predictors.
All estimators in a pipeline, except the last one, must be transformers (i.e. must have a transform method). The last estimator may be any type (transformer, classifier, etc.).
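A minimal sketch of such a composite estimator, with the pipeline's hyper-parameters grid-searched jointly (the step names and the small parameter grid below are arbitrary examples, not values used elsewhere in this notebook):
# Sketch: scaling + regression pipeline; step parameters are addressed as <step>__<param>
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('scale', StandardScaler()), ('reg', ElasticNet())])
grid = GridSearchCV(pipe, {'reg__alpha': [0.001, 0.01, 0.1]},
                    scoring='neg_root_mean_squared_error', cv=5)
# grid.fit(X_train, y_train); grid.best_params_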
https://scikit-learn.org/stable/modules/model_persistence.html
After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. The following section gives you an example of how to persist a model with pickle.
pickle (and joblib by extension) has some issues regarding maintainability and security. Because of this:
Never unpickle untrusted data as it could lead to malicious code being executed upon loading.
While models saved using one version of scikit-learn might load in other versions, this is entirely unsupported and inadvisable. It should also be kept in mind that operations performed on such data could give different and unexpected results.
In order to rebuild a similar model with future versions of scikit-learn, additional metadata should be saved along the pickled model:
The training data, e.g. a reference to an immutable snapshot
The python source code used to generate the model
The versions of scikit-learn and its dependencies
The cross validation score obtained on the training data
This should make it possible to check that the cross-validation score is in the same range as before.
Since a model internal representation may be different on two different architectures, dumping a model on one architecture and loading it on another architecture is not supported.
more: https://pyvideo.org/pycon-us-2014/pickles-are-for-delis-not-software.html
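A minimal sketch of persisting a fitted estimator together with that metadata (the file path, the data reference and the score fields below are placeholders, not project conventions):
# Sketch: persist a fitted model plus the metadata recommended above
from joblib import dump, load
import sklearn

fitted_model = LinearRegression().fit(X_train, y_train)  # any fitted estimator
artifact = {
    'model': fitted_model,
    'sklearn_version': sklearn.__version__,
    'training_data': 'reference to an immutable snapshot of the training data',  # placeholder
    'cv_rmse': None,  # fill in the cross-validation score obtained on the training data
}
dump(artifact, 'results/model_with_metadata.joblib')
restored = load('results/model_with_metadata.joblib')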
Based on the best median RMSE results, the following algorithms seem to work best with the hourly PM2.5 data: LinearRegression and ElasticNet (RMSE below the ~8.5 µg/m³ cutoff), plus the best non-linear algorithm, RandomForestRegressor.
After hyper-parameter tuning, they will be validated using a common procedure.
models[0][0], models[0][1].get_params().keys()
%%time
# LinearRegression
param_grid = {
"fit_intercept": [True, False],
"normalize": [True, False],
"n_jobs": [-1]
}
model = models[0][1]
%%time
perform_grid_search_cv(X_train=X_train,
y_train=y_train,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 22:36:53 | INFO: Best: -8.417609654057864 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': True}
CPU times: user 5.01 s, sys: 1.09 s, total: 6.1 s
Wall time: 925 ms
models[1][0], models[1][1].get_params().keys()
%%time
# ElasticNet
param_grid = {
"alpha": [0.0001, 0.001, 0.01, 0.1, 1],
"l1_ratio": np.arange(0.0, 1.0, 0.1),
"max_iter": [1, 10, 100, 500, 1000],
"fit_intercept": [True, False],
"selection": ["cyclic", "random"]
}
model = models[1][1]
%%time
perform_grid_search_cv(X_train=X_train,
y_train=y_train,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
regression.py | 70 | perform_grid_search_cv | 12-Jun-20 23:03:26 | INFO: Best: -8.420425074217084 using {'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}
CPU times: user 1h 40min 6s, sys: 13min 37s, total: 1h 53min 44s
Wall time: 23min 35s
models[7][0], models[7][1].get_params().keys()
%%time
# RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset
# and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples
# parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
"criterion": ['mse', 'mae'],
#"min_samples_split": [2, 8, 16],
#"min_samples_leaf": [1, 2, 4],
"n_estimators": [100, 200],
"n_jobs": [-1]
}
model = models[7][1]
# Limit training data to the last 2 years of hourly data (for performance reasons)
X_train_limited = X_train[-365*24*2:]
y_train_limited = y_train[-365*24*2:]
X_train_limited.shape, y_train_limited.shape
%%time
# Random search, to speed-up
perform_random_search_cv(X_train=X_train_limited,
y_train=y_train_limited,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
For the entire dataset, the random grid search ran for more than 6 hours with no results. It was stopped and the training data set was limited to the last 2 years of observations (17,520 rows compared to 87,618) as X_train_limited, y_train_limited.
23:30 started with the full training dataset
6:15 interrupted manually - no visible progress
6:23 started with the limited dataset
10:41 interrupted manually - no visible progress (limited data and limited parameter grid) -> I will use standard settings
10:42 started with limited historical data and a limited parameter grid
11:15 decision: do not tune RandomForestRegressor hyper-parameters, use it with default settings
best_models = [
LinearRegression(**{'fit_intercept': True, 'n_jobs': -1, 'normalize': True}),
ElasticNet(**{'alpha': 0.001, 'fit_intercept': True, 'l1_ratio': 0.9, 'max_iter': 500, 'selection': 'random'}),
RandomForestRegressor()]
models = []
for model in best_models:
item = (type(model).__name__, model)
models.append(item)
print(item)
('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1,
                 normalize=True))
('ElasticNet', ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.9,
max_iter=500, normalize=False, positive=False, precompute=False,
random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None, oob_score=False,
random_state=None, verbose=0, warm_start=False))
# number of t-n columns in a dataset
n_lags_in_dataset = 10 # for hourly data
#n_lags_in_dataset = 4 # for daily data
# Set datetime format for index
dt_format = "%Y-%m-%d %H:%M:%S" # for hourly data
#dt_format = "%Y-%m-%d" # for daily data
# Create train and validate sets
train_test_split_position = int(len(df)-cut_off_offset)
# Create as many folds as remain until the end of the known data
n_folds = len(df) #train_test_split_position+3
# Predict for X points
n_pred_points = 24 # for hourly data
#n_pred_points = 7 # for daily data
%%time
# Validate results on the test set
# Creates 365*24*24 models for hourly data, or 365*7 models for daily data
# Model 1: LinearRegression
model_name = models[0][0]
model = models[0][1]
# Model 2: ElasticNet
#model_name = models[1][0]
#model = models[1][1]
# Model 3: RandomForestRegressor
#model_name = models[7][0]
#model = models[7][1]
fold_results = walk_forward_ml_model_validation(data=df,
model=model,
target_col='t',
cut_off_offset=cut_off_offset,
n_pred_points=n_pred_points,
n_folds=-1,
n_lags = n_lags_in_dataset)
print(len(fold_results))
print(fold_results[0])
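walk_forward_ml_model_validation is the project's own routine from the measure module. In outline, walk-forward validation refits the model at every fold position after the cut-off and predicts the next n_pred_points; the sketch below is an assumption about that outline, and the real fold bookkeeping in the project code may differ.
# Sketch of walk-forward validation (not the project's walk_forward_ml_model_validation)
def walk_forward_sketch(X, y, estimator, cut_off_offset, n_pred_points):
    fold_results = []
    start = len(X) - cut_off_offset
    for i in range(start, len(X) - n_pred_points + 1):
        estimator.fit(X.iloc[:i], y.iloc[:i])                    # train on all data up to the fold
        preds = estimator.predict(X.iloc[i:i + n_pred_points])   # predict the next n_pred_points
        observed = y.iloc[i:i + n_pred_points].to_numpy()
        fold_results.append(pd.DataFrame({
            'observed': observed,
            'predicted': preds,
            'error': preds - observed,
            'abs_error': np.abs(preds - observed),
        }, index=y.iloc[i:i + n_pred_points].index))
    return fold_results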
from joblib import dump, load
model_name = models[0][0]
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ml_{model_name}_results_h_{timestamp}.joblib'
dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
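get_mean_folds_rmse_for_n_prediction_points is a project helper from the measure module; conceptually it aggregates the error at each forecast horizon (1..n_pred_points) over all folds, roughly like the sketch below (whether the project averages RMSE or absolute errors per horizon is not restated here):
# Sketch: RMSE per forecast horizon, aggregated over all walk-forward folds
def mean_rmse_per_horizon_sketch(fold_results, n_pred_points):
    rmse_per_point = []
    for k in range(n_pred_points):
        errors = [fold['error'].iloc[k] for fold in fold_results if len(fold) > k]
        rmse_per_point.append(float(np.sqrt(np.mean(np.square(errors)))))
    return rmse_per_point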
# Show forecasts for n-th point in the future
show_n_points_of_forecasts = [1, 12, 24] # for hourly data
#show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
#base_file_path = f'images/pm25_obs_vs_pred_365_d_ml_{model_name}' # for daily data
fold_results[0].index
# We need to convert to datetime index for plotting
# Replace integer-based index with datetime-based index
# We drop the first n_lags_in_dataset rows, because rows containing NaNs were removed
# when the dataset with t-n lags was created
df_datetime = get_pm25_data_for_modelling('ts', 'h')[n_lags_in_dataset:]
#df_datetime.head()
#df_datetime[0:1].index.strftime(dt_format)[0]
# Integer to datetime index using a mapper (ts version of the dataset)
for i in range(0, len(fold_results)):
if not isinstance(fold_results[i].index, pd.DatetimeIndex):
# Create a table of datetime indexes corresponding to integer indexes
datetime_indexes = []
integer_indexes = fold_results[i].index
for ix in integer_indexes:
datetime_indexes.append(df_datetime[ix:ix+1].index.strftime(dt_format)[0])
print(datetime_indexes)
# Apply datetime_indexes as index for fold data
fold_results[i].index = pd.to_datetime(datetime_indexes)
fold_results[0].index
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
start_end_dates=start_end_dates,
plot_types=plot_types,
base_file_path=base_file_path,
fold_results=fold_results,
n_pred_points=n_pred_points,
cut_off_offset=cut_off_offset,
model_name=model_name,
timestamp=timestamp)
%%time
#LinearRegression
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation
param_grid = {
"fit_intercept": [True, False],
"normalize": [True, False],
"n_jobs": [-1]
}
model = models[0][1]
%%time
#ElasticNet
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
# Linear regression with combined L1 and L2 priors as regularizer.
param_grid = {
"alpha": [0.0001, 0.001, 0.01, 0.1, 1],
"l1_ratio": np.arange(0.0, 1.0, 0.1),
"max_iter": [1, 10, 100, 1000],
"fit_intercept": [True, False],
"selection": ["cyclic", "random"]
}
model = models[1][1]
# Excluded
%%time
#DecisionTreeRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
# Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
param_grid = {
"criterion": ["mse", "friedman_mse", "mae"],
"splitter": ["best", "random"],
"max_depth": range(2, 16, 2),
"min_samples_split": range(2, 16, 2),
"min_samples_leaf": range(2, 16, 2),
"max_features": ["auto", "sqrt", "log2"],
"ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10, 100, 1000]
}
%%time
#BaggingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://www.programcreek.com/python/example/85938/sklearn.ensemble.BaggingRegressor
# A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.
# The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.
param_grid = {
"base_estimator": [models[0][1]],
"n_estimators": [100, 200, 500, 1000, 5000, 10000],
"bootstrap": [True, False],
"n_jobs": [-1]
}
%%time
#RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
"n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
#"criterion": ["mse", "mae"],
#"bootstrap": [True, False],
"ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
"max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
"max_features": ["auto", "sqrt", "log2"],
"min_samples_leaf": [1, 2, 4],
#"min_samples_split": range(2, 16, 2),
"n_jobs": [-1]
}
model = models[7][1]
%%time
#ExtraTreesRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
# https://www.programcreek.com/python/example/102434/sklearn.ensemble.ExtraTreesRegressor
# This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
param_grid = {
"n_estimators": [100, 200, 500],#, 1000, 5000, 10000],
#"criterion": ["mse", "mae"],
#"bootstrap": [True, False],
"ccp_alpha": [0.001, 0.01, 0.1, 0, 1, 10],#, 100, 1000],
"max_depth": [10, 20, 30, 40, None],#50, 60, 70, 80, 90, 100, None],
#"max_features": ["auto", "sqrt", "log2"],
"min_samples_leaf": [1, 2, 4],
#"min_samples_split": range(2, 16, 2),
"n_jobs": [-1]
}
model = models[8][1]
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
dfd = get_pm25_data_for_modelling('ml', 'd')
dfd.head()
df = dfd.copy()
df.head()
# Define first past/future cutoff point in time offset (1 year of data)
#cut_off_offset = 365*24 # for hourly data
cut_off_offset = 365 # for daily data
# For this method to work properly, observations in the data frame must be ordered by time (the greater the index, the more recent the observation)
# Train test split
X_train, X_test, y_train, y_test = split_df_for_ml_modelling_offset(data=df, target_col='t', cut_off_offset=cut_off_offset)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# Define linear and non-linear regression models in scope
reg_models = get_models_for_regression()
models = []
for model in reg_models:
item = (type(model).__name__, model)
models.append(item)
print(item)
%%time
# Perform initial ranking - choose best performing models for validation
# We assume models will predict t+1 best; for subsequent time steps the error will grow. That is why the initial ranking uses a one-step-ahead prediction only
scores, results, names = score_ml_models(X_train=X_train,
y_train=y_train,
models=models,
n_splits = 5,
metric='neg_root_mean_squared_error',
metric_label="RMSE",
seed=123)
for score in scores:
print(score)
LinearRegression, RMSE 21.71519880662688, (std. dev. 2.9273799166199206)
ElasticNet, RMSE 21.830784202119624, (std. dev. 2.9573842361855087)
SVR, RMSE 24.045745977510276, (std. dev. 4.441156440786244)
DecisionTreeRegressor, RMSE 31.147068453565886, (std. dev. 2.809053790566411)
KNeighborsRegressor, RMSE 23.707382569743043, (std. dev. 3.3448167988995414)
AdaBoostRegressor, RMSE 27.586419082358997, (std. dev. 3.418986721465014)
BaggingRegressor, RMSE 23.431554189892402, (std. dev. 3.5302630679381473)
RandomForestRegressor, RMSE 23.217160647978343, (std. dev. 3.3641116644360025)
ExtraTreesRegressor, RMSE 24.202871671021512, (std. dev. 3.2927586658114762)
CPU times: user 4 s, sys: 120 ms, total: 4.12 s
Wall time: 3.71 s
# Compare Algorithms
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Algorithms Comparison - RMSE')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.savefig('images/ml_comparison_rmse_d.png')
plt.show();
Based on the best median RMSE results, the following algorithms seem to work best with the daily PM2.5 data: LinearRegression and ElasticNet (RMSE below the ~22 µg/m³ cutoff), plus the best non-linear algorithm, RandomForestRegressor.
After hyper-parameter tuning, they will be validated using a common procedure.
models[0][0], models[0][1].get_params().keys()
%%time
# LinearRegression
param_grid = {
"fit_intercept": [True, False],
"normalize": [True, False],
"n_jobs": [-1]
}
model = models[0][1]
%%time
perform_grid_search_cv(X_train=X_train,
y_train=y_train,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:34:01 | INFO: Best: -21.68400804843603 using {'fit_intercept': True, 'n_jobs': -1, 'normalize': False}
CPU times: user 215 ms, sys: 23.2 ms, total: 238 ms
Wall time: 83.3 ms
models[1][0], models[1][1].get_params().keys()
%%time
# ElasticNet
param_grid = {
"alpha": [0.0001, 0.001, 0.01, 0.1, 1],
"l1_ratio": np.arange(0.0, 1.0, 0.1),
"max_iter": [1, 10, 100, 500, 1000],
"fit_intercept": [True, False],
"selection": ["cyclic", "random"]
}
model = models[1][1]
%%time
perform_grid_search_cv(X_train=X_train,
y_train=y_train,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
regression.py | 70 | perform_grid_search_cv | 13-Jun-20 11:40:34 | INFO: Best: -21.688890846091354 using {'alpha': 0.0001, 'fit_intercept': True, 'l1_ratio': 0.4, 'max_iter': 10, 'selection': 'random'}
CPU times: user 3min 8s, sys: 20.4 s, total: 3min 28s
Wall time: 52.8 s
models[7][0], models[7][1].get_params().keys()
%%time
#RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset
# and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples
# parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
param_grid = {
"criterion": ['mse', 'mae'],
"min_samples_split": [2, 8, 16],
"min_samples_leaf": [1, 2, 4],
"n_estimators": [100, 200],
"n_jobs": [-1]
}
model = models[7][1]
%%time
# Random, to speed-up
#RandomForestRegressor - Random Search
perform_random_search_cv(X_train=X_train,
y_train=y_train,
model=model,
param_grid=param_grid,
scoring='neg_root_mean_squared_error',
num_folds=6,
seed=123)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 12.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 14.2min finished
regression.py | 107 | perform_random_search_cv | 13-Jun-20 11:58:16 | INFO: Best: -21.81259741714167 using {'n_jobs': -1, 'n_estimators': 200, 'min_samples_split': 16, 'min_samples_leaf': 4, 'criterion': 'mae'}
CPU times: user 2min 20s, sys: 342 ms, total: 2min 20s
Wall time: 14min 27s
best_models = [
LinearRegression(**{'fit_intercept': True, 'n_jobs': -1, 'normalize': False}),
ElasticNet(**{'alpha': 0.0001, 'fit_intercept': True, 'l1_ratio': 0.4, 'max_iter': 10, 'selection': 'random'}),
RandomForestRegressor(**{'n_jobs': -1, 'n_estimators': 200, 'min_samples_split': 16, 'min_samples_leaf': 4, 'criterion': 'mae'}),
]
models = []
for model in best_models:
item = (type(model).__name__, model)
models.append(item)
print(item)
('LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1,
                 normalize=False))
('ElasticNet', ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=0.4,
max_iter=10, normalize=False, positive=False, precompute=False,
random_state=None, selection='random', tol=0.0001, warm_start=False))
('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=4,
min_samples_split=16, min_weight_fraction_leaf=0.0,
n_estimators=200, n_jobs=-1, oob_score=False,
random_state=None, verbose=0, warm_start=False))
# number of t-n columns in a dataset
#n_lags_in_dataset = 10 # for hourly data
n_lags_in_dataset = 4 # for daily data
# Set datetime format for index
#dt_format = "%Y-%m-%d %H:%M:%S" # for hourly data
dt_format = "%Y-%m-%d" # for daily data
# Create train and validate sets
train_test_split_position = int(len(df)-cut_off_offset)
# Create as many folds as remain until the end of the known data
n_folds = len(df) #train_test_split_position+3
# Predict for X points
#n_pred_points = 24 # for hourly data
n_pred_points = 7 # for daily data
%%time
# Validate results on the test set
# Creates 365*24*24 models for hourly data, or 365*7 models for daily data
# Model 1: LinearRegression
#model_name = models[0][0]
#model = models[0][1]
# Model 2: ElasticNet
#model_name = models[1][0]
#model = models[1][1]
# Model 3: RandomForestRegressor
model_name = models[2][0]
model = models[2][1]
fold_results = walk_forward_ml_model_validation(data=df,
model=model,
target_col='t',
cut_off_offset=cut_off_offset,
n_pred_points=n_pred_points,
n_folds=-1,
n_lags = n_lags_in_dataset)
print(len(fold_results))
print(fold_results[0])
# Model 3: RandomForestRegressor
validation.py | 88 | walk_forward_ml_model_validation | 13-Jun-20 12:13:52 | INFO: ('RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=4,
min_samples_split=16, min_weight_fraction_leaf=0.0,
n_estimators=200, n_jobs=-1, oob_score=False,
random_state=None, verbose=0, warm_start=False)) model validation started
Started fold 000000/000365 - 2020-06-13_12-13-52
Started fold 000010/000365 - 2020-06-13_12-32-38
Started fold 000020/000365 - 2020-06-13_12-51-02
Started fold 000030/000365 - 2020-06-13_13-10-25
Started fold 000040/000365 - 2020-06-13_13-29-00
Started fold 000050/000365 - 2020-06-13_13-47-26
Started fold 000060/000365 - 2020-06-13_14-05-20
Started fold 000070/000365 - 2020-06-13_14-23-52
Started fold 000080/000365 - 2020-06-13_14-42-27
Started fold 000090/000365 - 2020-06-13_15-01-26
Started fold 000100/000365 - 2020-06-13_15-20-27
Started fold 000110/000365 - 2020-06-13_15-42-08
Started fold 000120/000365 - 2020-06-13_16-04-21
Started fold 000130/000365 - 2020-06-13_16-26-12
Started fold 000140/000365 - 2020-06-13_16-50-06
Started fold 000150/000365 - 2020-06-13_17-13-50
Started fold 000160/000365 - 2020-06-13_17-37-07
Started fold 000170/000365 - 2020-06-13_18-00-26
Started fold 000180/000365 - 2020-06-13_18-24-28
Started fold 000190/000365 - 2020-06-13_18-48-49
Started fold 000200/000365 - 2020-06-13_19-12-33
Started fold 000210/000365 - 2020-06-13_19-36-20
Started fold 000220/000365 - 2020-06-13_20-00-22
Started fold 000230/000365 - 2020-06-13_20-24-16
Started fold 000240/000365 - 2020-06-13_20-48-14
Started fold 000250/000365 - 2020-06-13_21-11-32
Started fold 000260/000365 - 2020-06-13_21-36-11
Started fold 000270/000365 - 2020-06-13_21-59-56
Started fold 000280/000365 - 2020-06-13_22-21-27
Started fold 000290/000365 - 2020-06-13_22-45-13
Started fold 000300/000365 - 2020-06-13_23-09-11
Started fold 000310/000365 - 2020-06-13_23-32-40
Started fold 000320/000365 - 2020-06-13_23-56-09
Started fold 000330/000365 - 2020-06-14_00-19-26
Started fold 000340/000365 - 2020-06-14_00-40-22
Started fold 000350/000365 - 2020-06-14_00-59-43
358
observed predicted error abs_error
3650 67.991848 75.416027 7.424179 7.424179
3651 16.026950 74.369728 58.342778 58.342778
3652 14.590020 26.050605 11.460584 11.460584
3653 22.094854 25.651497 3.556643 3.556643
3654 62.504217 32.889676 29.614540 29.614540
3655 43.929804 57.558177 13.628373 13.628373
3656 22.088192 61.956002 39.867810 39.867810
CPU times: user 3d 21h 17min 45s, sys: 13min 46s, total: 3d 21h 31min 32s
Wall time: 13h 1min 19s
# Model 2: ElasticNet
358
observed predicted error abs_error
3650 67.991848 52.050065 15.941783 15.941783
3651 16.026950 60.887953 44.861003 44.861003
3652 14.590020 21.886631 7.296611 7.296611
3653 22.094854 24.978848 2.883993 2.883993
3654 62.504217 29.182693 33.321524 33.321524
3655 43.929804 56.137409 12.207605 12.207605
3656 22.088192 42.861250 20.773058 20.773058
CPU times: user 1min 3s, sys: 6.68 s, total: 1min 9s
Wall time: 17.6 s
# Model 1: LinearRegression
358
observed predicted error abs_error
3650 67.991848 56.841447 11.150400 11.150400
3651 16.026950 61.386935 45.359985 45.359985
3652 14.590020 23.431795 8.841774 8.841774
3653 22.094854 31.352299 9.257445 9.257445
3654 62.504217 33.263056 29.241160 29.241160
3655 43.929804 59.087193 15.157389 15.157389
3656 22.088192 43.298562 21.210370 21.210370
CPU times: user 44.7 s, sys: 4.41 s, total: 49.2 s
Wall time: 18.1 s
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
path = f'results/pm25_ml_{model_name}_results_d_{timestamp}.joblib'
dump(fold_results, path)
fold_results = load(path)
print(len(fold_results))
print(fold_results[0])
%%time
# Returns a list of mean folds RMSE for n_pred_points (starting at 1 point forecast)
res = get_mean_folds_rmse_for_n_prediction_points(fold_results=fold_results, n_pred_points=n_pred_points)
res
print(res)
# Model 3: RandomForestRegressor
[9.130201994301995, 9.262425925925927, 9.13762849002849, 9.134805698005698, 9.326830769230769, 9.281815099715098, 9.317902849002849]
# Model 2: ElasticNet
[9.90203247863248, 10.054816239316239, 9.830852706552706, 9.85534358974359, 10.01102792022792, 9.93521396011396, 10.049378347578347]
# Model 1: LinearRegression
[9.462926210826211, 9.488959544159545, 9.400442735042732, 9.458610541310541, 9.551721652421653, 9.50755527065527, 9.564808262108263]
# Show forecasts for n-th point in the future
#show_n_points_of_forecasts = [1, 12, 24] # for hourly data
show_n_points_of_forecasts = [1, 3, 7] # for daily data
# Used to zoom the plots (date ranges shown in the plots)
# for hourly data
#start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-03-01'), ('2018-06-01', '2018-07-01')]
# for daily data
start_end_dates = [('2018-01-01', '2019-01-01'), ('2018-02-01', '2018-04-01'), ('2018-06-01', '2018-08-01')]
# Type of plot
# 0 -> plot_observed_vs_predicted
# 1 -> plot_observed_vs_predicted_with_error
plot_types = [0, 1, 1]
# File names for plots (format png will be used, do not add .png extension)
#base_file_path = f'images/pm25_obs_vs_pred_365_h_ts_{model_name}' # for hourly data
base_file_path = f'images/pm25_obs_vs_pred_365_d_ml_{model_name}' # for daily data
fold_results[0].index
# We need to convert to datetime index for plotting
# Replace integer-based index with datetime-based index
# We drop the first n_lags_in_dataset rows, because rows containing NaNs were removed
# when the dataset with t-n lags was created
df_datetime = get_pm25_data_for_modelling('ts', 'd')[n_lags_in_dataset:]
#df_datetime.head()
#df_datetime[0:1].index.strftime(dt_format)[0]
# Integer to datetime index using a mapper (ts version of the dataset)
for i in range(0, len(fold_results)):
if not isinstance(fold_results[i].index, pd.DatetimeIndex):
# Create a table of datetime indexes corresponding to integer indexes
datetime_indexes = []
integer_indexes = fold_results[i].index
for ix in integer_indexes:
datetime_indexes.append(df_datetime[ix:ix+1].index.strftime(dt_format)[0])
#print(datetime_indexes)
# Apply datetime_indexes as index for fold data
fold_results[i].index = pd.to_datetime(datetime_indexes)
fold_results[0].index
visualize_results(show_n_points_of_forecasts=show_n_points_of_forecasts,
start_end_dates=start_end_dates,
plot_types=plot_types,
base_file_path=base_file_path,
fold_results=fold_results,
n_pred_points=n_pred_points,
cut_off_offset=cut_off_offset,
model_name=model_name,
timestamp=timestamp)
from joblib import dump, load
dump(model, 'model.joblib')
model2 = load('model.joblib')
model2.predict(X_test[0:20])
model[0]
from joblib import dump, load
timestamp = get_datetime_identifier("%Y-%m-%d_%H-%M-%S")
for model in models:
path = f'results/pm25_ml_{model[0]}_model_d_{timestamp}.joblib'
dump(model, path)
%%time
path = f'results/pm25_ml_RandomForestRegressor_model_d_2020-06-12_18-06-21.joblib'
model2 = load(path)
print(model2[0], model2[1])
model2_fitted = model2[1].fit(X_train, y_train)
print(model2_fitted.predict(X_test[0:1]))
print(y_test[0:1])
RandomForestRegressor RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=4,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=-1, oob_score=False,
random_state=None, verbose=0, warm_start=False)
[74.06882094]
3650 67.991848
Name: t, dtype: float64
CPU times: user 46.5 s, sys: 191 ms, total: 46.7 s
Wall time: 7.12 s