
0 Baseline Control

import sys
import numpy as np
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')

from data.make_dataset import DataLoad

Helper Functions

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def get_ml_stats(ypred, ytest, stat='r2', output='avg'):

    # each sample or average
    if output == 'avg':
        multioutput = 'uniform_average'
    elif output == 'raw':
        multioutput = 'raw_values'
    elif output == 'var':
        multioutput = 'variance_weighted'
    else:
        raise ValueError(f"Unrecognized output param: {output}")

    if stat == 'r2':
        res = r2_score(ytest, ypred, multioutput=multioutput)
    elif stat == 'mse':
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'mae':
        res = mean_absolute_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'rmse':
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
        res = np.sqrt(res)
    else:
        raise ValueError(f"Unrecognized stat param: {stat}")

    return res 


def print_stats(ypred, ytest, transformer=None):
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)

    for istat in ['r2', 'mse', 'mae', 'rmse']:

        print(f"{istat.upper()}: {get_ml_stats(ypred, ytest, stat=istat, output='avg')}")
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_stat_levels(ypred, ytest, stat='r2', transformer=None):

    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)

    # per-output (per depth level) metric values
    res = get_ml_stats(ypred, ytest, stat=stat, output='raw')

    fig, ax = plt.subplots()
    ax.plot(res)
    ax.set(xlabel='Output (depth level)', ylabel=stat.upper())
    plt.show()
dataloader = DataLoad()

X, y = dataloader.load_control_data('na')

X = X[dataloader.core_vars]                       # keep the core predictor variables
y = np.log(y.drop(dataloader.meta_vars, axis=1))  # drop metadata columns, log-transform targets
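Since np.log returns -inf or NaN for non-positive values, a quick guard (sketch) confirms the transform behaved:

# guard: the log transform assumes strictly positive targets
assert np.isfinite(y.values).all()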

Train-Test Split

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    X.values, y.values, train_size=0.8, random_state=123
)

Normalization

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# normalize X
x_scaler = StandardScaler()
xtrain_norm = x_scaler.fit_transform(xtrain)
xtest_norm = x_scaler.transform(xtest)

# center Y (a PCA decomposition was also considered; kept for reference)
n_components = 10
random_state = 123
# y_scaler = PCA(n_components=n_components, random_state=random_state)
y_scaler = StandardScaler(with_std=False)  # mean-centering only, no scaling

ytrain_norm = y_scaler.fit_transform(ytrain)
ytest_norm = y_scaler.transform(ytest)
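For reference, the commented-out PCA option would compress the target profiles into principal components rather than just centering them. A minimal sketch using the already-defined n_components and random_state (stored under new names so the pipeline above is untouched):

# alternative (sketch): decompose Y with PCA instead of only centering it
y_pca = PCA(n_components=n_components, random_state=random_state)
ytrain_pca = y_pca.fit_transform(ytrain)   # rotate targets into PC space
ytest_pca = y_pca.transform(ytest)
# a model fit on ytrain_pca would need y_pca.inverse_transform on its predictions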

Baseline ML Models

Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
# lin_model = TransformedTargetRegressor(
#     LinearRegression(n_jobs=-1),
#     transformer=y_scaler,
#     check_inverse=False,
# )

lin_model = LinearRegression(n_jobs=-1)


lin_model.fit(xtrain_norm, ytrain_norm);
ypred = lin_model.predict(xtest_norm)
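The commented-out TransformedTargetRegressor above is worth noting: it applies the target transform at fit time and inverts it at predict time, so predictions come back in the original (uncentered) space. A minimal sketch with the same pieces:

# sketch: wrap the target transform so predict() returns uncentered values
tt_model = TransformedTargetRegressor(
    regressor=LinearRegression(n_jobs=-1),
    transformer=StandardScaler(with_std=False),
    check_inverse=False,
)
tt_model.fit(xtrain_norm, ytrain)          # centering handled internally
ypred_tt = tt_model.predict(xtest_norm)    # already back in the original space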

Validation Set

yvalid = lin_model.predict(xtrain_norm)  # in-sample (training-set) predictions, in the centered space

print_stats(yvalid, ytrain)  # note: ytrain is uncentered here; see the sketch below
R2: -494.5810297553773
MSE: 61.37957974432133
MAE: 7.810819972645941
RMSE: 7.834512093571707
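The large negative R2 above is mostly a bookkeeping artifact: yvalid lives in the mean-centered space while ytrain is uncentered. Comparing in a consistent space (sketch) gives the honest picture:

# compare prediction and target in the same (centered) space
print_stats(yvalid, ytrain_norm)
# or report both on the original scale via the transformer hook
print_stats(yvalid, ytrain_norm, transformer=y_scaler)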
plot_stat_levels(yvalid, ytrain, stat='mae')
plot_stat_levels(yvalid, ytrain, stat='mse')
plot_stat_levels(yvalid, ytrain, stat='rmse')
plot_stat_levels(yvalid, ytrain, stat='r2')
Test Set

plot_stat_levels(ypred, ytest, stat='mae')
plot_stat_levels(ypred, ytest, stat='r2')

Random Forest Regression

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
rf_clf = RandomForestRegressor(
    bootstrap=True, criterion='mse', max_depth=100,
    max_features=4, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=4,
    min_weight_fraction_leaf=0.0, n_estimators=1000,
    n_jobs=-1, oob_score=False, random_state=None,
    verbose=0, warm_start=False
)

rf_clf.fit(xtrain_norm, ytrain)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

Validation Set

yvalid = rf_clf.predict(xtrain_norm)  # in-sample (training-set) predictions

print_stats(yvalid, ytrain)
R2: 0.8204574133134702
MSE: 0.03028096625860035
MAE: 0.12294711155916899
RMSE: 0.17401427027287258
plot_stat_levels(yvalid, ytrain, stat='mae')
plot_stat_levels(yvalid, ytrain, stat='mse')
plot_stat_levels(yvalid, ytrain, stat='rmse')
plot_stat_levels(yvalid, ytrain, stat='r2')

Test Set

ypred = rf_clf.predict(xtest_norm)
print_stats(ypred, ytest)
R2: 0.2538318989517747
MSE: 0.12075259233622311
MAE: 0.24865150289773952
RMSE: 0.34749473713456885
plot_stat_levels(ypred, ytest, stat='mae')
plot_stat_levels(ypred, ytest, stat='mse')
plot_stat_levels(ypred, ytest, stat='rmse')
plot_stat_levels(ypred, ytest, stat='r2')

Cross-Validated

# parameter space
n_estimators = [int(x) for x in np.linspace(200, 2_000, 10)]
max_features = sp_randint(1, 8)
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
min_samples_split = sp_randint(2, 11)
min_samples_leaf = sp_randint(1, 4)
bootstrap = [True, False]
criterion = ['mse', 'mae']  # regression criteria (defined but not searched in the grid below)

param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}
rf_clf = RandomForestRegressor()

# grid search params
n_iter = 1_000
cv = 3
verbose = 2
random_state = 123
n_jobs = -1

rf_rand = RandomizedSearchCV(
    estimator=rf_clf, 
    param_distributions=param_grid, 
    n_iter=n_iter, 
    cv=cv, 
    verbose=verbose, 
    random_state=random_state, 
    n_jobs=n_jobs
)

# fit to model
rf_rand.fit(xtrain_norm, ytrain_norm);

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 28 concurrent workers.
/home/emmanuel/.conda/envs/ml4ocn/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py:706: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  "timeout or by a memory leak.", UserWarning
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 309 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 592 tasks      | elapsed: 38.7min
[Parallel(n_jobs=-1)]: Done 957 tasks      | elapsed: 63.9min
rf_rand.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
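GridSearchCV is imported above but never used; a common follow-up (values below are illustrative, not tuned) is a small exhaustive grid around the random-search winner:

# hypothetical refinement grid around the best random-search parameters
refine_grid = {
    'n_estimators': [800, 1000, 1200],
    'max_features': [3, 4, 5],
    'min_samples_split': [3, 4, 5],
}
rf_grid = GridSearchCV(
    RandomForestRegressor(max_depth=100, n_jobs=-1),
    param_grid=refine_grid, cv=cv, verbose=verbose,
)
# rf_grid.fit(xtrain_norm, ytrain_norm)  # expensive; uncomment to run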
ypred = rf_rand.predict(xtest_norm)  # predict via the fitted search object (rf_clf itself was never fitted)
ypred.shape, ytest_norm.shape
((605, 10), (605, 10))
print_stats(ypred, ytest_norm)
R2: -9.242152392080005
MSE: 3.915759598875972e-06
MAE: 0.0009622053611490738
RMSE: 0.0019788278345717627
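As with the linear model, the transformer hook can undo the centering before scoring (print_stats takes the same argument as plot_stat_levels):

# scores in the original (uncentered) target space
print_stats(ypred, ytest_norm, transformer=y_scaler)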
plot_stat_levels(ypred, ytest_norm)
plot_stat_levels(ypred, ytest_norm, stat='r2', transformer=y_scaler)
