0 baseline control
import sys
import numpy as np
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')
from data.make_dataset import DataLoad
Helper Functions¶
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def get_ml_stats(ypred, ytest, stat='r2', output='avg'):
    """Compute one regression metric between predictions and targets.

    Parameters
    ----------
    ypred : array-like
        Predicted values.
    ytest : array-like
        Ground-truth values.
    stat : str
        One of 'r2', 'mse', 'mae', 'rmse'.
    output : str
        'avg' -> uniform average over outputs, 'raw' -> one value per
        output column, 'var' -> variance-weighted average.

    Returns
    -------
    float or np.ndarray
        The requested score (array when output='raw').
    """
    # Translate the short code into sklearn's multioutput keyword.
    multioutput_codes = {
        'avg': 'uniform_average',
        'raw': 'raw_values',
        'var': 'variance_weighted',
    }
    if output not in multioutput_codes:
        raise ValueError(f"Unrecognized output param: {output}")
    multioutput = multioutput_codes[output]

    # Dispatch table for the metric; rmse reuses mse and takes a sqrt.
    metric_fns = {
        'r2': r2_score,
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
        'rmse': mean_squared_error,
    }
    if stat not in metric_fns:
        raise ValueError(f"Unrecognized stat param: {stat}")

    # sklearn convention: ground truth first, predictions second.
    score = metric_fns[stat](ytest, ypred, multioutput=multioutput)
    if stat == 'rmse':
        score = np.sqrt(score)
    return score
def print_stats(ypred, ytest, transformer=None):
    """Print R2, MSE, MAE and RMSE, each averaged over all outputs.

    If `transformer` is given, predictions and targets are mapped back
    through `inverse_transform` before scoring, so metrics are reported
    in the original (untransformed) target space.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    for istat in ('r2', 'mse', 'mae', 'rmse'):
        score = get_ml_stats(ypred, ytest, stat=istat, output='avg')
        print(f"{istat.upper()}: {score}")
import matplotlib.pyplot as plt
plt.style.use('ggplot')
def plot_stat_levels(ypred, ytest, stat='r2', transformer=None):
    """Plot the chosen metric computed separately for each output column.

    Optionally inverse-transforms both arrays first so the metric is
    evaluated in the original target space.
    """
    if transformer is not None:
        ypred, ytest = (
            transformer.inverse_transform(ypred),
            transformer.inverse_transform(ytest),
        )
    per_level = get_ml_stats(ypred, ytest, stat=stat, output='raw')
    fig, ax = plt.subplots()
    ax.plot(per_level)
    plt.show()
# Load the control experiment data for the 'na' (North Atlantic) region.
dataloader = DataLoad()
X, y = dataloader.load_control_data('na')
# Keep only the core predictor variables as inputs.
X = X[dataloader.core_vars]
# Drop metadata columns and log-transform the targets
# (presumably because they span orders of magnitude — TODO confirm).
y = np.log(y.drop(dataloader.meta_vars, axis=1))
Train Test Split¶
from sklearn.model_selection import train_test_split
# 80/20 train/test split with a fixed seed for reproducibility.
xtrain, xtest, ytrain, ytest = train_test_split(
    X.values, y.values, train_size=0.8, random_state=123
)
Normalization¶
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# normalize X
x_scaler = StandardScaler()
xtrain_norm = x_scaler.fit_transform(xtrain)
# Reuse training-set statistics on the test set (no leakage).
xtest_norm = x_scaler.transform(xtest)
# decompose Y
n_components = 10
random_state = 123
# y_scaler = PCA(n_components=n_components, random_state=random_state)
# NOTE(review): with_std=False means y is only mean-centered, not scaled;
# the PCA alternative above is left commented out.
y_scaler = StandardScaler(with_std=False)
ytrain_norm = y_scaler.fit_transform(ytrain)
ytest_norm = y_scaler.transform(ytest)
Baseline ML Models¶
Linear Regression¶
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
# lin_model = TransformedTargetRegressor(
#     LinearRegression(n_jobs=-1),
#     transformer=y_scaler,
#     check_inverse=False,
# )
# Ordinary least squares on normalized inputs and centered targets.
lin_model = LinearRegression(n_jobs=-1)
lin_model.fit(xtrain_norm, ytrain_norm);
# Test-set predictions (in the centered target space).
ypred = lin_model.predict(xtest_norm)
Validation Set¶
# "Validation" here means predictions on the training inputs themselves.
yvalid = lin_model.predict(xtrain_norm)
# NOTE(review): yvalid lives in the centered target space (model was fit on
# ytrain_norm) while ytrain is uncentered — either compare against
# ytrain_norm or pass transformer=y_scaler. Confirm intent.
print_stats(yvalid, ytrain)
plot_stat_levels(yvalid, ytrain, stat='mae')
plot_stat_levels(yvalid, ytrain, stat='mse')
plot_stat_levels(yvalid, ytrain, stat='rmse')
plot_stat_levels(yvalid, ytrain, stat='r2')
# Test-set metrics per output level.
plot_stat_levels(ypred, ytest, stat='mae', )
plot_stat_levels(ypred, ytest, stat='r2', )
Random Forest Regression¶
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
# Random forest with hand-picked hyperparameters.
# NOTE(review): criterion='mse' was renamed to 'squared_error' and
# min_impurity_split was removed in recent scikit-learn releases — this
# call only works on older versions.
rf_clf = RandomForestRegressor(
    bootstrap=True, criterion='mse', max_depth=100,
    max_features=4, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=4,
    min_weight_fraction_leaf=0.0, n_estimators=1000,
    n_jobs=-1, oob_score=False, random_state=None,
    verbose=0, warm_start=False
)
# Fit on normalized inputs but *uncentered* targets — unlike the linear
# model above, which was fit on ytrain_norm.
rf_clf.fit(xtrain_norm, ytrain)
Validation Set¶
# Train-set ("validation") predictions; targets here are the raw ytrain,
# matching what the forest was fit on.
yvalid = rf_clf.predict(xtrain_norm)
print_stats(yvalid, ytrain)
plot_stat_levels(yvalid, ytrain, stat='mae')
plot_stat_levels(yvalid, ytrain, stat='mse')
plot_stat_levels(yvalid, ytrain, stat='rmse')
plot_stat_levels(yvalid, ytrain, stat='r2')
Test Set¶
# Held-out test-set predictions and per-level metric plots.
ypred = rf_clf.predict(xtest_norm)
print_stats(ypred, ytest)
plot_stat_levels(ypred, ytest, stat='mae')
plot_stat_levels(ypred, ytest, stat='mse')
plot_stat_levels(ypred, ytest, stat='rmse')
plot_stat_levels(ypred, ytest, stat='r2')
Cross Validated¶
# parameter space
# Discrete lists are sampled uniformly; sp_randint gives integer ranges.
n_estimators = [int(x) for x in np.linspace(200, 2_000, 10)]
max_features = sp_randint(1, 8)
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
min_samples_split = sp_randint(2, 11)
min_samples_leaf = sp_randint(1, 4)
bootstrap = [True, False]
# NOTE(review): 'gini'/'entropy' are classification criteria, and
# 'criterion' is never added to param_grid below — this list is unused.
criterion = ['gini', 'entropy', 'mse']
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}
# Fresh, default-parameter forest for the search to clone.
rf_clf = RandomForestRegressor()
# grid search params
n_iter = 1_000
cv = 3
verbose = 2
random_state = 123
n_jobs = -1
rf_rand = RandomizedSearchCV(
    estimator=rf_clf, 
    param_distributions=param_grid, 
    n_iter=n_iter, 
    cv=cv, 
    verbose=verbose, 
    random_state=random_state, 
    n_jobs=n_jobs
)
# fit to model
# Note the search is fit on centered targets (ytrain_norm), unlike the
# hand-tuned forest above.
rf_rand.fit(xtrain_norm, ytrain_norm);
rf_rand.best_estimator_
rf_rand.best_estimator_
ypred = rf_rand.predict(xtest_norm)
# NOTE(review): this line overwrites the CV predictions with those of the
# *unfitted* rf_clf created above (RandomizedSearchCV clones its estimator),
# which should raise NotFittedError — likely a leftover line to delete.
ypred = rf_clf.predict(xtest_norm)
ypred.shape, ytest_norm.shape
print_stats(ypred, ytest_norm)
plot_stat_levels(ypred, ytest_norm)
plot_stat_levels(ypred, ytest_norm, stat='r2', transformer=y_scaler)
Statistics¶
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def get_ml_stats(ypred, ytest, stat='r2', output='avg'):
    """Compute one regression metric between predictions and targets.

    Parameters
    ----------
    ypred : array-like
        Predicted values.
    ytest : array-like
        Ground-truth values.
    stat : str
        One of 'r2', 'mse', 'mae', 'rmse'.
    output : str
        'avg' -> uniform average over outputs, 'raw' -> one value per
        output column, 'var' -> variance-weighted average.

    Returns
    -------
    float or np.ndarray
        The requested score (array when output='raw').

    Raises
    ------
    ValueError
        If `stat` or `output` is not one of the recognized codes.
    """
    # each sample or average
    if output == 'avg':
        multioutput = 'uniform_average'
    elif output == 'raw':
        multioutput = 'raw_values'
    elif output == 'var':
        multioutput = 'variance_weighted'
    else:
        raise ValueError(f"Unrecognized output param: {output}")
    # BUG FIX: sklearn metrics take (y_true, y_pred); this copy passed
    # (ypred, ytest), flipping them. MSE/MAE/RMSE are symmetric, but
    # r2_score is not, so R2 values were wrong. The argument order now
    # matches the first definition of this function earlier in the file.
    if stat == 'r2':
        res = r2_score(ytest, ypred, multioutput=multioutput)
    elif stat == 'mse':
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'mae':
        res = mean_absolute_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'rmse':
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
        res = np.sqrt(res)
    else:
        raise ValueError(f"Unrecognized stat param: {stat}")
    return res
def print_stats(ypred, ytest, transformer=None):
    """Report the average R2, MSE, MAE and RMSE over all outputs.

    When `transformer` is supplied, both arrays are first mapped back
    through `inverse_transform`, so scores are given in the original
    target space.
    """
    if transformer is not None:
        ypred, ytest = (
            transformer.inverse_transform(ypred),
            transformer.inverse_transform(ytest),
        )
    stats_to_report = ('r2', 'mse', 'mae', 'rmse')
    for istat in stats_to_report:
        value = get_ml_stats(ypred, ytest, stat=istat, output='avg')
        print(f"{istat.upper()}: {value}")
import matplotlib.pyplot as plt
plt.style.use('ggplot')
def plot_stat_levels(ypred, ytest, stat='r2', transformer=None):
    """Plot the per-output-column value of the chosen metric.

    An optional `transformer` maps both arrays back to the original
    target space (via `inverse_transform`) before scoring.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    levels = get_ml_stats(ypred, ytest, stat=stat, output='raw')
    fig, ax = plt.subplots()
    ax.plot(levels)
    plt.show()