Control Group - Full Pipeline¶
Steps:¶
- Load Control Group
- Coordinate Transform: lat, lon -> x, y, z
- Circular Transform: doy -> sin, cos
- PCA Transform the high-dimensional profiles
- Log Transform the outputs
- Split Data (train, test)
- Normalize inputs and outputs
- Apply ML Models
import sys
import time
import numpy as np
import pandas as pd
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')
from data.make_dataset import DataLoader
from features.pca_features import transform_all
from models.baseline import train_glm_model, train_rf_model
from features.build_features import times_2_cycles, geo_2_cartesian
from visualization.visualize import plot_mo_stats
%load_ext autoreload
%autoreload 2
Step I - Load Data¶
Input Data¶
dataloader = DataLoader()
X = dataloader.load_control_data('na')
X = X[dataloader.core_vars + dataloader.loc_vars]
X.head()
High-Dimensional Variables¶
X_temp = dataloader.load_control_temperature('na')
X_dens = dataloader.load_control_density('na')
X_sal = dataloader.load_control_salinity('na')
X_spicy = dataloader.load_control_spicy('na')
X_temp = X_temp.drop(dataloader.meta_vars, axis=1)
X_dens = X_dens.drop(dataloader.meta_vars, axis=1)
X_sal = X_sal.drop(dataloader.meta_vars, axis=1)
X_spicy = X_spicy.drop(dataloader.meta_vars, axis=1)
X_dens.head()
Labels¶
y = dataloader.load_control_ouputs('na')
y = y.drop(dataloader.meta_vars, axis=1)
X.shape
Step II - Coordinate Transform: lat, lon -> x, y, z¶
X = geo_2_cartesian(X)
X.describe()
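For reference, geo_2_cartesian lives in features.build_features; a minimal sketch of the transform it presumably applies, assuming degree-valued 'lat'/'lon' columns (hypothetical names) projected onto the unit sphere:
def geo_2_cartesian_sketch(df):
    # hypothetical stand-in: map (lat, lon) in degrees onto the unit sphere
    df = df.copy()
    lat = np.deg2rad(df.pop('lat'))
    lon = np.deg2rad(df.pop('lon'))
    df['x'] = np.cos(lat) * np.cos(lon)
    df['y'] = np.cos(lat) * np.sin(lon)
    df['z'] = np.sin(lat)
    return df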
Step III - DOY Transform: doy -> sin, cos¶
times = ['doy']
X = times_2_cycles(X, times)
X.describe()
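The circular encoding maps each day-of-year onto the unit circle, so day 365 and day 1 end up adjacent rather than a full year apart. A sketch of what times_2_cycles plausibly does, assuming a single shared period:
def times_2_cycles_sketch(df, times, period=365.25):
    # hypothetical stand-in: encode each cyclic column as a (sin, cos) pair
    df = df.copy()
    for col in times:
        theta = 2 * np.pi * df.pop(col) / period
        df[f'{col}_sin'] = np.sin(theta)
        df[f'{col}_cos'] = np.cos(theta)
    return df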
PCA Transform Spectrum¶
X_red, pca_model = transform_all(X_temp, X_sal, X_dens, X_spicy)
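transform_all is project code; a plausible sketch of what it does, assuming one PCA per depth-resolved variable and a hypothetical n_components:
from sklearn.decomposition import PCA

def transform_all_sketch(*profiles, n_components=10):
    # hypothetical stand-in: reduce each profile block with its own PCA
    # and concatenate the reduced blocks column-wise
    models, blocks = [], []
    for block in profiles:
        pca = PCA(n_components=n_components)
        blocks.append(pca.fit_transform(block))
        models.append(pca)
    return np.hstack(blocks), models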
Step IV - Log Transform: y -> log(y)¶
y_log = np.log(y)
Step V - Train-Test Split¶
from sklearn.model_selection import train_test_split
random_state = 123
train_size = 0.8
xtrain, xtest, ytrain, ytest = train_test_split(
    np.hstack([X, X_red]), y_log, train_size=train_size, random_state=random_state
)
Step VI - Normalize Data¶
from sklearn.preprocessing import StandardScaler, Normalizer
# Normalize inputs
x_transformer = StandardScaler(with_mean=True, with_std=True)
# x_transformer = Normalizer(norm='max')
xtrain_norm = x_transformer.fit_transform(xtrain)
xtest_norm = x_transformer.transform(xtest)
# Normalize Outputs
y_transformer = StandardScaler(with_mean=True, with_std=True)
ytrain_norm = y_transformer.fit_transform(ytrain)
ytest_norm = y_transformer.transform(ytest)
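All models below are trained and scored in this standardized log space. To report predictions in the original units, both transforms have to be undone; a sketch (not used in the evaluations below):
def to_original_units(ypred_norm):
    # undo the output standardization, then the Step IV log transform
    return np.exp(y_transformer.inverse_transform(ypred_norm))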
Step VII - ML Models¶
Model I - Random Forest Regression¶
rf_model = train_rf_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = rf_model.predict(xtest_norm)
t1 = time.time() - t0
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
Model II - Generalized Linear Model¶
gl_model = train_glm_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = gl_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
Model III - MLPRegressor¶
from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(
hidden_layer_sizes=150,
activation='relu',
batch_size=100,
max_iter=10_000,
solver='adam',
verbose=False,
warm_start=True,
learning_rate='adaptive'
)
t0 = time.time()
mlp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = mlp_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
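With max_iter=10_000 the MLP is free to overfit the training set; scikit-learn's built-in early stopping is a cheap safeguard. A variant (defined only, not used in the results above):
mlp_es = MLPRegressor(
    hidden_layer_sizes=150,
    activation='relu',
    batch_size=100,
    max_iter=10_000,
    solver='adam',
    learning_rate='adaptive',
    early_stopping=True,       # hold out part of the training data ...
    validation_fraction=0.1,   # ... and stop when its score stops improving
    n_iter_no_change=20,       # patience, in epochs
)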
Model IV - Gaussian Process Regressor¶
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern
# define kernel function
kernel = ConstantKernel() * Matern(nu=2.5) + WhiteKernel()
# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=False,        # outputs were already standardized in Step VI
    random_state=123,         # reproducibility
    n_restarts_optimizer=0,   # >0 would restart the optimizer from new initializations to escape local minima (slower)
)
# train GP Model
t0 = time.time()
gp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
    f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = gp_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
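Unlike the other models, the GP also provides predictive uncertainty; predict exposes it via return_std:
# point predictions plus per-point predictive standard deviation
ymu, ystd = gp_model.predict(xtest_norm, return_std=True)
print(f"mean predictive std: {ystd.mean():.3f}")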
import matplotlib.pyplot as plt
%matplotlib inline

importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# xtest_norm is a numpy array, so rebuild the feature names by hand:
# the columns of X followed by the PCA components (hypothetical names)
labels = list(X.columns) + [f"pca_{i}" for i in range(X_red.shape[1])]
labels = [labels[i] for i in indices]
n_features = xtrain_norm.shape[1]
# Print the feature ranking
print("Feature ranking:")
for f in range(n_features):
    print("%d. feature %s (%f)" % (f + 1, labels[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), labels, rotation='vertical')
plt.xlim([-1, n_features])
plt.show()
RF w. Bagging¶
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
rf_model = RandomForestRegressor(
    n_estimators=1_000,
    max_depth=500,
    max_features=4,
    min_samples_split=4,
    n_jobs=-1,
    random_state=123,
)
# bagging over an already-bagged model: each base estimator is a full random forest
bagged_rf_model = BaggingRegressor(
    base_estimator=rf_model,
)
t0 = time.time()
bagged_rf_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = bagged_rf_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred_red, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_red, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_red, multioutput='raw_values')
plt.style.use('seaborn')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')
# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')
# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')
# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')
plt.tight_layout()
plt.show()
XGBoost¶
import xgboost
from xgboost import plot_importance
from sklearn.multioutput import MultiOutputRegressor
xgbm_model = xgboost.XGBRFRegressor(
n_estimators=500,
random_state=123,
)
gbm_model_mo = MultiOutputRegressor(
xgbm_model,
n_jobs=-1
)
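Note: XGBRFRegressor is XGBoost's random-forest mode rather than its usual gradient boosting, and it fits a single target at a time, so MultiOutputRegressor trains one independent copy per output column (per depth), parallelized with n_jobs=-1.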
t0 = time.time()
gbm_model_mo.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = gbm_model_mo.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
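Each per-depth estimator inside the wrapper is a standalone XGBoost model, so the plot_importance helper imported above can be applied to any one of them, e.g. the first depth (illustrative only):
# feature importances for the model fitted to the first output (depth)
plot_importance(gbm_model_mo.estimators_[0], max_num_features=15)
plt.show()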