
Control Group - Full Pipeline

Steps:

  1. Load Control Group
  2. Transform Variables
    • Coordinate Transform: lat, lon -> x, y, z
    • Circular Transform: doy -> sin, cos
    • PCA Transform: high-dimensional profiles
    • Log Transform: outputs
  3. Split Data (train, test)
  4. Normalize
  5. Apply ML Models
import sys
import time
import numpy as np
import pandas as pd
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')

from data.make_dataset import DataLoader
from features.pca_features import transform_all, transform_individual
from models.baseline import train_glm_model, train_rf_model
from features.build_features import times_2_cycles, geo_2_cartesian
from visualization.visualize import plot_mo_stats

%load_ext autoreload
%autoreload 2

Step I - Load Data

Input Data

dataloader = DataLoader()

X = dataloader.load_control_data('na')

X = X[dataloader.core_vars + dataloader.loc_vars]
X.head()
sla PAR RHO_WN_412 RHO_WN_443 RHO_WN_490 RHO_WN_555 RHO_WN_670 MLD lat lon doy
0 5.421500 44.7510 0.026754 0.021997 0.016902 0.004928 0.000069 50 12.073372 -27.221187 1
1 5.845199 45.2395 0.025535 0.020452 0.016493 0.004793 0.000119 58 12.199892 -27.042640 6
2 0.756100 50.9293 0.007764 0.009328 0.009720 0.003551 0.000421 52 11.554463 -26.491484 36
3 0.326600 50.8669 0.007064 0.008832 0.009639 0.003629 0.000407 62 11.441130 -26.471429 41
4 0.326300 53.1347 0.020773 0.019439 0.015983 0.005122 0.000562 54 11.427888 -26.447072 46

High-Dimensional Variables

X_temp = dataloader.load_control_temperature('na')
X_dens = dataloader.load_control_density('na')
X_sal = dataloader.load_control_salinity('na')
X_spicy = dataloader.load_control_spicy('na')

X_temp = X_temp.drop(dataloader.meta_vars, axis=1)
X_dens = X_dens.drop(dataloader.meta_vars, axis=1)
X_sal = X_sal.drop(dataloader.meta_vars, axis=1)
X_spicy = X_spicy.drop(dataloader.meta_vars, axis=1)
X_dens.head()
2 3 4 5 6 7 8 9 10 11 ... 268 269 270 271 272 273 274 275 276 277
0 23.711214 23.711194 23.711174 23.710776 23.710981 23.709987 23.710789 23.710921 23.711054 23.710895 ... 27.425111 27.429018 27.432919 27.437502 27.442085 27.446296 27.450505 27.455074 27.456491 27.456548
1 24.023780 24.023994 24.025094 24.025048 24.025654 24.026066 24.027117 24.025887 24.026772 24.026041 ... 27.412035 27.415626 27.419206 27.422661 27.426113 27.430047 27.433977 27.437393 27.440061 27.440118
2 23.913493 23.907041 23.901690 23.904039 23.905319 23.906210 23.907500 23.908285 23.909999 23.911076 ... 27.454562 27.457528 27.460493 27.462936 27.465380 27.469269 27.473241 27.479211 27.481279 27.481336
3 24.396537 24.396876 24.397903 24.397877 24.397416 24.397825 24.397364 24.397929 24.396334 24.397899 ... 27.446174 27.451945 27.457795 27.463156 27.468474 27.473725 27.478971 27.483621 27.484958 27.485015
4 24.351773 24.351864 24.355220 24.355003 24.354738 24.356280 24.359972 24.365122 24.366770 24.367167 ... 27.453507 27.457883 27.462220 27.465383 27.468545 27.472503 27.475551 27.475608 27.475665 27.475723

5 rows × 276 columns

Labels

y = dataloader.load_control_ouputs('na')
y = y.drop(dataloader.meta_vars, axis=1)

X.shape
(3022, 11)

Step II - Coordinate Transform: lat, lon -> x, y, z

X = geo_2_cartesian(X)
X.describe()
sla PAR RHO_WN_412 RHO_WN_443 RHO_WN_490 RHO_WN_555 RHO_WN_670 MLD doy x y z
count 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000
mean -1.023435 42.587315 0.017911 0.015458 0.013138 0.006466 0.000970 92.319656 165.976837 0.424498 -0.278527 0.803373
std 6.241974 13.341398 0.010524 0.007983 0.005180 0.002834 0.000447 145.681095 69.581352 0.166646 0.175314 0.195811
min -47.530300 7.658160 0.000789 0.001000 0.001829 0.002191 0.000004 10.000000 1.000000 0.102689 -0.723914 0.121478
25% -4.710750 33.493225 0.010973 0.010070 0.010000 0.004785 0.000662 20.000000 116.000000 0.333212 -0.409115 0.806248
50% -1.061400 44.568950 0.014952 0.013515 0.012166 0.005929 0.000991 40.000000 161.000000 0.396892 -0.284740 0.869006
75% 2.866325 53.848125 0.021841 0.018442 0.015337 0.007369 0.001151 82.000000 218.000000 0.475388 -0.174620 0.905730
max 29.168600 63.337100 0.063060 0.052012 0.053483 0.036277 0.005287 995.000000 366.000000 0.896169 0.080525 0.980676
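For reference, a minimal sketch of the unit-sphere embedding that geo_2_cartesian is assumed to perform (the actual implementation lives in features.build_features):

import numpy as np

def geo_2_cartesian_sketch(lat, lon):
    """Map latitude/longitude in degrees to points on the unit sphere.
    Assumption: this mirrors what geo_2_cartesian does internally."""
    lat_rad, lon_rad = np.deg2rad(lat), np.deg2rad(lon)
    x = np.cos(lat_rad) * np.cos(lon_rad)
    y = np.cos(lat_rad) * np.sin(lon_rad)
    z = np.sin(lat_rad)
    return x, y, z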

Step III - DOY Transform: doy -> sin, cos

times = ['doy']
X = times_2_cycles(X, times)
X.describe()
sla PAR RHO_WN_412 RHO_WN_443 RHO_WN_490 RHO_WN_555 RHO_WN_670 MLD x y z doy_sin doy_cos
count 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000 3022.000000
mean -1.023435 42.587315 0.017911 0.015458 0.013138 0.006466 0.000970 92.319656 0.424498 -0.278527 0.803373 0.158825 -0.450884
std 6.241974 13.341398 0.010524 0.007983 0.005180 0.002834 0.000447 145.681095 0.166646 0.175314 0.195811 0.695279 0.536955
min -47.530300 7.658160 0.000789 0.001000 0.001829 0.002191 0.000004 10.000000 0.102689 -0.723914 0.121478 -0.999991 -0.999963
25% -4.710750 33.493225 0.010973 0.010070 0.010000 0.004785 0.000662 20.000000 0.333212 -0.409115 0.806248 -0.530730 -0.882048
50% -1.061400 44.568950 0.014952 0.013515 0.012166 0.005929 0.000991 40.000000 0.396892 -0.284740 0.869006 0.313107 -0.618671
75% 2.866325 53.848125 0.021841 0.018442 0.015337 0.007369 0.001151 82.000000 0.475388 -0.174620 0.905730 0.826354 -0.200891
max 29.168600 63.337100 0.063060 0.052012 0.053483 0.036277 0.005287 995.000000 0.896169 0.080525 0.980676 0.999991 1.000000
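A sketch of the cyclical encoding that times_2_cycles is assumed to apply, so the year wraps smoothly (day 366 sits next to day 1); the 365.25-day period is an assumption:

def times_2_cycles_sketch(df, columns, period=365.25):
    """Encode each time column as a (sin, cos) pair; period is an assumed value."""
    df = df.copy()
    for col in columns:
        radians = 2 * np.pi * df[col] / period
        df[f"{col}_sin"] = np.sin(radians)
        df[f"{col}_cos"] = np.cos(radians)
        df = df.drop(columns=[col])
    return df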

PCA Transform - High-Dimensional Profiles

X_red, pca_model = transform_all(X_temp, X_sal, X_dens, X_spicy)
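transform_all reduces each ~276-dimensional profile before modelling. A minimal sketch of the assumed behaviour, with n_components=10 as a hypothetical choice:

from sklearn.decomposition import PCA

def transform_all_sketch(*profiles, n_components=10):
    """Fit one PCA per high-dimensional profile and stack the reduced features."""
    reduced, models = [], []
    for profile in profiles:
        pca = PCA(n_components=n_components)
        reduced.append(pca.fit_transform(profile))
        models.append(pca)
    return np.hstack(reduced), models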

Step IV - Log Transform: y

# log-transform the outputs (assumes strictly positive values)
y_log = np.log(y)

Step V - Train-Test Split

from sklearn.model_selection import train_test_split


random_state = 123
train_size = 0.8

xtrain, xtest, ytrain, ytest = train_test_split(
    np.hstack([X, X_red]), y_log, train_size=train_size, random_state=random_state
)
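Note that np.hstack returns a plain ndarray, which discards the column names (this is why the feature-importance cell near the end needs explicit names). An alternative that keeps them, with the pca_i names as hypothetical placeholders:

# build a named dataframe instead of a raw array so downstream cells
# can still read feature names off the split data
X_red_df = pd.DataFrame(
    X_red, index=X.index,
    columns=[f"pca_{i}" for i in range(X_red.shape[1])]
)
xtrain, xtest, ytrain, ytest = train_test_split(
    pd.concat([X, X_red_df], axis=1), y_log,
    train_size=train_size, random_state=random_state
)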

Step VI - Normalize Data

from sklearn.preprocessing import StandardScaler, Normalizer

# Normalize inputs 
x_transformer = StandardScaler(with_mean=True, with_std=True)
# x_transformer = Normalizer(norm='max')

xtrain_norm = x_transformer.fit_transform(xtrain)
xtest_norm = x_transformer.transform(xtest)


# Normalize Outputs
y_transformer = StandardScaler(with_mean=True, with_std=True)

ytrain_norm = y_transformer.fit_transform(ytrain)
ytest_norm = y_transformer.transform(ytest)
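To report predictions in physical units, both transforms must be undone in reverse order (the standardization from this step, then the log from Step IV):

# sketch: map normalized predictions back to the original output scale;
# ypred_norm stands in for any model's normalized predictions
ypred_norm = ytest_norm  # placeholder for illustration
ypred_log = y_transformer.inverse_transform(ypred_norm)
ypred = np.exp(ypred_log)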

Step VII - ML Models

Model I - Random Forest Regression

rf_model = train_rf_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = rf_model.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.344
MSE: 0.308
RMSE: 0.555
R2: 0.654 
Time: 0.509 seconds
plot_mo_stats(ytest_norm, ypred_red)
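The prediction-and-scoring boilerplate above repeats for every model below; a small helper such as this sketch would keep it consistent (the notebook inlines it each time):

def evaluate(model, xtest, ytest):
    """Time the predictions and print averaged multioutput regression stats."""
    t0 = time.time()
    ypred = model.predict(xtest)
    elapsed = time.time() - t0
    mae = mean_absolute_error(ytest, ypred, multioutput='uniform_average')
    mse = mean_squared_error(ytest, ypred, multioutput='uniform_average')
    r2 = r2_score(ytest, ypred, multioutput='uniform_average')
    print(f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {np.sqrt(mse):.3f}\n"
          f"R2: {r2:.3f}\nTime: {elapsed:.3f} seconds")
    return ypred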

Model II - Generalized Linear Model

gl_model = train_glm_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = gl_model.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.576
MSE: 0.667
RMSE: 0.817
R2: 0.267 
Time: 0.00256 seconds
plot_mo_stats(ytest_norm, ypred_red)

Model III - MLPRegressor

from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(
    hidden_layer_sizes=150,
    activation='relu', 
    batch_size=100, 
    max_iter=10_000,
    solver='adam', 
    verbose=False,
    warm_start=True,
    learning_rate='adaptive'
)

t0 = time.time()
mlp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 25.624 seconds
# Predictions
t0 = time.time()
ypred_red = mlp_model.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.462
MSE: 0.452
RMSE: 0.672
R2: 0.479 
Time: 0.00389 seconds
plot_mo_stats(ytest_norm, ypred_red)
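MLPRegressor records the training loss per iteration, which makes for a quick convergence check:

import matplotlib.pyplot as plt

# loss_curve_ is populated by MLPRegressor during fit
plt.plot(mlp_model.loss_curve_)
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.show()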

Model IV - Gaussian Process Regressor

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern
# composite kernel: amplitude-scaled Matérn-5/2 for smooth structure,
# plus a white-noise term to absorb observation noise
kernel = ConstantKernel() * Matern(nu=2.5) + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,           # kernel function (very important)
    normalize_y=False,       # outputs were already standardized in Step VI
    random_state=123,        # reproducibility
    n_restarts_optimizer=0,  # restarts help avoid local minima; 0 keeps training fast
)


# train GP Model
t0 = time.time()
gp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0


print(
    f"Training Time: {t1:.3} seconds"
)
Training Time: 3.8e+02 seconds
# Predictions
t0 = time.time()
ypred_red = gp_model.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.404
MSE: 0.397
RMSE: 0.630
R2: 0.554 
Time: 0.0698 seconds
plot_mo_stats(ytest_norm, ypred_red)
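Unlike the other models, the GP also returns predictive uncertainty, which none of the averaged stats above capture:

# the predictive standard deviation comes for free with a GP
ymean, ystd = gp_model.predict(xtest_norm, return_std=True)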

import matplotlib.pyplot as plt

importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# xtest_norm is a numpy array (np.hstack dropped the column names), so it
# has no .columns attribute; rebuild the names from the original dataframe
# plus placeholder names for the PCA components
feature_names = list(X.columns) + [f"pca_{i}" for i in range(X_red.shape[1])]
labels = [feature_names[i] for i in indices]
n_features = len(feature_names)

# Print the feature ranking
print("Feature ranking:")

for f in range(n_features):
    print(f"{f + 1}. feature {labels[f]} ({importances[indices[f]]:f})")

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), labels, rotation='vertical')
plt.xlim([-1, n_features])
plt.show()

Random Forest with Bagging

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor


rf_model = RandomForestRegressor(
    bootstrap=True, criterion='mse', max_depth=500,
    max_features=4, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=4,
    min_weight_fraction_leaf=0.0, n_estimators=1_000,
    n_jobs=-1, oob_score=False, random_state=123, verbose=0,
    warm_start=False
)
bagged_rf_model = BaggingRegressor(
    base_estimator=rf_model,  # each bagging member is itself a 1,000-tree forest
)
t0 = time.time()
bagged_rf_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 65.893 seconds
# Predictions
t0 = time.time()
ypred_red = bagged_rf_model.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.647
MSE: 0.751
RMSE: 0.867
R2: 0.239 
Time: 6.11 seconds
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred_red, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_red, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_red, multioutput='raw_values')
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')

# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')

# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')

# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')


plt.tight_layout()
plt.show()


XGBoost

import xgboost
from xgboost import plot_importance
from sklearn.multioutput import MultiOutputRegressor


# XGBRFRegressor is XGBoost's random-forest mode; it predicts a single target,
# so MultiOutputRegressor wraps it to fit one forest per output (depth level)
xgbm_model = xgboost.XGBRFRegressor(
    n_estimators=500,
    random_state=123,
)

gbm_model_mo = MultiOutputRegressor(
    xgbm_model,
    n_jobs=-1
)
t0 = time.time()
gbm_model_mo.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 16.831 seconds
# Predictions
t0 = time.time()
ypred_red = gbm_model_mo.predict(xtest_norm)
t1 = time.time() - t0
#ypred = pca_model.inverse_transform(ypred_red)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 0.600
MSE: 0.659
RMSE: 0.812
R2: 0.329 
Time: 0.846 seconds
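The plot_importance import above can still be put to use on any single forest inside the multioutput wrapper, e.g. the first depth level (a sketch; max_num_features=15 is an arbitrary choice):

import matplotlib.pyplot as plt

# MultiOutputRegressor exposes the per-output fitted models via estimators_
plot_importance(gbm_model_mo.estimators_[0], max_num_features=15)
plt.show()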