Control Group - Full Pipeline¶
Steps:¶
- Load Control Group
- Coordinate Transform: lat, lon -> x, y, z
- Circular Transform: doy -> sin, cos
- PCA Transform the high-dimensional profiles
- Log Transform the outputs
- Split Data (train, test)
- Normalize inputs and outputs
- Apply ML Models
import sys
import time
import numpy as np
import pandas as pd
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')
from data.make_dataset import DataLoader
from features.pca_features import transform_all
from models.baseline import train_glm_model, train_rf_model
from features.build_features import times_2_cycles, geo_2_cartesian
from visualization.visualize import plot_mo_stats
%load_ext autoreload
%autoreload 2
Step I - Load Data¶
Input Data¶
dataloader = DataLoader()
X = dataloader.load_control_data('na')
X = X[dataloader.core_vars + dataloader.loc_vars]
X.head()
High-Dimensional Variables¶
X_temp = dataloader.load_control_temperature('na')
X_dens = dataloader.load_control_density('na')
X_sal = dataloader.load_control_salinity('na')
X_spicy = dataloader.load_control_spicy('na')
X_temp = X_temp.drop(dataloader.meta_vars, axis=1)
X_dens = X_dens.drop(dataloader.meta_vars, axis=1)
X_sal = X_sal.drop(dataloader.meta_vars, axis=1)
X_spicy = X_spicy.drop(dataloader.meta_vars, axis=1)
X_dens.head()
Labels¶
y = dataloader.load_control_ouputs('na')
y = y.drop(dataloader.meta_vars, axis=1)
X.shape
Step II - Coordinate Transform: lat, lon -> x, y, z¶
X = geo_2_cartesian(X)
X.describe()
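For reference, geo_2_cartesian lives in features.build_features; a minimal sketch of the transform it presumably applies, assuming degree-valued 'lat'/'lon' columns (hypothetical names) projected onto the unit sphere:
def geo_2_cartesian_sketch(df):
    # hypothetical stand-in: map (lat, lon) in degrees onto the unit sphere
    df = df.copy()
    lat = np.deg2rad(df.pop('lat'))
    lon = np.deg2rad(df.pop('lon'))
    df['x'] = np.cos(lat) * np.cos(lon)
    df['y'] = np.cos(lat) * np.sin(lon)
    df['z'] = np.sin(lat)
    return df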
Step III - DOY Transform: doy -> sin, cos¶
times = ['doy']
X = times_2_cycles(X, times)
X.describe()
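The circular encoding maps each day-of-year onto the unit circle, so day 365 and day 1 end up adjacent rather than a full year apart. A sketch of what times_2_cycles plausibly does, assuming a single shared period:
def times_2_cycles_sketch(df, times, period=365.25):
    # hypothetical stand-in: encode each cyclic column as a (sin, cos) pair
    df = df.copy()
    for col in times:
        theta = 2 * np.pi * df.pop(col) / period
        df[f'{col}_sin'] = np.sin(theta)
        df[f'{col}_cos'] = np.cos(theta)
    return df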
PCA Transform Spectrum¶
X_red, pca_model = transform_all(X_temp, X_sal, X_dens, X_spicy)
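transform_all is project code; a plausible sketch of what it does, assuming one PCA per depth-resolved variable and a hypothetical n_components:
from sklearn.decomposition import PCA

def transform_all_sketch(*profiles, n_components=10):
    # hypothetical stand-in: reduce each profile block with its own PCA
    # and concatenate the reduced blocks column-wise
    models, blocks = [], []
    for block in profiles:
        pca = PCA(n_components=n_components)
        blocks.append(pca.fit_transform(block))
        models.append(pca)
    return np.hstack(blocks), models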
Step IV - Log Transform: y -> log(y)¶
y_log = np.log(y)
Step V - Train-Test Split¶
from sklearn.model_selection import train_test_split
random_state = 123
train_size = 0.8
xtrain, xtest, ytrain, ytest = train_test_split(
    np.hstack([X, X_red]), y_log, train_size=train_size, random_state=random_state
)
Step VI - Normalize Data¶
from sklearn.preprocessing import StandardScaler, Normalizer
# Normalize inputs
x_transformer = StandardScaler(with_mean=True, with_std=True)
# x_transformer = Normalizer(norm='max')
xtrain_norm = x_transformer.fit_transform(xtrain)
xtest_norm = x_transformer.transform(xtest)
# Normalize Outputs
y_transformer = StandardScaler(with_mean=True, with_std=True)
ytrain_norm = y_transformer.fit_transform(ytrain)
ytest_norm = y_transformer.transform(ytest)
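All models below are trained and scored in this standardized log space. To report predictions in the original units, both transforms have to be undone; a sketch (not used in the evaluations below):
def to_original_units(ypred_norm):
    # undo the output standardization, then the Step IV log transform
    return np.exp(y_transformer.inverse_transform(ypred_norm))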
Step VII - ML Models¶
Model I - Random Forest Regression¶
rf_model = train_rf_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = rf_model.predict(xtest_norm)
t1 = time.time() - t0
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
Model II - Generalized Linear Model¶
gl_model = train_glm_model(xtrain_norm, ytrain_norm)
# Predictions
t0 = time.time()
ypred_red = gl_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
Model III - MLPRegressor¶
from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(
hidden_layer_sizes=150,
activation='relu',
batch_size=100,
max_iter=10_000,
solver='adam',
verbose=False,
warm_start=True,
learning_rate='adaptive'
)
t0 = time.time()
mlp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = mlp_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
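With max_iter=10_000 the MLP is free to overfit the training set; scikit-learn's built-in early stopping is a cheap safeguard. A variant (defined only, not used in the results above):
mlp_es = MLPRegressor(
    hidden_layer_sizes=150,
    activation='relu',
    batch_size=100,
    max_iter=10_000,
    solver='adam',
    learning_rate='adaptive',
    early_stopping=True,       # hold out part of the training data ...
    validation_fraction=0.1,   # ... and stop when its score stops improving
    n_iter_no_change=20,       # patience, in epochs
)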
Model IV - Gaussian Process Regressor¶
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern
# define kernel function
kernel = ConstantKernel() * Matern(nu=2.5) + WhiteKernel()
# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=False,        # outputs were already standardized in Step VI
    random_state=123,         # reproducibility
    n_restarts_optimizer=0,   # >0 would restart the optimizer from new initializations to escape local minima (slower)
)
# train GP Model
t0 = time.time()
gp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
    f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = gp_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
plot_mo_stats(ytest_norm, ypred_red)
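Unlike the other models, the GP also provides predictive uncertainty; predict exposes it via return_std:
# point predictions plus per-point predictive standard deviation
ymu, ystd = gp_model.predict(xtest_norm, return_std=True)
print(f"mean predictive std: {ystd.mean():.3f}")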
import matplotlib.pyplot as plt
%matplotlib inline

importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# xtest_norm is a numpy array, so rebuild the feature names by hand:
# the columns of X followed by the PCA components (hypothetical names)
labels = list(X.columns) + [f"pca_{i}" for i in range(X_red.shape[1])]
labels = [labels[i] for i in indices]
n_features = xtrain_norm.shape[1]
# Print the feature ranking
print("Feature ranking:")
for f in range(n_features):
    print("%d. feature %s (%f)" % (f + 1, labels[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), labels, rotation='vertical')
plt.xlim([-1, n_features])
plt.show()
RF w. Bagging¶
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
rf_model = RandomForestRegressor(
    n_estimators=1_000,
    max_depth=500,
    max_features=4,
    min_samples_split=4,
    n_jobs=-1,
    random_state=123,
)
# bagging over an already-bagged model: each base estimator is a full random forest
bagged_rf_model = BaggingRegressor(
    base_estimator=rf_model,
)
t0 = time.time()
bagged_rf_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = bagged_rf_model.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred_red, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_red, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_red, multioutput='raw_values')
plt.style.use('seaborn')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')
# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')
# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')
# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')
plt.tight_layout()
plt.show()
XGBoost¶
import xgboost
from xgboost import plot_importance
from sklearn.multioutput import MultiOutputRegressor
xgbm_model = xgboost.XGBRFRegressor(
n_estimators=500,
random_state=123,
)
gbm_model_mo = MultiOutputRegressor(
xgbm_model,
n_jobs=-1
)
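Note: XGBRFRegressor is XGBoost's random-forest mode rather than its usual gradient boosting, and it fits a single target at a time, so MultiOutputRegressor trains one independent copy per output column (per depth), parallelized with n_jobs=-1.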
t0 = time.time()
gbm_model_mo.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred_red = gbm_model_mo.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred_red, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_red, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_red, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
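Each per-depth estimator inside the wrapper is a standalone XGBoost model, so the plot_importance helper imported above can be applied to any one of them, e.g. the first depth (illustrative only):
# feature importances for the model fitted to the first output (depth)
plot_importance(gbm_model_mo.estimators_[0], max_num_features=15)
plt.show()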