Skip to content

Revisit

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time as time
import statsmodels.api as sm
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
!ls /media/disk/erc/papers/2019_ML_OCN/data/august
BBP_OUTPUT_NA.csv   MATRIX_SPICINESS_NA.txt  Y_OUTPUT_LOG_NA.csv
MATRIX_DENS_NA.txt  MATRIX_TEMP_NA.txt
MATRIX_PSAL_NA.txt  X_INPUT_NA.csv
data_path = '/media/disk/erc/papers/2019_ML_OCN/data/ana_temp_data_NA/'

X1 = pd.read_csv(f"{data_path}X_INPUT_NA.csv")

X_meta_cols = [
    'wmo', 'n_cycle', 'lat', 'lon', 'doy',
#     'RHO_WN_412', 'RHO_WN_443', 'RHO_WN_490', 'RHO_WN_555', 'RHO_WN_670'
]

X_meta = X1[X_meta_cols]
X1 = X1.drop(X_meta_cols, axis=1)
def plot_depth_errors(r2, mae, mse, rmse):
    # Plots
    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

    # R2 Values
    ax[0,0].plot(r2)
    ax[0,0].set_xlabel('Depths (Pressure)')
    ax[0,0].set_ylabel('R2')

    # MAE
    ax[0,1].plot(mae)
    ax[0,1].set_xlabel('Depths (Pressure)')
    ax[0,1].set_ylabel('MAE')

    # MSE
    ax[1,0].plot(mse)
    ax[1,0].set_xlabel('Depths (Pressure)')
    ax[1,0].set_ylabel('MSE')

    # RMSE
    ax[1,1].plot(rmse)
    ax[1,1].set_xlabel('Depths (Pressure)')
    ax[1,1].set_ylabel('RMSE')


    plt.tight_layout()
    plt.show()

Inputs I

For these inputs we have the following variables:

  • Sea Level Anomaly (sla)
  • Photosyntetically Available Radiation (PAR)
  • The spectral values
    • 412, 443, 490, 555, 670
  • Mixed Layer Depth (MLD)
  • Longitude (lon)
  • Latitude (lat)
  • Day of the Year (doy)
X1.shape
(3113, 8)

Inputs II

For these inputs, we will be taking the raw values for temperature, salinity, density and spiciness.

X_sal = pd.read_csv(f"{data_path}MATRIX_PSAL_NA.txt", sep=' ', header=None)
X_temp = pd.read_csv(f"{data_path}MATRIX_TEMP_NA.txt", sep=' ', header=None)
X_dens = pd.read_csv(f"{data_path}MATRIX_DENS_NA.txt", sep=' ', header=None)
X_spi = pd.read_csv(f"{data_path}MATRIX_SPICINESS_NA.txt", sep=' ', header=None)

X_sal.shape, X_temp.shape, X_dens.shape, X_spi.shape

X2 = pd.concat((X_sal, X_temp, X_dens, X_spi), axis=1)
X = np.hstack([X1, X2])
X.shape
(3113, 1120)
# perform PCA on each of the datasets
n_components = 20
random_state = 123
pca_models = list()
X2 = list()

for iX in [X_sal, X_temp, X_dens, X_spi]:

    # initiate PCA
    pca_clf = PCA(n_components=n_components, random_state=random_state)

    # perform PCA
    pca_clf.fit(iX)

    # transform data
    X2.append(pca_clf.transform(iX))

    # save pca_model
    pca_models.append(pca_clf)


X2 = np.hstack(X2)
X = np.hstack([X1, X2])
X.shape
(3113, 1120)

Outputs

ylog.shape
(3113, 276)
y = pd.read_csv(f"{data_path}BBP_OUTPUT_NA.csv")
y_meta_cols = ['wmo', 'n_cycle']
y_meta = y[y_meta_cols]
y = y.drop(y_meta_cols, axis=1)

# quantile transform
n_quantiles         = 1000
output_distribution = 'normal'
random_state        = 123
subsample           = 2000
y_quant_clf = QuantileTransformer(
    n_quantiles=n_quantiles,
    output_distribution=output_distribution,
    random_state=random_state,
    subsample=subsample
)

ylog = np.log(10* y)
# ylog = y_quant_clf.fit_transform(ylog)


ylog.shape


# ylog = pd.read_csv(f"{data_path}Y_OUTPUT_LOG_NA.csv")
# # y_meta_cols = ['wmo', 'n_cycle']
# # y_meta = ylog[y_meta_cols]
# # ylog = ylog.drop(ylog, axis=1)

# # ylog.shape
(3113, 276)

Train-Test Split

# training and testing split
train_size = 0.8
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(X, ylog, train_size=train_size, random_state=random_state)

Normalize Data

# Normalize Inputs
x_normalizer = StandardScaler(with_mean=True, with_std=True)
xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)

# Normalize Outputs
y_normalizer = StandardScaler(with_std=False)
ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)

# PCA Transform
n_components = 50
random_state = 123

# initialize PCA model
pca_x_clf = PCA(n_components=n_components, random_state=random_state)

# fit and transform data
xtrain_norm = pca_x_clf.fit_transform(xtrain_norm)
xtest_norm = pca_x_clf.transform(xtest_norm)

# PCA Transform
n_components = 50
random_state = 123

# initialize PCA model
pca_y_clf = PCA(n_components=n_components, random_state=random_state)

# fit and transform data
ytrain_norm = pca_y_clf.fit_transform(ytrain_norm)

# # Normalize Outputs
# y_normalizer = StandardScaler(with_std=False)
# ytrain_norm = y_normalizer.fit_transform(ytrain)
# ytest_norm = y_normalizer.transform(ytest)

ML I - Linear Regression

linear_model = LinearRegression()
t0 = time.time()
linear_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
ypred_red = linear_model.predict(xtest_norm)
ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats
mae = mean_absolute_error(ypred_original, ytest_norm, multioutput='uniform_average')
mse = mean_squared_error(ypred_original, ytest_norm, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ypred_original, ytest_norm , multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)


fig, ax = plt.subplots()
#check this
plt.scatter(ytest_norm, ypred_original,alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2,2)
plt.show()
MAE: 0.236
MSE: 0.115
RMSE: 0.340
R2: -0.764 
Time: 0.0152 seconds
# Get Stats
mae_raw = mean_absolute_error(ypred_original, ytest_norm, multioutput='raw_values')
mse_raw = mean_squared_error(ypred_original, ytest_norm, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ypred_original, ytest_norm,multioutput='raw_values')

plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)

Model II - Random Forest

from sklearn.ensemble import RandomForestRegressor


rf_model = RandomForestRegressor(
    n_estimators=1000, 
    criterion='mse',
    n_jobs=-1,
    random_state=123,
    warm_start=True,
    verbose=0
)


t0 = time.time()
rf_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 13.361 seconds
# Predictions
t0 = time.time()
ypred_red = rf_model.predict(xtest_norm)
t1 = time.time() - t0

ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats
mae = mean_absolute_error(ypred_original, ytest_norm, multioutput='uniform_average')
mse = mean_squared_error(ypred_original, ytest_norm, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ypred_original, ytest_norm,multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)


fig, ax = plt.subplots()
#check this
plt.scatter(ytest_norm, ypred_original, alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2,2)
plt.show()
MAE: 0.154
MSE: 0.057
RMSE: 0.238
R2: 0.368 
Time: 0.307 seconds
# Get Stats
mae_raw = mean_absolute_error(ypred_original, ytest_norm, multioutput='raw_values')
mse_raw = mean_squared_error(ypred_original, ytest_norm, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ypred_original, ytest_norm, multioutput='raw_values')

plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)

ML III - NN

from sklearn.neural_network import MLPRegressor
mlp_model = MLPRegressor(
    hidden_layer_sizes=150,
    activation='relu', 
    batch_size=200, 
    max_iter=5000,
    solver='adam', 
    verbose=False,
    warm_start=False,
    learning_rate='adaptive'
)

t0 = time.time()
mlp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 18.231 seconds
# Predictions
t0 = time.time()
ypred_red = mlp_model.predict(xtest_norm)
t1 = time.time() - t0

ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats
mae = mean_absolute_error(ypred_original, ytest, multioutput='uniform_average')
mse = mean_squared_error(ypred_original, ytest, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ypred_original, ytest,multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
print(r2)


fig, ax = plt.subplots()
#check this
plt.scatter(ytest, ypred_original,alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2,2)
plt.show()
MAE: 0.197
MSE: 0.086
RMSE: 0.294
R2: 0.214 
Time: 0.00242 seconds
0.21389702126413807
# Get Stats
mae_raw = mean_absolute_error(ypred_original, ytest, multioutput='raw_values')
mse_raw = mean_squared_error(ypred_original, ytest, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ypred_original, ytest,multioutput='raw_values')

plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)