Revisit
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
import time as time
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
!ls /media/disk/erc/papers/2019_ML_OCN/data/august
# Root directory of the North-Atlantic (NA) temporary analysis data.
data_path = '/media/disk/erc/papers/2019_ML_OCN/data/ana_temp_data_NA/'
# Inputs I: satellite / surface predictor table, one row per float profile.
X1 = pd.read_csv(f"{data_path}X_INPUT_NA.csv")
# Identifier and location columns kept aside as metadata, not used as features.
# (The commented RHO_WN_* reflectance columns are currently retained in X1.)
X_meta_cols = [
    'wmo', 'n_cycle', 'lat', 'lon', 'doy',
    # 'RHO_WN_412', 'RHO_WN_443', 'RHO_WN_490', 'RHO_WN_555', 'RHO_WN_670'
]
X_meta = X1[X_meta_cols]
X1 = X1.drop(X_meta_cols, axis=1)
def plot_depth_errors(r2, mae, mse, rmse):
    """Plot per-depth regression diagnostics on a 2x2 grid.

    Parameters
    ----------
    r2, mae, mse, rmse : array-like
        One score per output depth (pressure level), e.g. the sklearn
        metrics computed with ``multioutput='raw_values'``.

    Displays the figure via ``plt.show()``; returns nothing.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
    # Same x-axis (depth/pressure index) on every panel; only the metric
    # plotted and its y-label change — loop instead of four copies.
    metrics = ((r2, 'R2'), (mae, 'MAE'), (mse, 'MSE'), (rmse, 'RMSE'))
    for ax, (values, label) in zip(axes.ravel(), metrics):
        ax.plot(values)
        ax.set_xlabel('Depths (Pressure)')
        ax.set_ylabel(label)
    plt.tight_layout()
    plt.show()
Inputs I¶
For these inputs we have the following variables:
- Sea Level Anomaly (sla)
- Photosynthetically Available Radiation (PAR)
- The spectral values
- 412, 443, 490, 555, 670
- Mixed Layer Depth (MLD)
- Longitude (lon)
- Latitude (lat)
- Day of the Year (doy)
X1.shape
Inputs II¶
For these inputs, we will be taking the raw values for temperature, salinity, density and spiciness.
# Inputs II: Argo profile matrices (space-separated, no header) —
# presumably one row per profile and one column per depth level; TODO confirm.
X_sal = pd.read_csv(f"{data_path}MATRIX_PSAL_NA.txt", sep=' ', header=None)
X_temp = pd.read_csv(f"{data_path}MATRIX_TEMP_NA.txt", sep=' ', header=None)
X_dens = pd.read_csv(f"{data_path}MATRIX_DENS_NA.txt", sep=' ', header=None)
X_spi = pd.read_csv(f"{data_path}MATRIX_SPICINESS_NA.txt", sep=' ', header=None)
X_sal.shape, X_temp.shape, X_dens.shape, X_spi.shape
# Concatenate the four raw profile matrices column-wise and append them
# to the Inputs I features.
X2 = pd.concat((X_sal, X_temp, X_dens, X_spi), axis=1)
X = np.hstack([X1, X2])
X.shape
# perform PCA on each of the datasets: reduce each raw profile matrix
# (salinity, temperature, density, spiciness) to its first 20 components.
n_components = 20
random_state = 123
pca_models = list()
# NOTE(review): X2 (and X below) are rebound here, discarding the raw
# concatenation built in the previous cell — the PCA features win.
X2 = list()
for iX in [X_sal, X_temp, X_dens, X_spi]:
    # initiate PCA
    pca_clf = PCA(n_components=n_components, random_state=random_state)
    # perform PCA
    pca_clf.fit(iX)
    # transform data
    X2.append(pca_clf.transform(iX))
    # save pca_model (kept so new profiles can be projected later)
    pca_models.append(pca_clf)
# Stack the four 20-component blocks and prepend the Inputs I features.
X2 = np.hstack(X2)
X = np.hstack([X1, X2])
# Targets: particulate backscattering (bbp) profiles; keep ids as metadata.
y = pd.read_csv(f"{data_path}BBP_OUTPUT_NA.csv")
y_meta_cols = ['wmo', 'n_cycle']
y_meta = y[y_meta_cols]
y = y.drop(y_meta_cols, axis=1)
# quantile transform
# NOTE(review): this transformer is configured but never fitted/applied —
# the fit_transform call below is commented out. Dead code unless revived.
n_quantiles = 1000
output_distribution = 'normal'
random_state = 123
subsample = 2000
y_quant_clf = QuantileTransformer(
    n_quantiles=n_quantiles,
    output_distribution=output_distribution,
    random_state=random_state,
    subsample=subsample
)
# Log-transform the (scaled) targets; assumes all bbp values are positive —
# TODO confirm no zeros/negatives in the NA output file.
ylog = np.log(10* y)
# ylog = y_quant_clf.fit_transform(ylog)
ylog.shape
# ylog = pd.read_csv(f"{data_path}Y_OUTPUT_LOG_NA.csv")
# # y_meta_cols = ['wmo', 'n_cycle']
# # y_meta = ylog[y_meta_cols]
# # ylog = ylog.drop(ylog, axis=1)
# # ylog.shape
Train-Test Split¶
# training and testing split: 80/20, fixed seed for reproducibility
train_size = 0.8
random_state = 123
xtrain, xtest, ytrain, ytest = train_test_split(X, ylog, train_size=train_size, random_state=random_state)
Normalize Data¶
# Normalize Inputs (fit on the training set only, then apply to test)
x_normalizer = StandardScaler(with_mean=True, with_std=True)
xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)
# Normalize Outputs — mean-centre only; per-depth variance is preserved
y_normalizer = StandardScaler(with_std=False)
ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)
# PCA Transform (inputs): reduce the feature space to 50 components
n_components = 50
random_state = 123
# initialize PCA model
pca_x_clf = PCA(n_components=n_components, random_state=random_state)
# fit and transform data
xtrain_norm = pca_x_clf.fit_transform(xtrain_norm)
xtest_norm = pca_x_clf.transform(xtest_norm)
# PCA Transform (outputs): models below are trained in this 50-D latent
# space and predictions are mapped back via pca_y_clf.inverse_transform.
n_components = 50
random_state = 123
# initialize PCA model
pca_y_clf = PCA(n_components=n_components, random_state=random_state)
# fit and transform data
# NOTE(review): ytest_norm is NOT projected — test targets stay in the
# centred output space, and predictions are inverse-transformed to match.
ytrain_norm = pca_y_clf.fit_transform(ytrain_norm)
# # Normalize Outputs
# y_normalizer = StandardScaler(with_std=False)
# ytrain_norm = y_normalizer.fit_transform(ytrain)
# ytest_norm = y_normalizer.transform(ytest)
ML I - Linear Regression¶
# ML I - Linear Regression baseline, trained in the PCA output space.
linear_model = LinearRegression()

t0 = time.time()
linear_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

# Predict in the 50-D latent space, then map back to the centred output
# space so predictions are comparable with ytest_norm.
ypred_red = linear_model.predict(xtest_norm)
ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats — sklearn metrics take (y_true, y_pred); the order was swapped
# before, which silently corrupts r2_score (it is not symmetric).
mae = mean_absolute_error(ytest_norm, ypred_original, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_original, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_original, multioutput='uniform_average')

print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
    f" \nTime: {t1:.3} seconds"
)

# Observed (x) vs predicted (y); points near the diagonal are good fits.
fig, ax = plt.subplots()
plt.scatter(ytest_norm, ypred_original, alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2, 2)
plt.show()

# Per-depth error profiles (one score per output column).
mae_raw = mean_absolute_error(ytest_norm, ypred_original, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_original, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_original, multioutput='raw_values')
plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)
Model II - Random Forest¶
from sklearn.ensemble import RandomForestRegressor
# Model II: multi-output random forest; n_jobs=-1 uses all cores.
rf_model = RandomForestRegressor(
    n_estimators=1000,
    # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0;
    # update if the environment is upgraded.
    criterion='mse',
    n_jobs=-1,
    random_state=123,
    # NOTE(review): warm_start only matters across repeated fit() calls
    # with growing n_estimators — confirm it is intentional here.
    warm_start=True,
    verbose=0
)
t0 = time.time()
rf_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
    f"Training Time: {t1:.3f} seconds"
)
# Predictions: latent-space output mapped back to the centred space.
t0 = time.time()
ypred_red = rf_model.predict(xtest_norm)
t1 = time.time() - t0
ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats — (y_true, y_pred) order; previously swapped, which makes
# r2_score wrong (it is not symmetric in its arguments).
mae = mean_absolute_error(ytest_norm, ypred_original, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_original, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_original, multioutput='uniform_average')

print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
    f" \nTime: {t1:.3} seconds"
)

# Observed (x) vs predicted (y) scatter.
fig, ax = plt.subplots()
plt.scatter(ytest_norm, ypred_original, alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2, 2)
plt.show()

# Per-depth error profiles.
mae_raw = mean_absolute_error(ytest_norm, ypred_original, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_original, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_original, multioutput='raw_values')
plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)
ML III - NN¶
from sklearn.neural_network import MLPRegressor
# ML III: single hidden layer of 150 units, trained with Adam.
mlp_model = MLPRegressor(
    hidden_layer_sizes=150,
    activation='relu',
    batch_size=200,
    max_iter=5000,
    solver='adam',
    verbose=False,
    warm_start=False,
    # NOTE(review): learning_rate schedules apply to solver='sgd' only —
    # this setting has no effect with adam; confirm intended.
    learning_rate='adaptive'
)
t0 = time.time()
mlp_model.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
    f"Training Time: {t1:.3f} seconds"
)
# Predictions: latent-space output mapped back to the centred space.
t0 = time.time()
ypred_red = mlp_model.predict(xtest_norm)
t1 = time.time() - t0
ypred_original = pca_y_clf.inverse_transform(ypred_red)

# Get Stats — two fixes relative to the original cell:
# 1) compare against ytest_norm (centred space), not raw ytest:
#    ypred_original lives in the mean-centred space, and the linear/RF
#    sections score against ytest_norm — raw ytest added the removed mean
#    back in as a spurious error.
# 2) (y_true, y_pred) argument order; r2_score is not symmetric.
mae = mean_absolute_error(ytest_norm, ypred_original, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred_original, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred_original, multioutput='uniform_average')

print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
    f" \nTime: {t1:.3} seconds"
)

# Observed (x) vs predicted (y) scatter.
fig, ax = plt.subplots()
plt.scatter(ytest_norm, ypred_original, alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2, 2)
plt.show()

# Per-depth error profiles.
mae_raw = mean_absolute_error(ytest_norm, ypred_original, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred_original, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred_original, multioutput='raw_values')
plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)