Skip to content

Trial2 gps

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time as time
import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
!ls /media/disk/erc/papers/2019_ML_OCN/data
raph_temp_data_NA
data_path = '/media/disk/erc/papers/2019_ML_OCN/data/raph_temp_data_NA/'

# Import data
X = pd.read_csv(f"{data_path}X_INPUT_LOG_PCA_NA.csv")


X.describe()
wmo n_cycle sla PAR RHO_WN_412 RHO_WN_443 RHO_WN_490 RHO_WN_555 RHO_WN_670 1 2 3 4 5 6 7 8 9 10
count 3.181000e+03 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03
mean 6.790443e+06 110.673688 -1.299802 42.429057 -1.817285 -1.864362 -1.914530 -2.221160 -3.063053 0.000000 4.717593e-15 2.029994e-14 1.508200e-14 2.306974e-14 5.736164e-15 -2.426701e-14 1.656518e-14 -7.112128e-15 -2.546428e-14
std 4.610381e+05 96.074956 6.367596 13.414969 0.250583 0.218535 0.175765 0.158956 0.228308 75.273124 1.649043e+01 9.496344e+00 5.964048e+00 3.453349e+00 3.331140e+00 2.827899e+00 2.004328e+00 1.406174e+00 1.329474e+00
min 3.902122e+06 1.000000 -47.530300 7.658160 -3.103068 -3.000000 -2.737900 -2.659280 -5.443568 -106.416803 -4.247265e+01 -2.240745e+01 -1.315154e+01 -1.076247e+01 -1.592344e+01 -1.017540e+01 -8.439650e+00 -7.188060e+00 -6.281040e+00
25% 6.901472e+06 34.000000 -5.063600 33.044000 -1.958607 -1.993085 -2.000000 -2.322931 -3.177302 -42.257307 -9.416668e+00 -4.582448e+00 -4.458510e+00 -1.988276e+00 -1.728534e+00 -1.456275e+00 -1.037335e+00 -7.316116e-01 -8.489055e-01
50% 6.901516e+06 78.000000 -1.374300 44.421900 -1.826773 -1.868150 -1.913993 -2.228608 -3.004121 -18.412567 -4.585460e+00 -1.930680e+00 -1.859815e+00 9.205945e-02 -3.006696e-01 1.885095e-01 8.494018e-02 6.395386e-02 -8.041814e-02
75% 6.901526e+06 170.000000 2.724100 53.844000 -1.661490 -1.734515 -1.814458 -2.132773 -2.942182 19.239909 1.106912e+01 4.226796e+00 2.172497e+00 1.752604e+00 1.491635e+00 1.496154e+00 1.139540e+00 7.388190e-01 8.090223e-01
max 6.903273e+06 379.000000 29.168600 63.337100 -1.200247 -1.283895 -1.271788 -1.440369 -2.276760 218.944064 4.854796e+01 4.337788e+01 2.713946e+01 2.007353e+01 2.145210e+01 1.049357e+01 1.378756e+01 8.047028e+00 6.355754e+00

X = X.iloc[:, 2:]
y = pd.read_csv(f"{data_path}Y_OUTPUT_LOG_NA.csv")
y = y.iloc[:, 2:]

sla PAR RHO_WN_412 RHO_WN_443 RHO_WN_490 RHO_WN_555 RHO_WN_670 1 2 3 4 5 6 7 8 9 10
count 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03 3.181000e+03
mean -1.299802 42.429057 -1.817285 -1.864362 -1.914530 -2.221160 -3.063053 0.000000 4.717593e-15 2.029994e-14 1.508200e-14 2.306974e-14 5.736164e-15 -2.426701e-14 1.656518e-14 -7.112128e-15 -2.546428e-14
std 6.367596 13.414969 0.250583 0.218535 0.175765 0.158956 0.228308 75.273124 1.649043e+01 9.496344e+00 5.964048e+00 3.453349e+00 3.331140e+00 2.827899e+00 2.004328e+00 1.406174e+00 1.329474e+00
min -47.530300 7.658160 -3.103068 -3.000000 -2.737900 -2.659280 -5.443568 -106.416803 -4.247265e+01 -2.240745e+01 -1.315154e+01 -1.076247e+01 -1.592344e+01 -1.017540e+01 -8.439650e+00 -7.188060e+00 -6.281040e+00
25% -5.063600 33.044000 -1.958607 -1.993085 -2.000000 -2.322931 -3.177302 -42.257307 -9.416668e+00 -4.582448e+00 -4.458510e+00 -1.988276e+00 -1.728534e+00 -1.456275e+00 -1.037335e+00 -7.316116e-01 -8.489055e-01
50% -1.374300 44.421900 -1.826773 -1.868150 -1.913993 -2.228608 -3.004121 -18.412567 -4.585460e+00 -1.930680e+00 -1.859815e+00 9.205945e-02 -3.006696e-01 1.885095e-01 8.494018e-02 6.395386e-02 -8.041814e-02
75% 2.724100 53.844000 -1.661490 -1.734515 -1.814458 -2.132773 -2.942182 19.239909 1.106912e+01 4.226796e+00 2.172497e+00 1.752604e+00 1.491635e+00 1.496154e+00 1.139540e+00 7.388190e-01 8.090223e-01
max 29.168600 63.337100 -1.200247 -1.283895 -1.271788 -1.440369 -2.276760 218.944064 4.854796e+01 4.337788e+01 2.713946e+01 2.007353e+01 2.145210e+01 1.049357e+01 1.378756e+01 8.047028e+00 6.355754e+00
y.describe()
1 3 5 7 9 11 13 15 17 19 ... 955 960 965 970 975 980 985 990 995 1000
count 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 ... 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000 3181.000000
mean -2.900663 -2.898407 -2.895640 -2.891294 -2.886683 -2.883220 -2.880841 -2.879046 -2.878439 -2.878442 ... -3.591383 -3.591066 -3.589926 -3.588963 -3.587449 -3.589363 -3.591955 -3.596240 -3.599698 -3.602358
std 0.358795 0.361310 0.363517 0.365821 0.366232 0.366527 0.366901 0.366984 0.366933 0.365515 ... 0.183568 0.180024 0.180462 0.178927 0.179459 0.183057 0.183960 0.187008 0.189023 0.189302
min -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 ... -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000 -5.000000
25% -3.193820 -3.200659 -3.200659 -3.200659 -3.193820 -3.200659 -3.196231 -3.200659 -3.200659 -3.200659 ... -3.698970 -3.698970 -3.698970 -3.698970 -3.698970 -3.698970 -3.698970 -3.706795 -3.717453 -3.721246
50% -2.870632 -2.869666 -2.864867 -2.856985 -2.855426 -2.847712 -2.844664 -2.838632 -2.834813 -2.835647 ... -3.602060 -3.602060 -3.602060 -3.602060 -3.602060 -3.602060 -3.602060 -3.619789 -3.619789 -3.619789
75% -2.617983 -2.614394 -2.609065 -2.605548 -2.600326 -2.595166 -2.591760 -2.591760 -2.591760 -2.590913 ... -3.455932 -3.455932 -3.455932 -3.455932 -3.455932 -3.455932 -3.455932 -3.455932 -3.455932 -3.455932
max -1.937794 -1.917933 -1.845044 -1.831797 -1.862329 -1.843754 -1.843754 -1.843754 -1.843754 -1.877456 ... -1.730487 -2.202145 -2.158015 -2.232432 -2.402766 -2.311306 -2.311306 -2.311306 -2.311306 -2.311306

8 rows × 276 columns

# training and testing split
train_size = 0.8
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, random_state=random_state)

Scale Data

Method I - Manually

# Standardize Inputs (per dimension)
x_mean, x_std = xtrain.mean(axis=0), xtrain.std(axis=0)

xtrain_norm = (xtrain - x_mean) / x_std
xtest_norm = (xtest - x_mean) / x_std

# Normalize Outputs
y_mean = ytrain.mean(axis=0)

ytrain_norm = ytrain - y_mean
ytest_norm = ytest - y_mean

Method II - Scikit-Learn

# Normalize Inputs
x_normalizer = StandardScaler(with_mean=True, with_std=True)
xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)

# Normalize Outputs
y_normalizer = StandardScaler(with_std=False)
ytrain_norm = y_normalizer.fit_transform(xtrain)
ytest_norm = y_normalizer.transform(xtest)

Model I - Standard GPR

# define kernel function
kernel = ConstantKernel() * RBF() + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=10,  # good practice (avoids local minima)
)


# train GP Model
t0 = time.time()
gp_model.fit(xtrain, ytrain)
t1 = time.time() - t0


print(
    f"Training Time: {t1:.3} seconds"
)
# Predictions
t0 = time.time()
ypred, ystd = gp_model.predict(xtest, return_std=True)
t1 = time.time() - t0


print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTesting Time: {t1:.3} seconds"
)

Model II - GPR w. PCA Transform

# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=10,  # good practice (avoids local minima)
)

# Define target transformer
pca_model = PCA(n_components=10)

# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
    regressor=gp_model,
    transformer=pca_model,   # same number of components as informative
    check_inverse=False                 # PCA is not a direct inverse transformation

)

# train GP Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 1152.199 seconds
# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 1.613
MSE: 14.251
RMSE: 3.775
R2: 0.409 
Time: 0.0616 seconds
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')

# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')

# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')

# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')


plt.tight_layout()
plt.show()
fig, ax = plt.subplots()

plt.scatter(ypred, ytest_norm)
plt.show()

Model III - MultiTask GP w. PCA Transform

# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=10,  # good practice (avoids local minima)
)

# Define Multioutput function
gp_model_multi = MultiOutputRegressor(
    gp_model, 
    n_jobs=-1,              # Number of cores to use to parallelize the training
)

# Define target transformer
pca_model = PCA(n_components=10)

# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
    regressor=gp_model_multi,
    transformer=pca_model,   # same number of components as informative
    check_inverse=False                 # PCA is not a direct inverse transformation

)


# Fit Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0


print(
    f"Training Time: {t1:.3f} seconds"
)
Training Time: 2225.352 seconds
# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 3.463
MSE: 321.103
RMSE: 17.919
R2: 0.555 
Time: 2.4 seconds
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')

# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')

# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')

# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')


plt.tight_layout()
plt.show()
fig, ax = plt.subplots()

plt.scatter(ypred, ytest_norm)
plt.show()





# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred)

print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
MAE: 5.000
MSE: 349.004
RMSE: 18.682
R2: 0.138 
Time: 1.44 seconds


ypred.shape, ytest.shape
((2181, 278), (2181, 278))
results_model = sm.OLS(ypred[:, 0], ytest[:, 0])
results = results_model.fit()
print(results.summary())
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-25-56c5bfebb168> in <module>
----> 1 results_model = sm.OLS(ypred[:, 0], ytest[:, 0])
      2 results = results_model.fit()
      3 print(results.summary())

/usr/local/miniconda3/envs/ml4ocn/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2925             if self.columns.nlevels > 1:
   2926                 return self._getitem_multilevel(key)
-> 2927             indexer = self.columns.get_loc(key)
   2928             if is_integer(indexer):
   2929                 indexer = [indexer]

/usr/local/miniconda3/envs/ml4ocn/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2655                                  'backfill or nearest lookups')
   2656             try:
-> 2657                 return self._engine.get_loc(key)
   2658             except KeyError:
   2659                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), 0)' is an invalid key


mod = sm.OLS(spector_data.endog, spector_data.exog)

#load data
INPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/X_INPUT_LOG_PCA_NA.csv")
INPUT.head()

X = INPUT
del X['wmo']
del X['n_cycle']
print("INPUTS")
print(X.head())

OUTPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/Y_OUTPUT_LOG_NA.csv")
OUTPUT.head()

y = OUTPUT
del y['wmo']
del y['n_cycle']
print("OUTPUTS")
print(y.head())

# Training and Testing
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=2500,
random_state=123)

print(xtrain.shape)
print(ytest.shape)

ytest = np.array(ytest)

MEAN_X_TRAIN = xtrain.apply(np.mean,0)
#print(MEAN_X_TRAIN)
SD_X_TRAIN = xtrain.apply(np.std,0)
#print(SD_X_TRAIN)

xtrain = np.array(xtrain)
xtest = np.array(xtest)

#center and reduce the input data
#nb of columns
n_col = xtrain.shape[1]

xtrain_cr = np.copy(xtrain)
xtest_cr= np.copy(xtest)
for i in np.arange(0,n_col,1):
     xtrain_cr[:,i] = (xtrain[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]
     xtest_cr[:,i] = (xtest[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]


#Center the outputs (per column)
n_coly = ytrain.shape[1]
MEAN_Y_TRAIN = ytrain.apply(np.mean,0)

ytrain = np.array(ytrain)
ytrain_c = np.copy(ytrain)
for i in np.arange(0,n_coly,1):
     ytrain_c[:,i] = ytrain[:,i] - MEAN_Y_TRAIN[i]