Trial1 gps

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time as time
import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

!ls /media/disk/erc/papers/2019_ML_OCN/data

/usr/bin/sh: module: line 1: syntax error: unexpected end of file
/usr/bin/sh: error importing function definition for `BASH_FUNC_module'
/usr/bin/sh: ml: line 1: syntax error: unexpected end of file
/usr/bin/sh: error importing function definition for `BASH_FUNC_ml'
raph_temp_data_NA

data_path = '/media/disk/erc/papers/2019_ML_OCN/data/raph_temp_data_NA/'

# Import data
X = pd.read_csv(f"{data_path}X_INPUT_LOG_PCA_NA.csv").iloc[:, 2:]
y = pd.read_csv(f"{data_path}Y_OUTPUT_LOG_NA.csv").iloc[:, 2:]

# training and testing split
train_size = 0.8
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=train_size, random_state=random_state
)

Scale Data¶

Method I - Manually¶

# Standardize Inputs (per dimension)
x_mean, x_std = xtrain.mean(axis=0), xtrain.std(axis=0)

xtrain_norm = (xtrain - x_mean) / x_std
xtest_norm = (xtest - x_mean) / x_std

# Normalize Outputs
y_mean = ytrain.mean(axis=0)

ytrain_norm = ytrain - y_mean
ytest_norm = ytest - y_mean

Method II - Scikit-Learn¶

# # Normalize Inputs
# x_normalizer = StandardScaler(with_mean=True, with_std=True)
# xtrain_norm = x_normalizer.fit_transform(xtrain)
# xtest_norm = x_normalizer.transform(xtest)

# # Normalize Outputs
# y_normalizer = StandardScaler(with_std=False)
# ytrain_norm = y_normalizer.fit_transform(xtrain)
# ytest_norm = y_normalizer.transform(xtest)

Model I - Standard GPR¶

# define kernel function
kernel = ConstantKernel() * RBF() + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=10,  # good practice (avoids local minima)
)


# train GP Model
t0 = time.time()
gp_model.fit(xtrain, ytrain)
t1 = time.time() - t0


print(
    f"Training Time: {t1:.3} seconds"
)

# Predictions
t0 = time.time()
ypred, ystd = gp_model.predict(xtest, return_std=True)
t1 = time.time() - t0


print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTesting Time: {t1:.3} seconds"
)

Model II - GPR w. PCA Transform¶

# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=5,  # good practice (avoids local minima)
)

# Define target transformer
pca_model = PCA(n_components=10)

# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
    regressor=gp_model,
    transformer=pca_model,   # same number of components as informative
    check_inverse=False                 # PCA is not a direct inverse transformation

)

# train GP Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)

# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)

MAE: 0.068
MSE: 0.012
RMSE: 0.109
R2: 0.688 
Time: 0.0609 seconds

# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')

# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')

# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')

# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')

# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')


plt.tight_layout()
plt.show()

fig, ax = plt.subplots()

plt.scatter(ypred, ytest_norm)
plt.show()

Model III - MultiTask GP w. PCA Transform¶

# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()

# define GP model
gp_model = GaussianProcessRegressor(
    kernel=kernel,            # kernel function (very important)
    normalize_y=True,         # good standard practice
    random_state=123,         # reproducibility
    n_restarts_optimizer=10,  # good practice (avoids local minima)
)

# Define Multioutput function
gp_model_multi = MultiOutputRegressor(
    gp_model, 
    n_jobs=-1,              # Number of cores to use to parallelize the training
)

# Define target transformer
pca_model = PCA(n_components=10)

# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
    regressor=gp_model_multi,
    transformer=pca_model,   # same number of components as informative
    check_inverse=False                 # PCA is not a direct inverse transformation

)


# Fit Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0


print(
    f"Training Time: {t1:.3f} seconds"
)

Training Time: 1557.152 seconds

# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)

MAE: 0.069
MSE: 0.012
RMSE: 0.109
R2: 0.689 
Time: 1.77 seconds

# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')

# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')

# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')

# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')

# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')


plt.tight_layout()
plt.show()

fig, ax = plt.subplots()

plt.scatter(ypred, ytest_norm)
plt.show()

# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0


# Get Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred)

print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)

MAE: 5.000
MSE: 349.004
RMSE: 18.682
R2: 0.138 
Time: 1.44 seconds

ypred.shape, ytest.shape

((2181, 278), (2181, 278))

results_model = sm.OLS(ypred[:, 0], ytest[:, 0])
results = results_model.fit()
print(results.summary())

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-25-56c5bfebb168> in <module>
----> 1 results_model = sm.OLS(ypred[:, 0], ytest[:, 0])
      2 results = results_model.fit()
      3 print(results.summary())

/usr/local/miniconda3/envs/ml4ocn/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2925             if self.columns.nlevels > 1:
   2926                 return self._getitem_multilevel(key)
-> 2927             indexer = self.columns.get_loc(key)
   2928             if is_integer(indexer):
   2929                 indexer = [indexer]

/usr/local/miniconda3/envs/ml4ocn/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2655                                  'backfill or nearest lookups')
   2656             try:
-> 2657                 return self._engine.get_loc(key)
   2658             except KeyError:
   2659                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), 0)' is an invalid key

mod = sm.OLS(spector_data.endog, spector_data.exog)

#load data
INPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/X_INPUT_LOG_PCA_NA.csv")
INPUT.head()

X = INPUT
del X['wmo']
del X['n_cycle']
print("INPUTS")
print(X.head())

OUTPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/Y_OUTPUT_LOG_NA.csv")
OUTPUT.head()

y = OUTPUT
del y['wmo']
del y['n_cycle']
print("OUTPUTS")
print(y.head())

# Training and Testing
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=2500,
random_state=123)

print(xtrain.shape)
print(ytest.shape)

ytest = np.array(ytest)

MEAN_X_TRAIN = xtrain.apply(np.mean,0)
#print(MEAN_X_TRAIN)
SD_X_TRAIN = xtrain.apply(np.std,0)
#print(SD_X_TRAIN)

xtrain = np.array(xtrain)
xtest = np.array(xtest)

#center and reduce the input data
#nb of columns
n_col = xtrain.shape[1]

xtrain_cr = np.copy(xtrain)
xtest_cr= np.copy(xtest)
for i in np.arange(0,n_col,1):
     xtrain_cr[:,i] = (xtrain[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]
     xtest_cr[:,i] = (xtest[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]


#Center the outputs (per column)
n_coly = ytrain.shape[1]
MEAN_Y_TRAIN = ytrain.apply(np.mean,0)

ytrain = np.array(ytrain)
ytrain_c = np.copy(ytrain)
for i in np.arange(0,n_coly,1):
     ytrain_c[:,i] = ytrain[:,i] - MEAN_Y_TRAIN[i]