Trial2 gps
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time as time
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
!ls /media/disk/erc/papers/2019_ML_OCN/data
data_path = '/media/disk/erc/papers/2019_ML_OCN/data/raph_temp_data_NA/'
# Import data
X = pd.read_csv(f"{data_path}X_INPUT_LOG_PCA_NA.csv")
X.describe()
X = X.iloc[:, 2:]
y = pd.read_csv(f"{data_path}Y_OUTPUT_LOG_NA.csv")
y = y.iloc[:, 2:]
y.describe()
# training and testing split
train_size = 0.8
random_state = 123
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, random_state=random_state)
Scale Data¶
Method I - Manually¶
# Standardize Inputs (per dimension)
x_mean, x_std = xtrain.mean(axis=0), xtrain.std(axis=0)
xtrain_norm = (xtrain - x_mean) / x_std
xtest_norm = (xtest - x_mean) / x_std
# Normalize Outputs
y_mean = ytrain.mean(axis=0)
ytrain_norm = ytrain - y_mean
ytest_norm = ytest - y_mean
Method II - Scikit-Learn¶
# Normalize Inputs
x_normalizer = StandardScaler(with_mean=True, with_std=True)
xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)
# Normalize Outputs
y_normalizer = StandardScaler(with_std=False)
ytrain_norm = y_normalizer.fit_transform(xtrain)
ytest_norm = y_normalizer.transform(xtest)
Model I - Standard GPR¶
# define kernel function
kernel = ConstantKernel() * RBF() + WhiteKernel()
# define GP model
gp_model = GaussianProcessRegressor(
kernel=kernel, # kernel function (very important)
normalize_y=True, # good standard practice
random_state=123, # reproducibility
n_restarts_optimizer=10, # good practice (avoids local minima)
)
# train GP Model
t0 = time.time()
gp_model.fit(xtrain, ytrain)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3} seconds"
)
# Predictions
t0 = time.time()
ypred, ystd = gp_model.predict(xtest, return_std=True)
t1 = time.time() - t0
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTesting Time: {t1:.3} seconds"
)
Model II - GPR w. PCA Transform¶
# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()
# define GP model
gp_model = GaussianProcessRegressor(
kernel=kernel, # kernel function (very important)
normalize_y=True, # good standard practice
random_state=123, # reproducibility
n_restarts_optimizer=10, # good practice (avoids local minima)
)
# Define target transformer
pca_model = PCA(n_components=10)
# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
regressor=gp_model,
transformer=pca_model, # same number of components as informative
check_inverse=False # PCA is not a direct inverse transformation
)
# train GP Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred, multioutput='uniform_average')
mse = mean_squared_error(ytest_norm, ypred, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')
# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')
# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')
# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')
plt.tight_layout()
plt.show()
fig, ax = plt.subplots()
plt.scatter(ypred, ytest_norm)
plt.show()
Model III - MultiTask GP w. PCA Transform¶
# define kernel function
kernel = ConstantKernel() * RBF(np.ones(xtrain.shape[1])) + WhiteKernel()
# define GP model
gp_model = GaussianProcessRegressor(
kernel=kernel, # kernel function (very important)
normalize_y=True, # good standard practice
random_state=123, # reproducibility
n_restarts_optimizer=10, # good practice (avoids local minima)
)
# Define Multioutput function
gp_model_multi = MultiOutputRegressor(
gp_model,
n_jobs=-1, # Number of cores to use to parallelize the training
)
# Define target transformer
pca_model = PCA(n_components=10)
# Define Wrapper for target transformation
full_regressor = TransformedTargetRegressor(
regressor=gp_model_multi,
transformer=pca_model, # same number of components as informative
check_inverse=False # PCA is not a direct inverse transformation
)
# Fit Model
t0 = time.time()
full_regressor.fit(xtrain_norm, ytrain_norm)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0
# Get Average Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred, multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
# Get Stats per Depth
mae_raw = mean_absolute_error(ytest_norm, ypred, multioutput='raw_values')
mse_raw = mean_squared_error(ytest_norm, ypred, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ytest_norm, ypred, multioutput='raw_values')
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# R2 Values
ax[0,0].plot(r2_raw)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')
# MAE
ax[0,1].plot(mae_raw)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')
# MSE
ax[1,0].plot(mse_raw)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')
# RMSE
ax[1,1].plot(rmse_raw)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')
plt.tight_layout()
plt.show()
fig, ax = plt.subplots()
plt.scatter(ypred, ytest_norm)
plt.show()
# Predictions
t0 = time.time()
ypred = full_regressor.predict(xtest_norm)
t1 = time.time() - t0
# Get Stats
mae = mean_absolute_error(ytest_norm, ypred)
mse = mean_squared_error(ytest_norm, ypred)
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm, ypred)
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
ypred.shape, ytest.shape
results_model = sm.OLS(ypred[:, 0], ytest[:, 0])
results = results_model.fit()
print(results.summary())
mod = sm.OLS(spector_data.endog, spector_data.exog)
#load data
INPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/X_INPUT_LOG_PCA_NA.csv")
INPUT.head()
X = INPUT
del X['wmo']
del X['n_cycle']
print("INPUTS")
print(X.head())
OUTPUT =
pd.read_csv("/home/sauzede/Documents/R/DATA/MACHINE_LEARNING/DATASET_TO_TRY/Y_OUTPUT_LOG_NA.csv")
OUTPUT.head()
y = OUTPUT
del y['wmo']
del y['n_cycle']
print("OUTPUTS")
print(y.head())
# Training and Testing
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=2500,
random_state=123)
print(xtrain.shape)
print(ytest.shape)
ytest = np.array(ytest)
MEAN_X_TRAIN = xtrain.apply(np.mean,0)
#print(MEAN_X_TRAIN)
SD_X_TRAIN = xtrain.apply(np.std,0)
#print(SD_X_TRAIN)
xtrain = np.array(xtrain)
xtest = np.array(xtest)
#center and reduce the input data
#nb of columns
n_col = xtrain.shape[1]
xtrain_cr = np.copy(xtrain)
xtest_cr= np.copy(xtest)
for i in np.arange(0,n_col,1):
xtrain_cr[:,i] = (xtrain[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]
xtest_cr[:,i] = (xtest[:,i] - MEAN_X_TRAIN[i]) / SD_X_TRAIN[i]
#Center the outputs (per column)
n_coly = ytrain.shape[1]
MEAN_Y_TRAIN = ytrain.apply(np.mean,0)
ytrain = np.array(ytrain)
ytrain_c = np.copy(ytrain)
for i in np.arange(0,n_coly,1):
ytrain_c[:,i] = ytrain[:,i] - MEAN_Y_TRAIN[i]