Demo I - Training ML Algorithms¶
# install tqdm (progress bar) and pin the pandas version for reproducibility
!pip install tqdm pandas==1.0.3
import pathlib
import sys
# define the top level directory
PROJECT_PATH = pathlib.Path("/media/disk/erc/papers/2019_ML_OCN/")
CODE_PATH = PROJECT_PATH.joinpath("ml4ocean")
sys.path.append(str(CODE_PATH))
# ml4ocean packages
from src.utils import get_paths
from src.data.world import get_full_data, world_features
from src.features.world import subset_independent_floats
PATHS = get_paths()
# standard packages
import tqdm
import time
import numpy as np
import pandas as pd
from scipy import stats
# ML packages
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
# ML preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# statistics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import statsmodels.api as sm
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['seaborn-paper'])
import warnings
warnings.simplefilter("ignore")
%load_ext autoreload
%autoreload 2
Data¶
inputs_df = pd.read_csv(PATHS.data_interim.joinpath("inputs.csv"))
outputs_df = pd.read_csv(PATHS.data_interim.joinpath("outputs.csv"))
# create meta index
inputs_df = inputs_df.set_index(world_features.meta)
outputs_df = outputs_df.set_index(world_features.meta)
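As a quick sanity check (an added cell, not in the original notebook), confirm the two frames are row-aligned on the shared meta index:
# inputs and outputs should share the same meta index row-for-row
assert inputs_df.index.equals(outputs_df.index)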
Train-Test Split¶
train_size = 0.8
random_state = 42
xtrain, xtest, ytrain, ytest = train_test_split(
inputs_df,
outputs_df,
train_size=train_size,
random_state=random_state
)
print(xtrain.shape, xtest.shape)
print(ytrain.shape, ytest.shape)
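Optionally, check that no profile appears in both partitions (this assumes the meta index uniquely identifies a profile):
# train and test should not share any index entries
assert len(xtrain.index.intersection(xtest.index)) == 0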
Standardize the Inputs / Outputs¶
x_scaler = StandardScaler().fit(xtrain)
y_scaler = StandardScaler(with_std=False).fit(ytrain)
# scale inputs
xtrain_scaled = pd.DataFrame(x_scaler.transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest_scaled = pd.DataFrame(x_scaler.transform(xtest), columns=xtest.columns, index=xtest.index)
# scale outputs
ytrain_scaled = pd.DataFrame(y_scaler.transform(ytrain), columns=ytrain.columns, index=ytrain.index)
ytest_scaled = pd.DataFrame(y_scaler.transform(ytest), columns=ytest.columns, index=ytest.index)
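Note the asymmetry in the scaling: the inputs are fully standardized, while the outputs are only mean-centered (with_std=False). A quick check of what each scaler did (an added cell, not in the original):
# expect ~zero mean / unit variance for X; mean-centering only for Y
print(np.allclose(xtrain_scaled.mean(), 0), np.allclose(xtrain_scaled.std(ddof=0), 1))
print(np.allclose(ytrain_scaled.mean(), 0))  # centered only; std left as-is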
sns.distplot(xtrain_scaled.iloc[:, 14], label='X')
sns.distplot(ytrain_scaled.iloc[:, 10], label='Y')
plt.legend(fontsize=20)
ML Algorithms¶
Model - Random Forest Regressor¶
model = RandomForestRegressor(
n_estimators=2_000,
random_state=42,
n_jobs=-1,
verbose=1,
warm_start=True,
criterion='mse'
)
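A caveat worth flagging (my note, not from the original): warm_start=True only has an effect when n_estimators is increased between successive calls to fit; with a single fit it is a no-op. The usual incremental pattern is sketched below (left commented so it does not alter the model state):
# sketch: grow the forest incrementally instead of refitting from scratch
# model.n_estimators += 500   # request 500 additional trees
# model.fit(xtrain_scaled, ytrain_scaled)   # trains only the new trees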
t0 = time.time()
model.fit(xtrain_scaled, ytrain_scaled)
t1 = time.time() - t0
print(f"Time Taken: {t1} secs")
Viz I - Training Data¶
ypred_train = model.predict(xtrain_scaled)
ypred_train = pd.DataFrame(ypred_train, columns=ytrain_scaled.columns, index=ytrain_scaled.index)
from src.visualization.world import plot_residuals
fig, ax = plot_residuals(ypred_train.values.ravel(), ytrain_scaled.values.ravel(), 'train')
Viz II - Testing Data¶
# Apply the trained Random Forest to the test dataset
ypred_scaled = model.predict(xtest_scaled)
# Invert the output scaling (back to the original units):
ypred = pd.DataFrame(
y_scaler.inverse_transform(ypred_scaled),
columns=ytest_scaled.columns,
index=ytest_scaled.index
)
ytest_c = pd.DataFrame(
y_scaler.inverse_transform(ytest_scaled),
columns=ytest_scaled.columns,
index=ytest_scaled.index
)
# make vectors
ypred_vec = ypred.values.ravel()
ytest_vec = ytest_c.values.ravel()
Statistics¶
def median_absolute_percentage_error(y_true, y_pred):
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.median(np.abs((y_true - y_pred) / y_true)) * 100
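A tiny worked example of the metric: with y_true = [1, 2, 4] and y_pred = [1.1, 1.8, 4.4], every absolute percentage error is 10%, so the median is 10%:
# worked example: all three relative errors are 10%
print(f"{median_absolute_percentage_error([1.0, 2.0, 4.0], [1.1, 1.8, 4.4]):.1f}%")  # 10.0%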
# back-transform with pow(10, .), which assumes the outputs are log10-scaled
mapd = median_absolute_percentage_error(pow(10, ytest_vec), pow(10, ypred_vec))
mse = mean_squared_error(pow(10, ytest_vec), pow(10, ypred_vec))
rmse = np.sqrt(mse)
# R2 and the regression slope are computed in the scaled (log) space
r2 = r2_score(ytest_vec, ypred_vec)
slope_0, *_ = stats.linregress(ytest_vec, ypred_vec)
print(f"MAPD: {mapd:.2f}%")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")
print(f"Slope: {slope_0:.4f}")
fig, ax = plot_residuals(ypred_vec, ytest_vec, 'test')
Cross Validation¶
Method I - From Scratch¶
def standardize_data(Xtrain, Xtest, ytrain, ytest):
    """Fit the scalers on the training fold only, then transform both folds."""
    x_scaler = StandardScaler().fit(Xtrain)
    y_scaler = StandardScaler(with_std=False).fit(ytrain)
# scale inputs
Xtrain_scaled = x_scaler.transform(Xtrain)
Xtest_scaled = x_scaler.transform(Xtest)
# scale outputs
ytrain_scaled = y_scaler.transform(ytrain)
ytest_scaled = y_scaler.transform(ytest)
scalers = {'x': x_scaler, 'y': y_scaler}
return Xtrain_scaled, Xtest_scaled, ytrain_scaled, ytest_scaled, scalers
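A quick usage sketch on the earlier hold-out split, purely to show the return signature (the cross-validation loop below is the real use):
# illustrative call: scalers are fit on the training portion only
Xtr_s, Xte_s, ytr_s, yte_s, fold_scalers = standardize_data(
    xtrain.values, xtest.values, ytrain.values, ytest.values
)
print(Xtr_s.shape, Xte_s.shape)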
# define the model
model = RandomForestRegressor(
n_estimators=2_000,
random_state=42,
n_jobs=-1,
verbose=0,
warm_start=False,
criterion='mse'
)
# define the split (note: KFold does not shuffle by default, so folds follow row order)
n_splits = 10
kf = KFold(n_splits=n_splits)
# each entry stores a tuple of (predictions, scaled targets, fold scalers)
predictions_test = list()
predictions_train = list()
for train_index, test_index in tqdm.tqdm(kf.split(inputs_df.values), total=n_splits):
# segment data
Xtr, Xte, ytr, yte, scalers = standardize_data(
inputs_df.values[train_index, :],
inputs_df.values[test_index, :],
outputs_df.values[train_index, :],
outputs_df.values[test_index, :],
)
# train model
model.fit(Xtr, ytr)
# predictions
predictions_train.append((model.predict(Xtr), ytr, scalers))
predictions_test.append((model.predict(Xte), yte, scalers))
# per-fold R2 scores in the de-scaled output space
train_r2 = [r2_score(s['y'].inverse_transform(true).ravel(), s['y'].inverse_transform(pred).ravel()) for (pred, true, s) in predictions_train]
test_r2 = [r2_score(s['y'].inverse_transform(true).ravel(), s['y'].inverse_transform(pred).ravel()) for (pred, true, s) in predictions_test]
train_r2, test_r2
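Finally, summarize the ten folds with the mean and spread of the scores (a summary step added here, not in the original):
# fold-averaged performance with the spread across folds
print(f"Train R2: {np.mean(train_r2):.4f} +/- {np.std(train_r2):.4f}")
print(f"Test R2:  {np.mean(test_r2):.4f} +/- {np.std(test_r2):.4f}")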