Validation Figures¶
This notebook goes through and plots the validation figures. These are specific to the plots used in the conference paper.
Experiment Overview¶
Code¶
Packages¶
import sys
sys.path.insert(0, "/media/disk/erc/papers/2019_ML_OCN/ml4ocean/src")
# Standard packages
import numpy as np
import pandas as pd
# Datasets
from data.make_dataset import DataLoader, load_standard_data, load_high_dim_data, load_labels, get_data
# Experiments
# Features
from features.pca_features import transform_all, transform_individual
from features.analysis import get_stats
from sklearn.preprocessing import StandardScaler
from data.make_dataset import ValidationFloats
from features.build_features import run_input_preprocess, run_input_postprocess, run_output_preprocess, run_output_postprocess, run_split
# ML Models
from sklearn.model_selection import train_test_split
from models.baseline import train_rf_model
import statsmodels.api as smi
from sklearn.metrics import r2_score
# Visualization
from visualization.visualize import plot_mo_stats, plot_geolocations, get_depth_labels
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
plt.style.use('seaborn-poster')
%load_ext autoreload
%autoreload 2
Data¶
def run_experiment():
    """Build the train/valid/test dataset from the global config classes.

    Reads the module-level ``DataParams`` / ``ProcessParams`` classes and
    returns a dict containing (at least) Xtrain/ytrain, Xvalid/yvalid,
    Xtest/ytest and the fitted output transformer.
    """
    # fetch the raw dataset for the configured region
    data = get_data(DataParams)
    # NOTE(review): label says "Input" but the values shown are ytrain —
    # presumably meant as the raw outputs before transformation; confirm.
    print("Input:\n", data['ytrain'].min().min(), data['ytrain'].max().max())

    # inputs preprocessing (applied before the split)
    data = run_input_preprocess(ProcessParams, data)

    # train / validation / test split
    data = run_split(ProcessParams, data)

    # inputs post-processing only when standardization is deferred
    # until after the split
    if ProcessParams.input_std == 'after':
        data = run_input_postprocess(ProcessParams, data)

    # outputs post-processing (fits the output transformer)
    data = run_output_postprocess(ProcessParams, data)
    print("Output:\n", data['ytrain'].min().min(), data['ytrain'].max().max())
    return data
# Experiment configuration: study region, its validation float, and model type.
REGION = 'stg' # alternative: 'na'
REGION_FLOAT = 3902121 # validation float for 'stg'; 6901486 is the 'na' float
MODEL = 'rf'  # 'rf' (random forest) or 'lr' (ridge linear regression)
class DataParams:
    """Dataset-selection parameters consumed by get_data()."""
    # region identifier ('stg' or 'na')
    region = REGION
class ProcessParams:
    """Pre/post-processing configuration consumed by run_experiment()."""
    # number of PCA components kept by the input transform
    n_components = 5
    # fraction of the training data held out for validation
    valid_split = 0.2
    # standardize inputs 'before' or 'after' the train/test split
    input_std = "before"
    # RNG seeds for reproducibility
    pca_seed = 123
    bootstrap_seed = 123
    # NOTE(review): name looks like a typo of 'std_outputs' — kept as-is
    # because downstream processing code may read this exact attribute name.
    std_ouputs = True
# Build the dataset and report the shapes of each split.
dataset = run_experiment()
region = DataParams.region
print('Training:')
print(dataset['Xtrain'].shape, dataset['ytrain'].shape)
print('Validation:')
print(dataset['Xvalid'].shape, dataset['yvalid'].shape)
print('Testing:')
print(dataset['Xtest'].shape, dataset['ytest'].shape)
2 - Train/Load ML Model¶
from models.baseline import (
train_glm_model,
train_gp_model,
train_mlp_model,
train_lr_model,
train_mo_rf_model,
train_ridge_lr_model,
train_stack_model
)
def save_model(model, save_name, model_dir="/media/disk/erc/papers/2019_ML_OCN/ml4ocean/models/control/"):
    """Serialize a fitted model to disk with joblib.

    Parameters
    ----------
    model : fitted estimator to persist
    save_name : str, file stem; '.joblib' is appended
    model_dir : str, target directory (defaults to the project's
        'control' models directory, preserving the original behavior)

    Returns
    -------
    None
    """
    import os
    from joblib import dump

    # os.path.join is robust to a missing trailing slash in model_dir,
    # unlike the previous bare string concatenation
    dump(model, os.path.join(model_dir, save_name + '.joblib'))
    return None
def load_model(save_name, model_dir="/media/disk/erc/papers/2019_ML_OCN/ml4ocean/models/control/"):
    """Load a previously saved model from disk with joblib.

    Parameters
    ----------
    save_name : str, file stem used in save_model; '.joblib' is appended
    model_dir : str, directory to load from (defaults to the project's
        'control' models directory, preserving the original behavior)

    Returns
    -------
    The deserialized model object.
    """
    import os
    from joblib import load

    # os.path.join is robust to a missing trailing slash in model_dir
    model = load(os.path.join(model_dir, save_name + '.joblib'))
    return model
Model Options:
- Linear Regression (`lr`)
- Random Forest (`rf`)
5.1 - Model Training (Ridge Regression / Random Forest)¶
# Random-forest hyperparameters passed through to train_rf_model.
rf_params = {
    "n_estimators": 100,
    # NOTE(review): "mse" was renamed "squared_error" in sklearn 1.0 and
    # removed in 1.2 — confirm the pinned sklearn version still accepts it.
    "criterion": "mse",
    "n_jobs": -1,  # use all available cores
    "random_state": 123,  # reproducibility
    "warm_start": False,
    "verbose": 1,
}
model_name = MODEL
# train model (dispatch on the global MODEL setting)
if model_name == 'lr':
    model = train_ridge_lr_model(dataset['Xtrain'], dataset['ytrain'])
elif model_name == 'rf':
    model = train_rf_model(dataset['Xtrain'], dataset['ytrain'], rf_params)
else:
    raise ValueError
# save model under '<model>_<region>.joblib'
save_model(model, f"{model_name}_{DataParams.region}")
3 - Validation Floats¶
We have the following validation floats:
NA: 6901486, 3902123
STG: 6901472, 3902121
def get_scatter_validation(df, df_test, plot_config):
    """Return the validation timeseries for the float in ``plot_config``.

    Looks up the validation floats for the configured region and extracts
    the results for the requested float from df_test / df.
    """
    target_region = plot_config['region']
    target_float = plot_config['float']

    # helper that knows the validation floats of each region
    valid_getter = ValidationFloats(target_region)
    valid_getter.get_validation_floats(target_region)

    # extract the timeseries for the requested float
    return valid_getter.get_validation_res(df_test, df, validation_float=target_float)
3.1 - North Atlantic (6901486) | STG (3902121)¶
# Directory where the paper figures are written.
SAVE_PATH = '/media/disk/erc/papers/2019_ML_OCN/figures/'
# make predictions on the held-out test set
ypred = model.predict(dataset['Xtest'])
# inverse transform predictions and labels back to the original output scale
ypred = dataset["out_post_trans"].inverse_transform(ypred)
ytest = dataset["out_post_trans"].inverse_transform(dataset['ytest'])
Average Validation Plots¶
def plot_validation(y_mu, y_std, ylabels):
    """Plot a mean profile with a +/- 1 std envelope against depth.

    Parameters
    ----------
    y_mu : 1D array of mean values per depth level
    y_std : 1D array of standard deviations per depth level
    ylabels : depth coordinate for the y-axis (from get_depth_labels)

    Returns
    -------
    (fig, ax) : the created matplotlib figure and axes
    """
    fig, ax = plt.subplots(figsize=(7,5))
    # mean profile; raw strings avoid the invalid '\m'/'\s' escape
    # sequences (DeprecationWarning today, SyntaxError in future Python) —
    # the rendered label text is byte-identical
    plt.plot(
        y_mu, ylabels,
        color='black', linewidth=5, label=r'bBP$_1$, $\mu$')
    # +/- 1 standard deviation envelope
    plt.fill_betweenx(
        ylabels,
        y_mu - y_std,
        y_mu + y_std,
        interpolate=False, alpha=0.5, color='gray', label=r'bBP$_1$, $\sigma$'
    )
    return fig, ax
# Mean and std over samples (axis=0) -> one aggregate profile per depth level.
ytest_mu, ypred_mu = ytest.mean(axis=0), ypred.mean(axis=0)
ytest_std, ypred_std = ytest.std(axis=0), ypred.std(axis=0)
# depth coordinate used as the y-axis of the profile plots
ylabels = get_depth_labels()
Labels¶
# Mean +/- std profile of the test LABELS.
fig, ax = plot_validation(ytest_mu, ytest_std, ylabels)
ax.tick_params(axis="both", which="major", labelsize=20)
ax.tick_params(axis="both", which="minor", labelsize=20)
# scientific notation for small bbp values
ax.ticklabel_format(style='sci',scilimits=(-3,4),axis='both')
plt.legend(fontsize=25, loc='lower right')
plt.tight_layout()
# fig.savefig(SAVE_PATH + f"stg_y1" + ".png", dpi=200, transparent=True)
plt.show()
Predictions¶
# Mean +/- std profile of the model PREDICTIONS (same styling as the labels plot).
fig, ax = plot_validation(ypred_mu, ypred_std, ylabels)
ax.tick_params(axis="both", which="major", labelsize=20)
ax.tick_params(axis="both", which="minor", labelsize=20)
# scientific notation for small bbp values
ax.ticklabel_format(style='sci',scilimits=(-3,4),axis='both')
plt.legend(fontsize=25, loc='lower right')
plt.tight_layout()
# fig.savefig(SAVE_PATH + f"stg_y1" + ".png", dpi=200, transparent=True)
plt.show()
Viz II - Profiles¶
# Extract the per-float validation timeseries for the configured region.
# load validation floats
valid_getter = ValidationFloats(REGION)
# get validation floats
valid_getter.get_validation_floats(REGION)
# get timeseries for the float under study (labels vs predictions)
results_df = valid_getter.get_validation_res(ytest, ypred, validation_float=REGION_FLOAT)
results_df.describe()
def df_2_xr(df):
    """Convert the results dataframe to an xarray Dataset (netcdf format).

    The frame is re-indexed on (n_cycle, Depth) so the resulting Dataset
    is a 2D cycle-by-depth grid.
    """
    indexed = df.set_index(['n_cycle', 'Depth'])
    return indexed.to_xarray()
# convert dataframe to xarray for the 2D (cycle x depth) heatmaps below
results_xr = df_2_xr(results_df)
def plot_profiles(xr_data, plot_config):
    """Plot a depth-vs-cycle heatmap of a profile variable on a log color scale.

    Parameters
    ----------
    xr_data : 2D xarray DataArray (transposed for plotting)
    plot_config : dict with keys 'vmin', 'vmax' (color limits) and
        'region', 'data', 'float', 'model' (output filename parts)

    Side effects: saves the figure under SAVE_PATH and shows it.

    Returns
    -------
    None
    """
    import matplotlib.colors as colors
    fig, ax = plt.subplots(figsize=(10,5))
    # plot colormesh on a logarithmic color scale.
    # FIX: the color limits live in LogNorm only — passing vmin/vmax
    # alongside norm= is rejected by modern matplotlib (ValueError),
    # and was redundant anyway since LogNorm already carries them.
    xr_data.T.plot.pcolormesh(
        ax=ax,
        # colorbar type
        cmap='jet',
        # colorbar arguments
        cbar_kwargs={'label': ''},
        # log scale colorbar with the shared limits
        norm=colors.LogNorm(vmin=plot_config['vmin'], vmax=plot_config['vmax']),
        # don't deal with outliers
        robust=False
    )
    ax.set_xlabel('')
    ax.set_ylabel('')
    plt.tight_layout()
    # save figure
    fig.savefig(SAVE_PATH + f"{plot_config['region']}_y_{plot_config['data']}_heatmap_{plot_config['float']}_pred_{plot_config['model']}")
    # show figure
    plt.show()
    return None
Original¶
# Heatmap of the true (label) profiles for the validation float.
# plot parameters
plot_config = dict()
plot_config['region'] = REGION
plot_config['model'] = MODEL
plot_config['float'] = REGION_FLOAT
plot_config['data'] = 'Labels'
# NOTE(review): 'robust' is not read by plot_profiles (robust=False is hardcoded there)
plot_config['robust'] = False
# y_val_scat = get_scatter_validation(ypred_, ytest_, plot_config)
# shared color limits so the Labels and Predictions heatmaps are comparable
plot_config['vmin'] = np.minimum(results_xr.Predictions.min(), results_xr.Labels.min())
plot_config['vmax'] = np.maximum(results_xr.Predictions.max(), results_xr.Labels.max())
# plot profiles
plot_profiles(results_xr.Labels, plot_config)
Predicted¶
# Heatmap of the predicted profiles for the validation float.
# plot parameters
plot_config = dict()
plot_config['region'] = REGION
plot_config['model'] = MODEL
plot_config['float'] = REGION_FLOAT
plot_config['data'] = 'Predictions'
# NOTE(review): 'robust' is not read by plot_profiles (robust=False is hardcoded there)
plot_config['robust'] = False
# y_val_scat = get_scatter_validation(ypred_, ytest_, plot_config)
# shared color limits so the Labels and Predictions heatmaps are comparable
plot_config['vmin'] = np.minimum(results_xr.Predictions.min(), results_xr.Labels.min())
plot_config['vmax'] = np.maximum(results_xr.Predictions.max(), results_xr.Labels.max())
# plot profiles
plot_profiles(results_xr.Predictions, plot_config)
Viz III - Scatter Plot¶
# Scatter-plot statistics: R^2, MAPD and the log-log regression slope.
plot_config = dict()
plot_config['region'] = REGION
plot_config['model'] = MODEL
plot_config['float'] = REGION_FLOAT
# =================
# Statistics
# =================
# R2 of the log10-transformed data.
# FIX: sklearn's r2_score signature is (y_true, y_pred) and R^2 is NOT
# symmetric in its arguments — the ground truth (Labels) must come first.
plot_config['r2'] = r2_score(
    np.log10(results_df['Labels']),
    np.log10(results_df['Predictions']))
# MAPD (median absolute percentage deviation) of the original data
plot_config['mapd'] = np.median(
    np.abs((results_df['Predictions']) - (results_df['Labels'])) / (results_df['Labels']))
# Linear regression on the log10 results.
# NOTE(review): OLS without add_constant fits a through-origin slope —
# presumably intentional for comparison against the identity line; confirm.
stat_mod = smi.OLS(np.log10(results_df['Labels']), np.log10(results_df['Predictions']))
lin_res = stat_mod.fit()
# extract slope coefficient (single regressor, no intercept)
plot_config['slope'] = lin_res.params[0]
# Extract R2 of the regression fit
r2_val = lin_res.rsquared
print(lin_res.summary())
Figure¶
# Scatter figure: predictions vs labels colored by depth, with identity line
# and a stats box. (AnchoredText is already imported at the top of the file.)
# identity line spanning the plotted log range
id_line = np.logspace(-4, -2, 100)
fig, ax = plt.subplots(figsize=(10,7))
# =================================
# Plot Data
# =================================
# scatter points on log-log axes, colored by depth
results_df.plot.scatter(ax=ax, x='Predictions', y='Labels', c='Depth', logx=True, logy=True, cmap='winter')
# identity line
ax.plot(id_line, id_line, linewidth=5, color='black')
# ====================
# results text
# ====================
# FIX: 'size' and 'fontsize' are aliases of the same Text property —
# passing both makes matplotlib error out; keep a single size (20,
# matching the tick-label sizes used elsewhere in this figure).
at = AnchoredText(f"R$^2$: {plot_config['r2']:.3f}\nSlope: {plot_config['slope']:.3f}\nMAPD: {plot_config['mapd']:.2%}",
                  prop=dict(size=20), frameon=True,
                  loc='upper left',
                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at)
ax.autoscale(enable=True, axis='both', tight=True)
# ==================
# Limits (per-region axis ranges)
# ==================
if REGION == 'na':
    ax.set_xlim(0.0001, 0.01)
    ax.set_ylim(0.0001, 0.01)
elif REGION == 'stg':
    ax.set_xlim(0.0001, 0.001)
    ax.set_ylim(0.0001, 0.001)
else:
    raise ValueError()
ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)
# extras
plt.tight_layout()
# save plot
fig.savefig(SAVE_PATH + f'{plot_config["region"]}_m{plot_config["model"]}_f{plot_config["float"]}_depth' + '.png')
# Show Plot
plt.show()