Skip to content

Validation Figures

This notebook goes through and plots the validation figures. These are specific to the plots used in the conference paper.

Experiment Overview

Code

Packages

import sys
sys.path.insert(0, "/media/disk/erc/papers/2019_ML_OCN/ml4ocean/src")

# Standard packages
import numpy as np
import pandas as pd

# Datasets
from data.make_dataset import DataLoader, load_standard_data, load_high_dim_data, load_labels, get_data

# Experiments

# Features
from features.pca_features import transform_all, transform_individual
from features.analysis import get_stats
from sklearn.preprocessing import StandardScaler
from data.make_dataset import ValidationFloats
from features.build_features import run_input_preprocess, run_input_postprocess, run_output_preprocess, run_output_postprocess, run_split

# ML Models
from sklearn.model_selection import train_test_split
from models.baseline import train_rf_model
import statsmodels.api as smi
from sklearn.metrics import r2_score


# Visualization
from visualization.visualize import plot_mo_stats, plot_geolocations, get_depth_labels
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
plt.style.use('seaborn-poster')

%load_ext autoreload
%autoreload 2
/home/emmanuel/.conda/envs/ml4ocn/lib/python3.6/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

Data

def run_experiment():
    """Load the raw data and run the full preprocessing pipeline.

    Uses the module-level ``DataParams`` and ``ProcessParams`` config
    classes. Returns the ``dataset`` dict holding the train/valid/test
    splits (``Xtrain``/``ytrain`` etc.) plus fitted transformers such as
    ``out_post_trans`` (used later to invert the output scaling).
    """

    # Load Data
    dataset = get_data(DataParams)
    # NOTE(review): this prints ytrain (the outputs) under the label
    # "Input:" — confirm whether the label or the variable is intended.
    print("Input:\n", dataset['ytrain'].min().min(), dataset['ytrain'].max().max())

    # Run Inputs Preprocessing
    dataset = run_input_preprocess(ProcessParams, dataset)

    # Run Outputs Preprocessing
    # NOTE(review): run_output_preprocess is imported at the top of the file
    # but never called here — verify this step is intentionally skipped.
    # Train Test Split
    dataset = run_split(ProcessParams, dataset)




    # Run Inputs PostProcessing (only when input standardization is deferred
    # until after the split, per ProcessParams.input_std)
    if ProcessParams.input_std == 'after':
        dataset = run_input_postprocess(ProcessParams, dataset)

    # Run Outputs Post Processing
    dataset = run_output_postprocess(ProcessParams, dataset)

    print("Output:\n", dataset['ytrain'].min().min(), dataset['ytrain'].max().max())
    return dataset
# Experiment selection: region code, its held-out validation float, and model.
REGION = 'stg' # alternative: 'na' (North Atlantic)
REGION_FLOAT = 3902121 # validation float id for 'stg'; use 6901486 for 'na'
MODEL = 'rf' # 'rf' (random forest) or 'lr' (ridge linear regression)

class DataParams:
    """Namespace-style config consumed by ``get_data``."""
    region = REGION

class ProcessParams:
    """Namespace-style config for the ``run_*`` preprocessing steps."""
    n_components = 5      # number of PCA components kept
    valid_split = 0.2     # fraction of training data held out for validation
    input_std = "before"  # standardize inputs 'before' or 'after' the split
    pca_seed = 123        # reproducibility seeds
    bootstrap_seed = 123
    # NOTE(review): attribute name is misspelled ('ouputs'), but downstream
    # code presumably reads this exact name — do not rename without checking.
    std_ouputs = True

# run the full load + preprocess pipeline defined above
dataset = run_experiment()

region = DataParams.region

# sanity-check the split shapes: (n_samples, n_features) / (n_samples, n_depths)
print('Training:')
print(dataset['Xtrain'].shape, dataset['ytrain'].shape)
print('Validation:')
print(dataset['Xvalid'].shape, dataset['yvalid'].shape)
print('Testing:')
print(dataset['Xtest'].shape, dataset['ytest'].shape)
Input:
 3.9999998989515e-05 0.0042500000687937
Output:
 -5.327020644761219 8.316019798346268
Training:
(1217, 25) (1217, 276)
Validation:
(136, 25) (136, 276)
Testing:
(133, 25) (133, 276)


2 - Train/Load ML Model

from models.baseline import (
    train_glm_model, 
    train_gp_model, 
    train_mlp_model, 
    train_lr_model, 
    train_mo_rf_model, 
    train_ridge_lr_model,
    train_stack_model
)
def save_model(model, save_name):
    """Persist a trained model to the control models directory.

    Parameters
    ----------
    model : fitted estimator
        The model object to serialize with joblib.
    save_name : str
        Base file name (no extension); saved as ``<save_name>.joblib``.
    """
    import os
    from joblib import dump

    # model path
    MODEL_PATH = "/media/disk/erc/papers/2019_ML_OCN/ml4ocean/models/control/"

    # os.path.join is robust to a missing trailing slash in MODEL_PATH,
    # unlike the previous raw string concatenation
    dump(model, os.path.join(MODEL_PATH, save_name + '.joblib'))
    return None

def load_model(save_name):
    """Load a previously saved model from the control models directory.

    Parameters
    ----------
    save_name : str
        Base file name (no extension) used at save time.

    Returns
    -------
    The deserialized model object.
    """
    import os
    from joblib import load

    # model path (must match save_model)
    MODEL_PATH = "/media/disk/erc/papers/2019_ML_OCN/ml4ocean/models/control/"

    # load model (original comment wrongly said "save model")
    model = load(os.path.join(MODEL_PATH, save_name + '.joblib'))
    return model

Model Options:

  • Linear Regression - lr
  • Random Forest - rf

5.1 - Model Training (Ridge Regression or Random Forest)

# Random forest hyperparameters (passed through to the RF regressor)
rf_params = {
    "n_estimators": 100,
    "criterion": "mse",   # NOTE(review): renamed to "squared_error" in sklearn >= 1.0
    "n_jobs": -1,         # use all available cores
    "random_state": 123,  # reproducibility
    "warm_start": False,
    "verbose": 1,
}
model_name = MODEL

# train model
if model_name == 'lr':
    model = train_ridge_lr_model(dataset['Xtrain'], dataset['ytrain'])
elif model_name == 'rf':
    # (fixed the inconsistent 5-space indentation of the original)
    model = train_rf_model(dataset['Xtrain'], dataset['ytrain'], rf_params)
else:
    # fail loudly with a helpful message instead of a bare ValueError
    raise ValueError(f"Unrecognized model name: '{model_name}' (expected 'lr' or 'rf')")

# save model, tagged by model type and region
save_model(model, f"{model_name}_{DataParams.region}")
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.2s finished
Training time: 1.284 secs.


3 - Validation Floats

We have the following validation floats:

NA (North Atlantic):

  • 6901486
  • 3902123

STG:

  • 6901472
  • 3902121

def get_scatter_validation(df, df_test, plot_config):
    """Return the validation-float results for the configured region.

    Builds a ``ValidationFloats`` helper for ``plot_config['region']``,
    loads that region's validation floats, and extracts the time series
    for the float id given in ``plot_config['float']``.
    """
    region = plot_config['region']

    # region-specific validation helper
    floats = ValidationFloats(region)
    floats.get_validation_floats(region)

    # per-float results table
    return floats.get_validation_res(df_test, df, validation_float=plot_config['float'])

3.1 - North Atlantic (6901486) | STG (3902121)

# directory where all figures are written
SAVE_PATH = '/media/disk/erc/papers/2019_ML_OCN/figures/'
# make predictions on the held-out test inputs
ypred = model.predict(dataset['Xtest'])

# undo the output standardization so predictions and labels
# are back in their original (physical) units
ypred = dataset["out_post_trans"].inverse_transform(ypred)
ytest = dataset["out_post_trans"].inverse_transform(dataset['ytest'])
[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    0.1s finished

Average Validation Plots

def plot_validation(y_mu, y_std, ylabels):
    """Plot a mean profile with a +/- 1 std envelope versus depth.

    Parameters
    ----------
    y_mu, y_std : 1-D arrays
        Per-depth mean and standard deviation of the profiles.
    ylabels : 1-D array
        Depth values for the vertical axis.

    Returns
    -------
    (fig, ax) so callers can further style/save the figure.
    """
    fig, ax = plt.subplots(figsize=(7, 5))
    # raw strings: '\m' and '\s' are invalid escape sequences in plain
    # string literals (DeprecationWarning); the bytes are unchanged.
    plt.plot(
        y_mu, ylabels,
        color='black', linewidth=5, label=r'bBP$_1$, $\mu$')
    plt.fill_betweenx(
        ylabels,
        y_mu - y_std,
        y_mu + y_std,
        interpolate=False, alpha=0.5, color='gray', label=r'bBP$_1$, $\sigma$'
    )
    return fig, ax
# per-depth mean and std across test profiles (axis=0 -> over samples)
ytest_mu, ypred_mu = ytest.mean(axis=0), ypred.mean(axis=0)
ytest_std, ypred_std = ytest.std(axis=0), ypred.std(axis=0)
ylabels = get_depth_labels()

Labels

# Labels (ground truth): mean profile with std envelope
fig, ax = plot_validation(ytest_mu, ytest_std, ylabels)
ax.tick_params(axis="both", which="major", labelsize=20)
ax.tick_params(axis="both", which="minor", labelsize=20)
# scientific notation for ticks outside the 1e-3..1e4 range
ax.ticklabel_format(style='sci',scilimits=(-3,4),axis='both')
plt.legend(fontsize=25, loc='lower right')
plt.tight_layout()
# fig.savefig(SAVE_PATH + f"stg_y1" + ".png", dpi=200, transparent=True)
plt.show()

Predictions

# Predictions: mean profile with std envelope (same styling as Labels)
fig, ax = plot_validation(ypred_mu, ypred_std, ylabels)
ax.tick_params(axis="both", which="major", labelsize=20)
ax.tick_params(axis="both", which="minor", labelsize=20)
# scientific notation for ticks outside the 1e-3..1e4 range
ax.ticklabel_format(style='sci',scilimits=(-3,4),axis='both')
plt.legend(fontsize=25, loc='lower right')
plt.tight_layout()
# fig.savefig(SAVE_PATH + f"stg_y1" + ".png", dpi=200, transparent=True)
plt.show()

Viz II - Profiles

# load the validation-float helper for the chosen region
valid_getter = ValidationFloats(REGION)

# get validation floats
valid_getter.get_validation_floats(REGION)

# build the per-cycle/per-depth results table (Predictions vs Labels)
# for the selected validation float
results_df = valid_getter.get_validation_res(ytest, ypred, validation_float=REGION_FLOAT)

results_df.describe()
(133, 2) (133, 276)
0.00011439676848409761 6901472.0 0.0001300000003539031 6901472.0
n_cycle Predictions Labels
count 6900.000000 6900.000000 6900.000000
mean 13.840000 0.000232 0.000260
std 7.449996 0.000108 0.000105
min 1.000000 0.000114 0.000130
25% 8.000000 0.000150 0.000180
50% 14.000000 0.000176 0.000200
75% 20.000000 0.000340 0.000369
max 26.000000 0.000486 0.000520
def df_2_xr(df):
    """Convert a long-format results DataFrame to an xarray Dataset.

    Indexing by (n_cycle, Depth) before the conversion makes each data
    column come out as a 2-D grid over cycle and depth.
    """
    indexed = df.set_index(['n_cycle', 'Depth'])
    return indexed.to_xarray()
# convert dataframe to xarray (n_cycle x Depth grid per variable)
results_xr = df_2_xr(results_df)
def plot_profiles(xr_data, plot_config):
    """Plot a depth-vs-cycle heatmap of one variable and save it to SAVE_PATH.

    xr_data is a 2-D DataArray (n_cycle x Depth); plot_config must provide
    'vmin', 'vmax', 'region', 'data', 'float', and 'model'.
    """

    import matplotlib.colors as colors

    fig, ax = plt.subplots(figsize=(10,5))

    # plot colormesh (transposed so depth runs along the vertical axis)
    xr_data.T.plot.pcolormesh(
        ax=ax, 
        # colorbar type
        cmap='jet', 
        # colorbar arguments
        cbar_kwargs={'label': ''}, 
        # log scale colorbar, limits shared with the companion plot
        norm=colors.LogNorm(vmin=plot_config['vmin'], vmax=plot_config['vmax']), 

        # min,max
        # NOTE(review): newer matplotlib rejects vmin/vmax together with an
        # explicit norm — these may need removing on upgrade; confirm.
        vmin=plot_config['vmin'], 
        vmax=plot_config['vmax'], 

        # don't deal with outliers
        robust=False
    )
    ax.set_xlabel('')
    ax.set_ylabel('')

    plt.tight_layout()

    # save figure
    # NOTE(review): no file extension given; matplotlib falls back to its
    # default savefig format (typically png).
    fig.savefig(SAVE_PATH + f"{plot_config['region']}_y_{plot_config['data']}_heatmap_{plot_config['float']}_pred_{plot_config['model']}")

    # show figure
    plt.show()
    return None

Original

# Plot configuration for the ground-truth (Labels) heatmap. The color
# limits span both Predictions and Labels so the two plots are comparable.
plot_config = {
    'region': REGION,
    'model': MODEL,
    'float': REGION_FLOAT,
    'data': 'Labels',
    'robust': False,
    'vmin': np.minimum(results_xr.Predictions.min(), results_xr.Labels.min()),
    'vmax': np.maximum(results_xr.Predictions.max(), results_xr.Labels.max()),
}

# plot profiles
plot_profiles(results_xr.Labels, plot_config)

Predicted

# Plot configuration for the model-output (Predictions) heatmap. The color
# limits span both Predictions and Labels so the two plots are comparable.
plot_config = {
    'region': REGION,
    'model': MODEL,
    'float': REGION_FLOAT,
    'data': 'Predictions',
    'robust': False,
    'vmin': np.minimum(results_xr.Predictions.min(), results_xr.Labels.min()),
    'vmax': np.maximum(results_xr.Predictions.max(), results_xr.Labels.max()),
}

# plot profiles
plot_profiles(results_xr.Predictions, plot_config)

Viz III - Scatter Plot

plot_config = dict()

plot_config['region'] = REGION
plot_config['model'] = MODEL
plot_config['float'] = REGION_FLOAT

# =================
# Statistics
# =================

# R^2 of the log10-transformed data.
# FIX: r2_score expects (y_true, y_pred) — the original call passed
# Predictions as y_true. R^2 is not symmetric, so the order matters.
plot_config['r2'] = r2_score(
    np.log10(results_df['Labels']),
    np.log10(results_df['Predictions']))

# MAPD (median absolute percent difference) on the original, un-logged data
plot_config['mapd'] = np.median(
    np.abs((results_df['Predictions']) - (results_df['Labels'])) / (results_df['Labels']))
# Linear regression of log10 labels on log10 predictions.
# NOTE: smi.OLS adds no intercept by default, so this is a fit through
# the origin ("uncentered" R^2 in the summary).
stat_mod = smi.OLS(np.log10(results_df['Labels']), np.log10(results_df['Predictions']))

lin_res = stat_mod.fit()

# slope of the fitted line (single regressor -> params[0])
plot_config['slope'] = lin_res.params[0]
# uncentered R^2 of the no-intercept fit
r2_val = lin_res.rsquared
print(lin_res.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                 Labels   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          4.282e+07
Date:                Mon, 27 Apr 2020   Prob (F-statistic):                        0.00
Time:                        16:32:48   Log-Likelihood:                          11464.
No. Observations:                6900   AIC:                                 -2.293e+04
Df Residuals:                    6899   BIC:                                 -2.292e+04
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Predictions     0.9834      0.000   6543.384      0.000       0.983       0.984
==============================================================================
Omnibus:                      104.899   Durbin-Watson:                   0.926
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               93.150
Skew:                          -0.233   Prob(JB):                     5.93e-21
Kurtosis:                       2.673   Cond. No.                         1.00
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Figure

# NOTE(review): AnchoredText is already imported at the top of the file;
# this re-import is redundant but harmless.
from matplotlib.offsetbox import AnchoredText

# identity (y = x) reference line, log-spaced over the plotted value range
id_line = np.logspace(-4, -2, 100)

fig, ax = plt.subplots(figsize=(10,7))

# =================================
# Plot Data
# =================================

# scatter of predicted vs. observed values, colored by depth, log-log axes
results_df.plot.scatter(ax=ax, x='Predictions', y='Labels', c='Depth', logx=True, logy=True, cmap='winter')

# identity line
ax.plot(id_line, id_line, linewidth=5, color='black')

# ====================
# results text
# ====================
# annotation box with the R^2 / slope / MAPD statistics computed above
# NOTE(review): prop contains both 'size' and 'fontsize' (aliases for the
# same text property) — one is redundant; confirm which takes effect.
at = AnchoredText(f"R$^2$: {plot_config['r2']:.3f}\nSlope: {plot_config['slope']:.3f}\nMAPD: {plot_config['mapd']:.2%}",
                  prop=dict(size=15, fontsize=20), frameon=True,
                  loc='upper left',

                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at)
ax.autoscale(enable=True, axis='both', tight=True)

# ==================
# Limits
# ==================

# region-specific axis limits (values span different ranges per region)
if REGION == 'na':
    ax.set_xlim(0.0001, 0.01)
    ax.set_ylim(0.0001, 0.01)
elif REGION == 'stg':
    ax.set_xlim(0.0001, 0.001)
    ax.set_ylim(0.0001, 0.001)
else:
    # unknown region code — fail rather than plot with wrong limits
    raise ValueError()
ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

# extras
plt.tight_layout()

# save plot
fig.savefig(SAVE_PATH + f'{plot_config["region"]}_m{plot_config["model"]}_f{plot_config["float"]}_depth' + '.png')

# Show Plot
plt.show()