Algorithm Walk-Through¶
This notebook walks through the preprocessing steps and the ML algorithm training procedure used for the multi-dimensional, multi-output data.
Experiment Overview¶
Code¶
Packages¶
import sys
sys.path.insert(0, "/media/disk/erc/papers/2019_ML_OCN/ml4ocean/src")
# Standard packages
import numpy as np
import pandas as pd
import xarray as xr
# Datasets
from data.make_dataset import DataLoader, load_standard_data, load_high_dim_data, load_labels, get_data
# Experiments
# Features
# from features.pca_features import transform_all, transform_individual
# from features.analysis import get_stats
# from sklearn.preprocessing import StandardScaler
# from data.make_dataset import ValidationFloats
# from features.build_features import run_input_preprocess, run_input_postprocess, run_output_preprocess, run_output_postprocess, run_split
# ML Models
import statsmodels.api as smi
from sklearn.metrics import r2_score
# Visualization
# from visualization.visualize import plot_mo_stats, plot_geolocations, get_depth_labels
# from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
plt.style.use('seaborn-poster')
%load_ext autoreload
%autoreload 2
6.3 - Validation Profile¶
def get_scatter_validation(df, df_test, plot_config):
    """Fetch the validation results for the float named in ``plot_config``.

    Parameters
    ----------
    df :
        Forwarded as the second positional argument of
        ``ValidationFloats.get_validation_res``.
    df_test :
        Forwarded as the first positional argument of
        ``ValidationFloats.get_validation_res``.
    plot_config : dict
        Must contain ``'region'`` (passed to the ``ValidationFloats``
        constructor and to ``get_validation_floats``) and ``'float'``
        (passed as ``validation_float``).

    Returns
    -------
    Whatever ``ValidationFloats.get_validation_res`` returns.
    """
    # The notebook-level import of ValidationFloats is commented out, so the
    # name is brought into scope here; otherwise calling this function raises
    # NameError.
    # NOTE(review): assumes the class still lives in data.make_dataset, as the
    # commented-out import suggests - confirm.
    from data.make_dataset import ValidationFloats

    # initialize class
    valid_getter = ValidationFloats(plot_config['region'])

    # get validation floats
    valid_getter.get_validation_floats(plot_config['region'])

    # get timeseries
    return valid_getter.get_validation_res(
        df_test, df, validation_float=plot_config['float']
    )
Validation floats by region:
North Atlantic (NA)
* 6901486
* 3902123
Subtropical Gyre (STG)
* 6901472
* 3902121
North Atlantic (6901486)¶
# Directory prefix that the figure-saving code in this notebook prepends to
# every output file name.
SAVE_PATH = '/media/disk/erc/papers/2019_ML_OCN/figures/'
# Load the North Atlantic results table; the first CSV column becomes the index.
results = pd.read_csv('na_results.csv', index_col=0)
# Preview the first rows, then the per-column summary statistics.
results.head()
results.describe()
Profile¶
def df_2_xr(df):
    """Turn a long-format results table into an xarray object.

    The frame is re-indexed by the ``('n_cycle', 'Depth')`` pair and the
    result of ``to_xarray()`` on that re-indexed frame is returned.
    """
    index_columns = ['n_cycle', 'Depth']
    return df.set_index(index_columns).to_xarray()
# Convert the results table to xarray via df_2_xr (indexed by n_cycle and Depth)
results_xr = df_2_xr(results)
# Display the converted object
results_xr
def plot_profiles(xr_data, plot_config):
    """Draw a log-scaled colour mesh of ``xr_data``, save it and show it.

    Parameters
    ----------
    xr_data :
        Object whose transpose exposes ``.plot.pcolormesh``.
    plot_config : dict
        ``'vmin'`` / ``'vmax'`` give the colour limits; ``'region'``,
        ``'data'``, ``'float'`` and ``'model'`` are used to build the name of
        the saved figure (written under the module-level ``SAVE_PATH``).

    Returns
    -------
    None
    """
    from matplotlib.colors import LogNorm

    fig, ax = plt.subplots(figsize=(10, 5))

    # Same limits are handed to both the log norm and the plotting call.
    lower = plot_config['vmin']
    upper = plot_config['vmax']
    mesh_options = {
        'cmap': 'jet',                     # colormap
        'cbar_kwargs': {'label': ''},      # unlabeled colorbar
        'norm': LogNorm(vmin=lower, vmax=upper),
        'vmin': lower,
        'vmax': upper,
        'robust': False,                   # no outlier-based clipping
    }
    xr_data.T.plot.pcolormesh(ax=ax, **mesh_options)

    # Blank axis labels.
    for set_label in (ax.set_xlabel, ax.set_ylabel):
        set_label('')
    plt.tight_layout()

    # Save first, then display.
    file_name = f"{plot_config['region']}_y_{plot_config['data']}_heatmap_{plot_config['float']}_pred_{plot_config['model']}"
    fig.savefig(SAVE_PATH + file_name)
    plt.show()
# Settings for the North Atlantic profile plot. The colour limits span both
# the predictions and the labels.
# y_val_scat = get_scatter_validation(ypred_, ytest_, plot_config)
plot_config = {
    'region': 'na',
    'model': 'rf',
    'float': 6901486,
    'data': 'Labels',
    'robust': False,
    'vmin': np.minimum(results_xr.Predictions.min(), results_xr.Labels.min()),
    'vmax': np.maximum(results_xr.Predictions.max(), results_xr.Labels.max()),
}

# Draw the label profiles, then display the lower colour limit.
plot_profiles(results_xr.Labels, plot_config)
plot_config['vmin']
Scatter Plot¶
# Configuration and summary statistics for the North Atlantic scatter plot.
plot_config = dict()
plot_config['region'] = 'na'
plot_config['model'] = 'rf'
plot_config['float'] = 6901486

# =================
# Statistics
# =================

# R2 of log10 transform.
# r2_score takes (y_true, y_pred): the labels are the reference values and the
# predictions are what is being scored. R2 is not symmetric in its arguments.
plot_config['r2'] = r2_score(np.log10(results['Labels']), np.log10(results['Predictions']))

# MAPD% of original data: median absolute deviation relative to the labels.
plot_config['mapd'] = np.median(np.abs((results['Predictions']) - (results['Labels'])) / (results['Labels']))

# Linear Regression on log10 results (labels regressed on predictions; no
# constant column is added to the regressors here).
stat_mod = smi.OLS(np.log10(results['Labels']), np.log10(results['Predictions']))
lin_res = stat_mod.fit()

# Read everything from the fit computed just above.
r2_val = lin_res.rsquared
print(lin_res.summary())

# extract coefficient: first (and only) fitted parameter, taken by position so
# it does not depend on how the parameter container is labelled.
plot_config['slope'] = np.asarray(lin_res.params)[0]
# Scatter plot of predictions vs. labels (log-log) for the North Atlantic
# float, annotated with the statistics stored in plot_config.
from matplotlib.offsetbox import AnchoredText

# identity line
id_line = np.logspace(-4, -2, 100)

fig, ax = plt.subplots(figsize=(10,7))

# =================================
# Plot Data
# =================================

# scatter points, coloured by depth
results.plot.scatter(ax=ax, x='Predictions', y='Labels', c='Depth', logx=True, logy=True, cmap='winter')

# identity line
ax.plot(id_line, id_line, linewidth=5, color='black')

# ====================
# results text
# ====================
# 'size' and 'fontsize' are aliases of the same text property; supplying both
# is rejected by current Matplotlib. Only fontsize=20 is kept, which is the
# value that took effect when both were accepted.
at = AnchoredText(f"R$^2$: {plot_config['r2']:.3f}\nSlope: {plot_config['slope']:.3f}\nMAPD: {plot_config['mapd']:.2%}",
                  prop=dict(fontsize=20), frameon=True,
                  loc='upper left',
                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at)
ax.autoscale(enable=True, axis='both', tight=True)

# ==================
# Limits
# ==================
# Explicit limits applied after autoscale, so these are the final axis ranges.
ax.set_xlim(0.0001, 0.01)
ax.set_ylim(0.0001, 0.01)
ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

# extras
plt.tight_layout()

# save plot
fig.savefig(SAVE_PATH + f'{plot_config["region"]}_m{plot_config["model"]}_f{plot_config["float"]}_depth' + '.png')

# Show Plot
plt.show()
SubTropical Gyre¶
# Directory prefix that the figure-saving code in this notebook prepends to
# every output file name.
SAVE_PATH = '/media/disk/erc/papers/2019_ML_OCN/figures/'
# Load the subtropical gyre results table; the first CSV column becomes the index.
results = pd.read_csv('stg_results.csv', index_col=0)
# Preview the first rows.
results.head()
Profiles¶
# Convert the results table to xarray via df_2_xr (indexed by n_cycle and Depth)
results_xr = df_2_xr(results)
# Display the converted object
results_xr
# Settings for the subtropical gyre profile plot. The colour limits span both
# the predictions and the labels.
# y_val_scat = get_scatter_validation(ypred_, ytest_, plot_config)
plot_config = {
    'region': 'stg',
    'model': 'rf',
    'float': 3902121,
    'data': 'Predictions',
    'robust': False,
    'vmin': np.minimum(results_xr.Predictions.min(), results_xr.Labels.min()),
    'vmax': np.maximum(results_xr.Predictions.max(), results_xr.Labels.max()),
}

# Draw the prediction profiles.
plot_profiles(results_xr.Predictions, plot_config)
Scatter Plot¶
# Configuration and summary statistics for the subtropical gyre scatter plot.
plot_config = dict()
plot_config['region'] = 'stg'
plot_config['model'] = 'rf'
plot_config['float'] = 3902121

# =================
# Statistics
# =================

# R2 of log10 transform.
# r2_score takes (y_true, y_pred): the labels are the reference values and the
# predictions are what is being scored. R2 is not symmetric in its arguments.
plot_config['r2'] = r2_score(np.log10(results['Labels']), np.log10(results['Predictions']))

# MAPD% of original data: median absolute deviation relative to the labels.
plot_config['mapd'] = np.median(np.abs((results['Predictions']) - (results['Labels'])) / (results['Labels']))

# Linear Regression on log10 results (labels regressed on predictions; no
# constant column is added to the regressors here).
stat_mod = smi.OLS(np.log10(results['Labels']), np.log10(results['Predictions']))
lin_res = stat_mod.fit()

# Read everything from the fit computed just above.
r2_val = lin_res.rsquared
print(lin_res.summary())

# extract coefficient: first (and only) fitted parameter, taken by position so
# it does not depend on how the parameter container is labelled.
plot_config['slope'] = np.asarray(lin_res.params)[0]
# Scatter plot of predictions vs. labels (log-log) for the subtropical gyre
# float, annotated with the statistics stored in plot_config.
from matplotlib.offsetbox import AnchoredText

# identity line
id_line = np.logspace(-4, -2, 100)

fig, ax = plt.subplots(figsize=(10,7))

# =================================
# Plot Data
# =================================

# scatter points, coloured by depth
results.plot.scatter(ax=ax, x='Predictions', y='Labels', c='Depth', logx=True, logy=True, cmap='winter')

# identity line
ax.plot(id_line, id_line, linewidth=5, color='black')

# ====================
# results text
# ====================
# 'size' and 'fontsize' are aliases of the same text property; supplying both
# is rejected by current Matplotlib. Only fontsize=20 is kept, which is the
# value that took effect when both were accepted.
at = AnchoredText(f"R$^2$: {plot_config['r2']:.3f}\nSlope: {plot_config['slope']:.3f}\nMAPD: {plot_config['mapd']:.2%}",
                  prop=dict(fontsize=20), frameon=True,
                  loc='upper left',
                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at)
ax.autoscale(enable=True, axis='both', tight=True)

# ==================
# Limits
# ==================
# Explicit limits applied after autoscale, so these are the final axis ranges.
ax.set_xlim(0.0001, 0.001)
ax.set_ylim(0.0001, 0.001)
ax.set_xlabel('')
ax.set_ylabel('')
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=12)

# extras
plt.tight_layout()

# save plot
fig.savefig(SAVE_PATH + f'{plot_config["region"]}_m{plot_config["model"]}_f{plot_config["float"]}_depth' + '.png')

# Show Plot
plt.show()