Clustering Tutorial¶

In this notebook, we will look at how we can use clustering in order to see which ocean profiles we get from our outputs.

# standard libraries
import pandas as pd
import numpy as np
import xarray as xr

# ml libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import time as time
import statsmodels.api as sm

# plotting libraries
import seaborn as sns
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
plt.style.use('ggplot')

def plot_depth_errors(r2, mae, mse, rmse):
    # Plots
    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

    # R2 Values
    ax[0,0].plot(r2)
    ax[0,0].set_xlabel('Depths (Pressure)')
    ax[0,0].set_ylabel('R2')

    # MAE
    ax[0,1].plot(mae)
    ax[0,1].set_xlabel('Depths (Pressure)')
    ax[0,1].set_ylabel('MAE')

    # MSE
    ax[1,0].plot(mse)
    ax[1,0].set_xlabel('Depths (Pressure)')
    ax[1,0].set_ylabel('MSE')

    # RMSE
    ax[1,1].plot(rmse)
    ax[1,1].set_xlabel('Depths (Pressure)')
    ax[1,1].set_ylabel('RMSE')


    plt.tight_layout()
    plt.show()

Data¶

Inputs¶

The standard Inputs

data_path = '/media/disk/erc/papers/2019_ML_OCN/data/ana_temp_data_NA/'


X1 = pd.read_csv(f"{data_path}X_INPUT_NA.csv")

X_meta_cols = ['wmo', 'n_cycle', 'doy',]
X_index = X1[['lat', 'lon']]
X_meta = X1[X_meta_cols]
X1 = X1.drop(X_meta_cols, axis=1)


X_sal = pd.read_csv(f"{data_path}MATRIX_PSAL_NA.txt", sep=' ', header=None)
X_temp = pd.read_csv(f"{data_path}MATRIX_TEMP_NA.txt", sep=' ', header=None)
X_dens = pd.read_csv(f"{data_path}MATRIX_DENS_NA.txt", sep=' ', header=None)
X_spi = pd.read_csv(f"{data_path}MATRIX_SPICINESS_NA.txt", sep=' ', header=None)

X_sal.shape, X_temp.shape, X_dens.shape, X_spi.shape

# perform PCA on each of the datasets
n_components = 5
random_state = 123
pca_models = list()
X2 = list()

for iX in [X_sal, X_temp, X_dens, X_spi]:

    # initiate PCA
    pca_clf = PCA(n_components=n_components, random_state=random_state)

    # perform PCA
    pca_clf.fit(iX)

    # transform data
    X2.append(pca_clf.transform(iX))

    # save pca_model
    pca_models.append(pca_clf)


X2 = np.hstack(X2)

X = np.hstack([X1, X2])

Ouputs¶

The BBP Outputs

y = pd.read_csv(f"{data_path}BBP_OUTPUT_NA.csv")
y_meta_cols = ['wmo', 'n_cycle']
y_meta = y[y_meta_cols]
y = y.drop(y_meta_cols, axis=1)

ylog = np.log(y)

PCA Reduction¶

n_components = 100
random_state = 123

clf_pca = PCA(n_components=n_components, random_state=random_state)

ylog = clf_pca.fit_transform(ylog)

Standardize¶

# Normalize Inputs
x_normalizer = StandardScaler(with_mean=True, with_std=True)
x_norm = x_normalizer.fit_transform(X)

# Normalize Outputs
y_normalizer = MinMaxScaler()
y_norm = y_normalizer.fit_transform(ylog)

Clustering - Outputs¶

We're going to assume 3 clusters for 3 distinct profiles.

n_clusters   = 4
random_state = 123
n_jobs       = -1
max_iter     = 1000

# initialize KMeans model
k_model = KMeans(
    n_clusters=n_clusters, 
    n_jobs=n_jobs, 
    random_state=random_state,
    max_iter=max_iter
)

# fit_predict
y_pred = k_model.fit_predict(y_norm)

k_model.cluster_centers_.shape, k_model.labels_.shape

((4, 100), (3113,))

Plot Clusters¶

# cluster I
y_viz = x_norm[k_model.labels_ == 0]

fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()

# cluster II
y_viz = x_norm[k_model.labels_ == 1]

fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()

# cluster III
y_viz = x_norm[k_model.labels_ == 2]

fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()

# cluster III
y_viz = x_norm[k_model.labels_ == 3]

fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()

Map Clusters¶

X_index['clusters'] = y_pred

cluster_df = X_index.set_index(['lat', 'lon'])

cluster_xr = cluster_df.to_xarray()

# custom colormap
cmap = plt.cm.colors.LinearSegmentedColormap.from_list("", ["red","green","blue", ""], 3)

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
cluster_xr.clusters.plot(ax=ax, cmap=cmap, cbar_kwargs={'label': 'Clusters'})
ax.coastlines()
plt.show()

Clustering - Inputs¶

n_clusters   = 3
random_state = 123
n_jobs       = -1
max_iter     = 1000

# initialize KMeans model
k_model = KMeans(
    n_clusters=n_clusters, 
    n_jobs=n_jobs, 
    random_state=random_state,
    max_iter=max_iter
)

# fit_predict
X_clus = k_model.fit_predict(x_norm)

k_model.cluster_centers_.shape, k_model.labels_.shape

((3, 30), (3113,))

X_index['clusters'] = X_clus

cluster_df = X_index.set_index(['lat', 'lon'])

cluster_xr = cluster_df.to_xarray()

# custom colormap
cmap = plt.cm.colors.LinearSegmentedColormap.from_list("", ["red","green","blue"], 3)

fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
cluster_xr.clusters.plot(ax=ax, cmap=cmap, cbar_kwargs={'label': 'Clusters'})
ax.coastlines()
plt.show()

Visualize Clusters¶

# cluster 1
y_viz = x_norm[k_model.labels_ == 0]

fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()

(1861, 30)

Training w/ Clusters¶

# get cluster centers

X_1 = x_norm[k_model.labels_ == 0]
y_1 = y_norm[k_model.labels_ == 0]

print(X.shape, y.shape, X_1.shape, y_1.shape)

# training and testing split
train_size = 0.7
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(X_1, y_1, train_size=train_size, random_state=random_state)

(3113, 30) (3113, 276) (1861, 30) (1861, 276)

ML - Random Forest Regressor¶

from sklearn.ensemble import RandomForestRegressor


rf_model = RandomForestRegressor(
    n_estimators=1000, 
    criterion='mse',
    n_jobs=-1,
    random_state=123,
    warm_start=True,
    verbose=0
)


t0 = time.time()
rf_model.fit(xtrain, ytrain)
t1 = time.time() - t0

print(
    f"Training Time: {t1:.3f} seconds"
)

Training Time: 12.124 seconds

# Predictions
t0 = time.time()
ypred = rf_model.predict(xtest)
t1 = time.time() - t0

# Get Stats
mae = mean_absolute_error(ypred, ytest, multioutput='uniform_average')
mse = mean_squared_error(ypred, ytest, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ypred, ytest , multioutput='uniform_average')
print(
    f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}" 
    f" \nTime: {t1:.3} seconds"
)
print(r2)


fig, ax = plt.subplots()
#check this
plt.scatter(ypred, ytest,alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2,2)
plt.show()

MAE: 0.141
MSE: 0.085
RMSE: 0.291
R2: 0.270 
Time: 1.11 seconds
0.2697163881583298

# Get Stats
mae_raw = mean_absolute_error(ypred, ytest, multioutput='raw_values')
mse_raw = mean_squared_error(ypred, ytest, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ypred, ytest,multioutput='raw_values')

plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)