Clustering Tutorial¶
In this notebook, we will look at how we can use clustering in order to see which ocean profiles we get from our outputs.
# standard libraries
import pandas as pd
import numpy as np
import xarray as xr
# ml libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import time as time
import statsmodels.api as sm
# plotting libraries
import seaborn as sns
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
plt.style.use('ggplot')
def plot_depth_errors(r2, mae, mse, rmse):
# Plots
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))
# R2 Values
ax[0,0].plot(r2)
ax[0,0].set_xlabel('Depths (Pressure)')
ax[0,0].set_ylabel('R2')
# MAE
ax[0,1].plot(mae)
ax[0,1].set_xlabel('Depths (Pressure)')
ax[0,1].set_ylabel('MAE')
# MSE
ax[1,0].plot(mse)
ax[1,0].set_xlabel('Depths (Pressure)')
ax[1,0].set_ylabel('MSE')
# RMSE
ax[1,1].plot(rmse)
ax[1,1].set_xlabel('Depths (Pressure)')
ax[1,1].set_ylabel('RMSE')
plt.tight_layout()
plt.show()
Data¶
Inputs¶
The standard Inputs
data_path = '/media/disk/erc/papers/2019_ML_OCN/data/ana_temp_data_NA/'
X1 = pd.read_csv(f"{data_path}X_INPUT_NA.csv")
X_meta_cols = ['wmo', 'n_cycle', 'doy',]
X_index = X1[['lat', 'lon']]
X_meta = X1[X_meta_cols]
X1 = X1.drop(X_meta_cols, axis=1)
X_sal = pd.read_csv(f"{data_path}MATRIX_PSAL_NA.txt", sep=' ', header=None)
X_temp = pd.read_csv(f"{data_path}MATRIX_TEMP_NA.txt", sep=' ', header=None)
X_dens = pd.read_csv(f"{data_path}MATRIX_DENS_NA.txt", sep=' ', header=None)
X_spi = pd.read_csv(f"{data_path}MATRIX_SPICINESS_NA.txt", sep=' ', header=None)
X_sal.shape, X_temp.shape, X_dens.shape, X_spi.shape
# perform PCA on each of the datasets
n_components = 5
random_state = 123
pca_models = list()
X2 = list()
for iX in [X_sal, X_temp, X_dens, X_spi]:
# initiate PCA
pca_clf = PCA(n_components=n_components, random_state=random_state)
# perform PCA
pca_clf.fit(iX)
# transform data
X2.append(pca_clf.transform(iX))
# save pca_model
pca_models.append(pca_clf)
X2 = np.hstack(X2)
X = np.hstack([X1, X2])
Ouputs¶
The BBP Outputs
y = pd.read_csv(f"{data_path}BBP_OUTPUT_NA.csv")
y_meta_cols = ['wmo', 'n_cycle']
y_meta = y[y_meta_cols]
y = y.drop(y_meta_cols, axis=1)
ylog = np.log(y)
PCA Reduction¶
n_components = 100
random_state = 123
clf_pca = PCA(n_components=n_components, random_state=random_state)
ylog = clf_pca.fit_transform(ylog)
Standardize¶
# Normalize Inputs
x_normalizer = StandardScaler(with_mean=True, with_std=True)
x_norm = x_normalizer.fit_transform(X)
# Normalize Outputs
y_normalizer = MinMaxScaler()
y_norm = y_normalizer.fit_transform(ylog)
Clustering - Outputs¶
We're going to assume 3 clusters for 3 distinct profiles.
n_clusters = 4
random_state = 123
n_jobs = -1
max_iter = 1000
# initialize KMeans model
k_model = KMeans(
n_clusters=n_clusters,
n_jobs=n_jobs,
random_state=random_state,
max_iter=max_iter
)
# fit_predict
y_pred = k_model.fit_predict(y_norm)
k_model.cluster_centers_.shape, k_model.labels_.shape
Plot Clusters¶
# cluster I
y_viz = x_norm[k_model.labels_ == 0]
fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()
# cluster II
y_viz = x_norm[k_model.labels_ == 1]
fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()
# cluster III
y_viz = x_norm[k_model.labels_ == 2]
fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()
# cluster III
y_viz = x_norm[k_model.labels_ == 3]
fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()
Map Clusters¶
X_index['clusters'] = y_pred
cluster_df = X_index.set_index(['lat', 'lon'])
cluster_xr = cluster_df.to_xarray()
# custom colormap
cmap = plt.cm.colors.LinearSegmentedColormap.from_list("", ["red","green","blue", ""], 3)
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
cluster_xr.clusters.plot(ax=ax, cmap=cmap, cbar_kwargs={'label': 'Clusters'})
ax.coastlines()
plt.show()
Clustering - Inputs¶
n_clusters = 3
random_state = 123
n_jobs = -1
max_iter = 1000
# initialize KMeans model
k_model = KMeans(
n_clusters=n_clusters,
n_jobs=n_jobs,
random_state=random_state,
max_iter=max_iter
)
# fit_predict
X_clus = k_model.fit_predict(x_norm)
k_model.cluster_centers_.shape, k_model.labels_.shape
X_index['clusters'] = X_clus
cluster_df = X_index.set_index(['lat', 'lon'])
cluster_xr = cluster_df.to_xarray()
# custom colormap
cmap = plt.cm.colors.LinearSegmentedColormap.from_list("", ["red","green","blue"], 3)
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(111, projection=ccrs.PlateCarree())
cluster_xr.clusters.plot(ax=ax, cmap=cmap, cbar_kwargs={'label': 'Clusters'})
ax.coastlines()
plt.show()
Visualize Clusters¶
# cluster 1
y_viz = x_norm[k_model.labels_ == 0]
fig, ax = plt.subplots()
ax.plot(y_viz.T, alpha=0.1, color='gray')
plt.show()
Training w/ Clusters¶
# get cluster centers
X_1 = x_norm[k_model.labels_ == 0]
y_1 = y_norm[k_model.labels_ == 0]
print(X.shape, y.shape, X_1.shape, y_1.shape)
# training and testing split
train_size = 0.7
random_state = 123
xtrain, xtest, ytrain, ytest = train_test_split(X_1, y_1, train_size=train_size, random_state=random_state)
ML - Random Forest Regressor¶
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(
n_estimators=1000,
criterion='mse',
n_jobs=-1,
random_state=123,
warm_start=True,
verbose=0
)
t0 = time.time()
rf_model.fit(xtrain, ytrain)
t1 = time.time() - t0
print(
f"Training Time: {t1:.3f} seconds"
)
# Predictions
t0 = time.time()
ypred = rf_model.predict(xtest)
t1 = time.time() - t0
# Get Stats
mae = mean_absolute_error(ypred, ytest, multioutput='uniform_average')
mse = mean_squared_error(ypred, ytest, multioutput='uniform_average')
rmse = np.sqrt(mse)
r2 = r2_score(ypred, ytest , multioutput='uniform_average')
print(
f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
f" \nTime: {t1:.3} seconds"
)
print(r2)
fig, ax = plt.subplots()
#check this
plt.scatter(ypred, ytest,alpha=0.5, s=0.5)
plt.ylim(-2, 2), plt.xlim(-2,2)
plt.show()
# Get Stats
mae_raw = mean_absolute_error(ypred, ytest, multioutput='raw_values')
mse_raw = mean_squared_error(ypred, ytest, multioutput='raw_values')
rmse_raw = np.sqrt(mse_raw)
r2_raw = r2_score(ypred, ytest,multioutput='raw_values')
plot_depth_errors(r2_raw, mae_raw, mse_raw, rmse_raw)