Cluster-Based Learning¶
In this notebook, I look at how we can use clustering methods to help us do regression. We will be looking at three approaches (a minimal sketch of the shared pattern follows the list):
1. Clustering the inputs and training a model per cluster
2. Clustering the outputs and training a model per cluster
3. Binning the output dimensions and training a model per bin
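As a rough mental model, all three variants follow the same partition-then-regress pattern. Below is a minimal, illustrative sketch of Method I's version; the helper names (fit_per_cluster, predict_per_cluster) are mine and assume NumPy inputs, not part of the code that follows:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

def fit_per_cluster(X, Y, n_clusters=3, seed=123):
    # partition the inputs, then fit one regressor per partition
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
    labels = km.fit_predict(X)
    models = {
        k: RandomForestRegressor(n_estimators=100, random_state=seed).fit(
            X[labels == k], Y[labels == k]
        )
        for k in np.unique(labels)
    }
    return km, models

def predict_per_cluster(km, models, X, n_outputs):
    # route each test point to the regressor of its predicted cluster
    labels = km.predict(X)
    Yhat = np.empty((X.shape[0], n_outputs))
    for k, m in models.items():
        mask = labels == k
        if mask.any():
            Yhat[mask] = m.predict(X[mask])
    return Yhat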
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils import gen_even_slices
import time
import statsmodels.api as sm
import sys
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')
from data.make_dataset import DataLoad
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
Data¶
# data_path = '/media/disk/erc/papers/2019_ML_OCN/data/raph_temp_data_NA/'
# data_path = '/Users/eman/Documents/data/ocean/'
# Import data
dataloader = DataLoad()
X, y = dataloader.load_control_data('na')
X = X[dataloader.core_vars]
y = y.drop(dataloader.meta_vars, axis=1)
# y = np.exp(y)
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.8, random_state=123
)
Standardize Data¶
# Standardize Inputs (per dimension)
x_mean, x_std = xtrain.mean(axis=0), xtrain.std(axis=0)
xtrain_norm = (xtrain - x_mean) / x_std
xtest_norm = (xtest - x_mean) / x_std
# Center Outputs (subtract the training mean only; no variance scaling)
y_mean = ytrain.mean(axis=0)
ytrain_norm = ytrain - y_mean
ytest_norm = ytest - y_mean
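For reference, the StandardScaler imported above can do the same bookkeeping; a sketch of an equivalent cell (with_std=False reproduces the center-only treatment of the outputs, up to the ddof-0 vs ddof-1 standard-deviation convention, and the transforms return NumPy arrays rather than DataFrames):
x_scaler = StandardScaler().fit(xtrain)
xtrain_norm = x_scaler.transform(xtrain)
xtest_norm = x_scaler.transform(xtest)

# center the outputs only, matching the cell above
y_scaler = StandardScaler(with_std=False).fit(ytrain)
ytrain_norm = y_scaler.transform(ytrain)
ytest_norm = y_scaler.transform(ytest)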
Method I - Clustering the Inputs¶
clf = KMeans(init='k-means++', n_clusters=3, n_init=10, verbose=0)
clusters = clf.fit_predict(xtrain_norm)
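The choice of n_clusters=3 is ad hoc. As a quick sanity check (an aside, not part of the original analysis), one can scan k against inertia and silhouette score:
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    km = KMeans(init='k-means++', n_clusters=k, n_init=10, random_state=123).fit(xtrain_norm)
    print(f"k={k}: inertia={km.inertia_:.1f}, "
          f"silhouette={silhouette_score(xtrain_norm, km.labels_):.3f}")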
# Train one model per cluster
for imodel in np.unique(clusters):
    print(f"Cluster: {imodel + 1}")

    # get the subset of the data which resides in this cluster
    ix = xtrain_norm[clusters == imodel]
    iy = ytrain_norm[clusters == imodel]

    # training and testing split within the cluster
    train_size = 0.8
    random_state = 123
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    # Standardize Inputs (per dimension)
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)
    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Center Outputs
    iy_mean = iytrain.mean(axis=0)
    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    n_components = 20
    pca_model = PCA(n_components=n_components)
    iytrain_red = pca_model.fit_transform(iytrain_norm)
    iytest_red = pca_model.transform(iytest_norm)

    # =======================
    # ML Algorithm
    # =======================
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        criterion='squared_error',   # called 'mse' in scikit-learn < 1.0
        n_jobs=-1,
        random_state=123,
        warm_start=False,
        verbose=0
    )

    t0 = time.time()
    rf_model.fit(ixtrain_norm, iytrain_red)
    t1 = time.time() - t0
    print(f"Training Time: {t1:.3f} seconds")

    # Predictions (back-projected through the PCA basis)
    t0 = time.time()
    iypred_red = rf_model.predict(ixtest_norm)
    t1 = time.time() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f"\nPrediction Time: {t1:.3f} seconds"
    )
    print("Done!\n")
Method II - Clustering the Outputs¶
Here we cluster the training outputs rather than the inputs. Note that this only works offline: at prediction time the outputs are unknown, so a new sample cannot be assigned to an output cluster directly (a possible workaround is sketched below).
clf = KMeans(init='k-means++', n_clusters=3, n_init=20, verbose=0)
clusters = clf.fit_predict(ytrain_norm)
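Because these cluster labels are derived from y, they are unavailable for new samples. A common workaround (not used in this notebook) is to fit a classifier from the inputs to the output-cluster labels and use it to route test points:
from sklearn.ensemble import RandomForestClassifier

# hypothetical routing step: learn input -> output-cluster
cluster_clf = RandomForestClassifier(n_estimators=100, random_state=123)
cluster_clf.fit(xtrain_norm, clusters)
test_clusters = cluster_clf.predict(xtest_norm)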
# Train one model per output cluster
for imodel in np.unique(clusters):
    print(f"Cluster: {imodel + 1}")

    # get the subset of the data which resides in this cluster
    ix = xtrain_norm[clusters == imodel]
    iy = ytrain_norm[clusters == imodel]
    print(ix.shape, iy.shape)

    # training and testing split within the cluster
    train_size = 0.8
    random_state = 123
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    # Standardize Inputs (per dimension)
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)
    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Center Outputs
    iy_mean = iytrain.mean(axis=0)
    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    n_components = 20
    pca_model = PCA(n_components=n_components)
    iytrain_red = pca_model.fit_transform(iytrain_norm)
    iytest_red = pca_model.transform(iytest_norm)

    # =======================
    # ML Algorithm
    # =======================
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        criterion='squared_error',   # called 'mse' in scikit-learn < 1.0
        n_jobs=-1,
        random_state=123,
        warm_start=False,
        verbose=0
    )

    t0 = time.time()
    rf_model.fit(ixtrain_norm, iytrain_red)
    t1 = time.time() - t0
    print(f"Training Time: {t1:.3f} seconds")

    # Predictions (back-projected through the PCA basis)
    t0 = time.time()
    iypred_red = rf_model.predict(ixtest_norm)
    t1 = time.time() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f"\nPrediction Time: {t1:.3f} seconds"
    )
    print("Done!\n")
Method III - Binning the Outputs¶
# quick check: shape of a slice of the output columns
ytrain_norm.values[:, :75].shape
# manual depth intervals (not used below; gen_even_slices handles the binning instead)
intervals = [
    (0, 90), (90, 180), (180, 276)
]
# peek at the first of the even slices over the output columns
for i in gen_even_slices(ytrain_norm.shape[1], 10):
    print(i)
    break
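gen_even_slices(n, n_packs) yields n_packs contiguous slice objects that together cover range(n); listing them shows the exact depth bins used below:
list(gen_even_slices(ytrain_norm.shape[1], 10))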
# Train one model per depth bin
for idx in gen_even_slices(ytrain_norm.shape[1], 10):
    print(f"Bin: {idx}")

    # all samples are used; each model only predicts this bin's output columns
    ix = xtrain_norm
    iy = ytrain_norm.values[:, idx]
    print(ix.shape, iy.shape)

    # training and testing split
    train_size = 0.8
    random_state = 123
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    # Standardize Inputs (per dimension)
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)
    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Center Outputs
    iy_mean = iytrain.mean(axis=0)
    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    n_components = 20
    pca_model = PCA(n_components=n_components)
    iytrain_red = pca_model.fit_transform(iytrain_norm)
    iytest_red = pca_model.transform(iytest_norm)

    # =======================
    # ML Algorithm
    # =======================
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        criterion='squared_error',   # called 'mse' in scikit-learn < 1.0
        n_jobs=-1,
        random_state=123,
        warm_start=False,
        verbose=0
    )

    t0 = time.time()
    rf_model.fit(ixtrain_norm, iytrain_red)
    t1 = time.time() - t0
    print(f"Training Time: {t1:.3f} seconds")

    # Predictions (back-projected through the PCA basis)
    t0 = time.time()
    iypred_red = rf_model.predict(ixtest_norm)
    t1 = time.time() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f"\nPrediction Time: {t1:.3f} seconds"
    )
    print("Done!\n")