0 Baseline Control
import sys
import numpy as np
sys.path.insert(0, '/home/emmanuel/projects/2020_ml_ocn/ml4ocean/src')
from data.make_dataset import DataLoad
Helper Functions¶
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def get_ml_stats(ypred, ytest, stat='r2', output='avg'):
    """Compute one regression metric between predictions and targets.

    Parameters
    ----------
    ypred : array-like
        Predicted values.
    ytest : array-like
        Ground-truth values.
    stat : str
        Metric to compute: 'r2', 'mse', 'mae' or 'rmse'.
    output : str
        'avg' for a uniform average over outputs, 'raw' for one value
        per output column, 'var' for a variance-weighted average.

    Returns
    -------
    float or ndarray
        Scalar score, or an array of per-output scores when output='raw'.

    Raises
    ------
    ValueError
        If `stat` or `output` is not one of the recognized options.
    """
    # translate the short flag into sklearn's multioutput keyword
    multioutput_options = {
        'avg': 'uniform_average',
        'raw': 'raw_values',
        'var': 'variance_weighted',
    }
    if output not in multioutput_options:
        raise ValueError(f"Unrecognized output param: {output}")
    multioutput = multioutput_options[output]

    # rmse reuses the mse metric and takes a square root afterwards
    metric_fns = {
        'r2': r2_score,
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
        'rmse': mean_squared_error,
    }
    if stat not in metric_fns:
        raise ValueError(f"Unrecognized stat param: {stat}")

    res = metric_fns[stat](ytest, ypred, multioutput=multioutput)
    if stat == 'rmse':
        res = np.sqrt(res)
    return res
def print_stats(ypred, ytest, transformer=None):
    """Print R2, MSE, MAE and RMSE averaged over all outputs.

    If a fitted `transformer` is supplied, both arrays are mapped back
    to the original target space via inverse_transform before scoring.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    for stat_name in ('r2', 'mse', 'mae', 'rmse'):
        score = get_ml_stats(ypred, ytest, stat=stat_name, output='avg')
        print(f"{stat_name.upper()}: {score}")
import matplotlib.pyplot as plt
# apply ggplot styling to every figure created below
plt.style.use('ggplot')
def plot_stat_levels(ypred, ytest, stat='r2', transformer=None):
    """Plot the requested metric computed separately for each output.

    One value per output column (output='raw'), so the x-axis is the
    output index. If `transformer` is given, both arrays are first
    mapped back with inverse_transform.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    per_output = get_ml_stats(ypred, ytest, stat=stat, output='raw')
    fig, ax = plt.subplots()
    ax.plot(per_output)
    plt.show()
# load the control dataset; 'na' presumably selects the North Atlantic
# region — TODO confirm against DataLoad.load_control_data
dataloader = DataLoad()
X, y = dataloader.load_control_data('na')
# keep only the core input variables
X = X[dataloader.core_vars]
# drop metadata columns and log-transform the targets
y = np.log(y.drop(dataloader.meta_vars, axis=1))
Train Testing Split¶
from sklearn.model_selection import train_test_split
# 80/20 train-test split; fixed seed for reproducibility
xtrain, xtest, ytrain, ytest = train_test_split(
    X.values, y.values, train_size=0.8, random_state=123
)
Normalization¶
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# normalize X: zero mean / unit variance, fit on the training set only
x_scaler = StandardScaler()
xtrain_norm = x_scaler.fit_transform(xtrain)
xtest_norm = x_scaler.transform(xtest)
# decompose Y
# NOTE(review): n_components / random_state are unused while the PCA
# line stays commented out; y is currently only mean-centered
# (with_std=False), not scaled or decomposed.
n_components = 10
random_state = 123
# y_scaler = PCA(n_components=n_components, random_state=random_state)
y_scaler = StandardScaler(with_std=False)
ytrain_norm = y_scaler.fit_transform(ytrain)
ytest_norm = y_scaler.transform(ytest)
Baseline ML Models¶
Linear Regression¶
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
# alternative left disabled: wrap the regressor so target
# transformation/inversion is handled automatically
# lin_model = TransformedTargetRegressor(
#     LinearRegression(n_jobs=-1),
#     transformer=y_scaler,
#     check_inverse=False,
# )
# multi-output linear regression on normalized X and centered y
lin_model = LinearRegression(n_jobs=-1)
lin_model.fit(xtrain_norm, ytrain_norm);
# predictions on the test set (still in the centered-target space)
ypred = lin_model.predict(xtest_norm)
Validation Set¶
# Training-set (validation) predictions — in the centered-target space.
yvalid = lin_model.predict(xtrain_norm)
# BUG FIX: the model was fit on ytrain_norm (mean-centered), so
# predictions must be scored against the centered targets; comparing
# against the uncentered ytrain/ytest shifts every error metric.
print_stats(yvalid, ytrain_norm)
plot_stat_levels(yvalid, ytrain_norm, stat='mae')
plot_stat_levels(yvalid, ytrain_norm, stat='mse')
plot_stat_levels(yvalid, ytrain_norm, stat='rmse')
plot_stat_levels(yvalid, ytrain_norm, stat='r2')
# test-set scores, per output level
plot_stat_levels(ypred, ytest_norm, stat='mae')
plot_stat_levels(ypred, ytest_norm, stat='r2')
Random Forest Regression¶
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
# Random forest with mostly-default hyperparameters (1000 trees,
# depth capped at 100, 4 features per split).
# NOTE(review): criterion='mse' was renamed to 'squared_error' and
# min_impurity_split was removed in scikit-learn >= 1.0/1.2 — confirm
# the pinned sklearn version before upgrading.
rf_clf = RandomForestRegressor(
    bootstrap=True, criterion='mse', max_depth=100,
    max_features=4, max_leaf_nodes=None,
    min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=1, min_samples_split=4,
    min_weight_fraction_leaf=0.0, n_estimators=1000,
    n_jobs=-1, oob_score=False, random_state=None,
    verbose=0, warm_start=False
)
# NOTE: fit against the raw (uncentered) ytrain, unlike the linear
# model above which used ytrain_norm — evaluation below is consistent
# with that choice.
rf_clf.fit(xtrain_norm, ytrain)
Validation Set¶
# training-set (validation) predictions; compared against raw ytrain,
# matching the space the forest was fit in
yvalid = rf_clf.predict(xtrain_norm)
print_stats(yvalid, ytrain)
plot_stat_levels(yvalid, ytrain, stat='mae')
plot_stat_levels(yvalid, ytrain, stat='mse')
plot_stat_levels(yvalid, ytrain, stat='rmse')
plot_stat_levels(yvalid, ytrain, stat='r2')
Test Set¶
# held-out test-set predictions and per-level metric plots
ypred = rf_clf.predict(xtest_norm)
print_stats(ypred, ytest)
plot_stat_levels(ypred, ytest, stat='mae')
plot_stat_levels(ypred, ytest, stat='mse')
plot_stat_levels(ypred, ytest, stat='rmse')
plot_stat_levels(ypred, ytest, stat='r2')
Cross Validated¶
# -- hyperparameter search space for the randomized search --
# number of trees: 200..2000 in steps of 200
n_estimators = [int(x) for x in np.linspace(200, 2_000, 10)]
# features per split: random integer drawn from [1, 8)
max_features = sp_randint(1, 8)
# tree depth: 10..110 in steps of 10
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
min_samples_split = sp_randint(2, 11)
min_samples_leaf = sp_randint(1, 4)
bootstrap = [True, False]
# BUG FIX: removed the dead `criterion` list — 'gini' and 'entropy'
# are classification criteria and invalid for RandomForestRegressor,
# and the variable was never added to param_grid anyway.
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
# fresh template estimator; hyperparameters are supplied by param_grid
rf_clf = RandomForestRegressor()
# grid search params
n_iter = 1_000        # number of random configurations sampled
cv = 3                # 3-fold cross-validation per configuration
verbose = 2
random_state = 123    # fixed seed so the sampled configs are reproducible
n_jobs = -1           # use all cores
# randomized hyperparameter search over param_grid
rf_rand = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=n_iter,
    cv=cv,
    verbose=verbose,
    random_state=random_state,
    n_jobs=n_jobs
)
# fit the search on normalized inputs / centered targets
rf_rand.fit(xtrain_norm, ytrain_norm);
# inspect the winning configuration (duplicate display line removed)
rf_rand.best_estimator_
# BUG FIX: predict with the fitted search (delegates to the refit best
# estimator), not the bare `rf_clf` template — the template was never
# fitted, so `rf_clf.predict(...)` would raise NotFittedError and was
# clobbering the valid prediction from the line above.
ypred = rf_rand.predict(xtest_norm)
ypred.shape, ytest_norm.shape
# score in the centered space, then plot R2 in the original space
print_stats(ypred, ytest_norm)
plot_stat_levels(ypred, ytest_norm)
plot_stat_levels(ypred, ytest_norm, stat='r2', transformer=y_scaler)
Statistics¶
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def get_ml_stats(ypred, ytest, stat='r2', output='avg'):
    """Compute one regression metric between predictions and targets.

    Parameters
    ----------
    ypred : array-like
        Predicted values.
    ytest : array-like
        Ground-truth values.
    stat : str
        Metric to compute: 'r2', 'mse', 'mae' or 'rmse'.
    output : str
        'avg' for a uniform average over outputs, 'raw' for one value
        per output column, 'var' for a variance-weighted average.

    Returns
    -------
    float or ndarray
        Scalar score, or an array of per-output scores when output='raw'.

    Raises
    ------
    ValueError
        If `stat` or `output` is not one of the recognized options.
    """
    # each sample or average
    if output == 'avg':
        multioutput = 'uniform_average'
    elif output == 'raw':
        multioutput = 'raw_values'
    elif output == 'var':
        multioutput = 'variance_weighted'
    else:
        raise ValueError(f"Unrecognized output param: {output}")
    # BUG FIX: sklearn metrics take (y_true, y_pred); this copy passed
    # (ypred, ytest), flipping the roles. R2 and variance weighting are
    # asymmetric, so the reported scores were wrong. Now consistent
    # with the first definition of this helper.
    if stat == 'r2':
        res = r2_score(ytest, ypred, multioutput=multioutput)
    elif stat == 'mse':
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'mae':
        res = mean_absolute_error(ytest, ypred, multioutput=multioutput)
    elif stat == 'rmse':
        # rmse = sqrt(mse); computed per-output when output='raw'
        res = mean_squared_error(ytest, ypred, multioutput=multioutput)
        res = np.sqrt(res)
    else:
        raise ValueError(f"Unrecognized stat param: {stat}")
    return res
def print_stats(ypred, ytest, transformer=None):
    """Print R2, MSE, MAE and RMSE averaged over all outputs.

    When a fitted `transformer` is given, predictions and targets are
    first mapped back to the original space via inverse_transform.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    for stat_name in ('r2', 'mse', 'mae', 'rmse'):
        score = get_ml_stats(ypred, ytest, stat=stat_name, output='avg')
        print(f"{stat_name.upper()}: {score}")
import matplotlib.pyplot as plt
# apply ggplot styling to every figure created below
plt.style.use('ggplot')
def plot_stat_levels(ypred, ytest, stat='r2', transformer=None):
    """Plot the requested metric computed separately for each output.

    Uses output='raw' so one value per output column is drawn against
    the output index. If `transformer` is given, both arrays are first
    mapped back with inverse_transform.
    """
    if transformer is not None:
        ypred = transformer.inverse_transform(ypred)
        ytest = transformer.inverse_transform(ytest)
    per_output = get_ml_stats(ypred, ytest, stat=stat, output='raw')
    fig, ax = plt.subplots()
    ax.plot(per_output)
    plt.show()