Module src.models.baseline

Expand source code
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (WhiteKernel, ConstantKernel, RBF, Matern, ExpSineSquared, RationalQuadratic)
from sklearn.linear_model import MultiTaskElasticNetCV, LinearRegression, RidgeCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.base import BaseEstimator
from typing import Optional, Dict, Union, Tuple
from sklearn.ensemble import StackingRegressor
import numpy as np
import pandas as pd
import time

import warnings

warnings.filterwarnings("ignore")


def train_stack_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
    order: Tuple[str, str] = ("rf", "lr"),
    lr_params: Optional[Dict] = None,
    rf_params: Optional[Dict] = None,
) -> BaseEstimator:
    """Train a stacked RF/linear model, fit independently per output.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages

    n_jobs : int, default=1
        number of parallel jobs for the random forest

    order : Tuple[str, str], default=("rf", "lr")
        ("rf", "lr") uses the random forest as base estimator with a linear
        final estimator; ("lr", "rf") is the reverse

    lr_params : Dict, optional
        keyword arguments forwarded to LinearRegression

    rf_params : Dict, optional
        keyword arguments overriding the RandomForestRegressor defaults

    Returns
    -------
    mo_regressor : BaseEstimator
        the trained multi-output stacking model
    """
    # RF defaults; "mse" was renamed "squared_error" in sklearn 1.0 and the
    # old alias was removed in 1.2, so use the current name.
    rf_defaults = dict(
        n_estimators=1_000,
        criterion="squared_error",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )
    # caller-supplied params take precedence over the defaults
    if rf_params:
        rf_defaults.update(rf_params)
    rf_estimator = RandomForestRegressor(**rf_defaults)
    lr_estimator = LinearRegression(**(lr_params or {}))

    # assemble the stack in the requested order
    if order == ("rf", "lr"):
        stacking_regressor = StackingRegressor(
            estimators=[("Random Forest", rf_estimator)], final_estimator=lr_estimator
        )
    elif order == ("lr", "rf"):
        stacking_regressor = StackingRegressor(
            estimators=[("Linear Regression", lr_estimator)],
            final_estimator=rf_estimator,
        )
    else:
        raise ValueError(f"Unrecognized order: {order}; expected ('rf', 'lr') or ('lr', 'rf').")

    # one independent stack per output column
    mo_regressor = MultiOutputRegressor(stacking_regressor, n_jobs=1)
    # train and time the fit
    t0 = time.time()
    mo_regressor.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_regressor


def train_lr_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
) -> BaseEstimator:
    """Train an ordinary least-squares linear regression model.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages

    n_jobs : int, default=1
        number of parallel jobs used by LinearRegression

    Returns
    -------
    model : BaseEstimator
        the trained model
    """
    model = LinearRegression(n_jobs=n_jobs)

    # fit and time the training run
    start = time.time()
    model.fit(xtrain, ytrain)
    elapsed = time.time() - start
    if verbose > 0:
        print(f"Training time: {elapsed:.3f} secs.")
    return model


def train_gp_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    params: Dict,
) -> BaseEstimator:
    """Train a Gaussian Process regressor with a composite kernel.

    The kernel is a sum of scaled Matern(2.5), RationalQuadratic and RBF
    components plus a WhiteKernel noise term, with one length scale per
    input feature for the anisotropic components.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    params : Dict
        keyword arguments forwarded to GaussianProcessRegressor; an optional
        "verbose" entry (int, default 0) controls the timing printout and is
        removed before construction

    Returns
    -------
    gp_model : BaseEstimator
        the trained model
    """
    # copy so the caller's dict is not mutated, and extract "verbose":
    # GaussianProcessRegressor does not accept a verbose kwarg, so leaving
    # it in **params would raise a TypeError.
    params = dict(params)
    verbose = params.pop("verbose", 0)

    # define kernel function (one length scale per input dimension)
    init_length_scale = np.ones(xtrain.shape[1])
    kernel = (
        ConstantKernel() * Matern(nu=2.5, length_scale=init_length_scale)
        + ConstantKernel() * RationalQuadratic(alpha=10, length_scale=1.0)
        + ConstantKernel() * RBF(length_scale=init_length_scale)
        + WhiteKernel(noise_level=0.01)
    )

    # define GP model
    gp_model = GaussianProcessRegressor(
        kernel=kernel,
        **params
    )

    # train GP Model
    t0 = time.time()
    gp_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gp_model


def train_ridge_lr_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
) -> BaseEstimator:
    """Train a ridge regression model with built-in cross-validated alpha.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages

    n_jobs : int, default=1
        accepted for signature parity with the other trainers but unused
        (RidgeCV has no n_jobs parameter)

    Returns
    -------
    model : BaseEstimator
        the trained model
    """
    model = RidgeCV()

    # fit and time the training run
    start = time.time()
    model.fit(xtrain, ytrain)
    elapsed = time.time() - start
    if verbose > 0:
        print(f"Training time: {elapsed:.3f} secs.")
    return model


def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    Uses MultiTaskElasticNetCV with 3-fold cross-validation and random
    coordinate selection.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    verbose : int, default=0
        option to print out training messages 

    Returns 
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM.  NOTE: the previous `normalize=False` kwarg was
    # deprecated in sklearn 1.0 and removed in 1.2; omitting it is the
    # drop-in equivalent (no normalization).
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        selection="random",
        verbose=verbose,
    )

    # train GLM and time the fit
    t0 = time.time()
    gl_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model


def train_mlp_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    params: Dict,
) -> BaseEstimator:
    """Train a multi-layer perceptron regressor.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    params : Dict
        keyword arguments forwarded to MLPRegressor; the optional "verbose"
        entry (int, default 0) also controls the timing printout

    Returns
    -------
    mlp_model : BaseEstimator
        the trained model
    """
    # Initialize MLP
    mlp_model = MLPRegressor(
        **params
    )

    # train MLP and time the fit
    t0 = time.time()
    mlp_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    # .get avoids a KeyError when the caller did not supply "verbose"
    if params.get("verbose", 0) > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mlp_model


def train_rf_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    params: Dict,
) -> BaseEstimator:
    """Train a basic Random Forest (RF) Regressor 

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    params : Dict
        keyword arguments forwarded to RandomForestRegressor; the optional
        "verbose" entry (int, default 0) also controls the timing printout

    Returns 
    -------
    rf_model : BaseEstimator
        the trained model
    """
    # initialize baseline RF model
    rf_model = RandomForestRegressor(
        **params
    )
    # train RF model and time the fit
    t0 = time.time()
    rf_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    # .get avoids a KeyError when the caller did not supply "verbose"
    if params.get("verbose", 0) > 0:
        print(f"Training time: {t1:.3f} secs.")
    return rf_model


def train_mo_rf_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 8,
    mo_jobs: int = 8,
) -> BaseEstimator:
    """Train one Random Forest regressor per output via MultiOutputRegressor.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages

    n_jobs : int, default=8
        parallel jobs within each random forest

    mo_jobs : int, default=8
        parallel jobs across the per-output regressors

    Returns
    -------
    mo_model : BaseEstimator
        the trained multi-output model
    """
    # initialize baseline RF model; "mae" was renamed "absolute_error" in
    # sklearn 1.0 and the old alias was removed in 1.2.
    rf_model = RandomForestRegressor(
        n_estimators=1_000,
        criterion="absolute_error",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )

    # initialize multioutput regressor (one RF clone per output column)
    mo_model = MultiOutputRegressor(estimator=rf_model, n_jobs=mo_jobs)
    # train RF model and time the fit
    t0 = time.time()
    mo_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_model


def train_mo_gbt_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 8,
    mo_jobs: int = 8,
) -> BaseEstimator:
    """Train one Gradient Boosted Tree regressor per output.

    NOTE(review): the original body instantiated a RandomForestRegressor
    despite the function name and the (previously unused) file-level import
    of GradientBoostingRegressor; this now uses gradient boosting as the
    name advertises.

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame
             (n_samples x d_features)
             input training data

    ytrain : np.ndarray, pd.DataFrame
             (n_samples x p_outputs)
             labeled training data

    verbose : int, default=0
        option to print out training messages

    n_jobs : int, default=8
        accepted for signature parity with the other trainers but unused
        (GradientBoostingRegressor has no n_jobs parameter)

    mo_jobs : int, default=8
        parallel jobs across the per-output regressors

    Returns
    -------
    mo_model : BaseEstimator
        the trained multi-output model
    """
    # initialize baseline GBT model
    gbt_model = GradientBoostingRegressor(
        n_estimators=100,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )

    # initialize multioutput regressor (one GBT clone per output column)
    mo_model = MultiOutputRegressor(estimator=gbt_model, n_jobs=mo_jobs)
    # train GBT model and time the fit
    t0 = time.time()
    mo_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_model

Functions

def train_glm_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0) -> sklearn.base.BaseEstimator

Train a basic Generalized Linear Model (GLM)

Parameters

xtrain : np.ndarray, pd.DataFrame
(n_samples x d_features) input training data
ytrain : np.ndarray, pd.DataFrame
(n_samples x p_outputs) labeled training data
verbose : int, default=0
option to print out training messages

Returns

gl_model : BaseEstimator the trained model

Expand source code
def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    verbose : int, default=0
        option to print out training messages 

    Returns 
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        normalize=False,
        selection="random",
        verbose=verbose,
    )

    # train GLM
    t0 = time.time()
    gl_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model
def train_gp_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], params) -> sklearn.base.BaseEstimator
Expand source code
def train_gp_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    params,
) -> BaseEstimator:

    # define kernel function
    init_length_scale = np.ones(xtrain.shape[1])
    kernel = (
        ConstantKernel() * Matern(nu=2.5, length_scale=init_length_scale)
        + ConstantKernel() * RationalQuadratic(alpha=10, length_scale=1.0)
        + ConstantKernel() * RBF(length_scale=init_length_scale)
        + WhiteKernel(noise_level=0.01)
    )

    # define GP model
    gp_model = GaussianProcessRegressor(
        kernel=kernel,
        **params
    )

    # train GP Model
    t0 = time.time()
    gp_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if params['verbose'] > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gp_model
def train_lr_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0, n_jobs: int = 1) -> sklearn.base.BaseEstimator
Expand source code
def train_lr_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
) -> BaseEstimator:
    # Initialize GLM
    lr_model = LinearRegression(n_jobs=n_jobs)

    # train GLM
    t0 = time.time()
    lr_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return lr_model
def train_mlp_model(xtrain, ytrain, params)
Expand source code
def train_mlp_model(xtrain, ytrain, params):

    # Initialize MLP
    mlp_model = MLPRegressor(
        **params
    )

    # train GLM
    t0 = time.time()
    mlp_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if params['verbose'] > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mlp_model
def train_mo_gbt_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0, n_jobs: int = 8, mo_jobs: int = 8) -> sklearn.base.BaseEstimator
Expand source code
def train_mo_gbt_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 8,
    mo_jobs: int = 8,
) -> BaseEstimator:
    # initialize baseline RF model
    rf_model = RandomForestRegressor(
        n_estimators=100,
        criterion="mae",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )

    # initialize multioutput regressor
    mo_model = MultiOutputRegressor(estimator=rf_model, n_jobs=mo_jobs)
    # train RF model
    t0 = time.time()
    mo_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_model
def train_mo_rf_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0, n_jobs: int = 8, mo_jobs: int = 8) -> sklearn.base.BaseEstimator
Expand source code
def train_mo_rf_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 8,
    mo_jobs: int = 8,
) -> BaseEstimator:
    # initialize baseline RF model
    rf_model = RandomForestRegressor(
        n_estimators=1_000,
        criterion="mae",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )

    # initialize multioutput regressor
    mo_model = MultiOutputRegressor(estimator=rf_model, n_jobs=mo_jobs)
    # train RF model
    t0 = time.time()
    mo_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_model
def train_rf_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], params) -> sklearn.base.BaseEstimator

Train a basic Random Forest (RF) Regressor

Parameters

xtrain : np.ndarray, pd.DataFrame
(n_samples x d_features) input training data
ytrain : np.ndarray, pd.DataFrame
(n_samples x p_outputs) labeled training data
verbose : int, default=0
option to print out training messages

Returns

rf_model : BaseEstimator the trained model

Expand source code
def train_rf_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    params
) -> BaseEstimator:
    """Train a basic Random Forest (RF) Regressor 

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    verbose : int, default=0
        option to print out training messages 

    Returns 
    -------
    rf_model : BaseEstimator
        the trained model
    """
    # initialize baseline RF model
    rf_model = RandomForestRegressor(
        **params
    )
    # train RF model
    t0 = time.time()
    rf_model.fit(xtrain, ytrain)
    t1 = time.time() - t0

    if params['verbose'] > 0:
        print(f"Training time: {t1:.3f} secs.")
    return rf_model
def train_ridge_lr_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0, n_jobs: int = 1) -> sklearn.base.BaseEstimator
Expand source code
def train_ridge_lr_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
) -> BaseEstimator:
    # Initialize GLM
    lr_model = RidgeCV()

    # train GLM
    t0 = time.time()
    lr_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return lr_model
def train_stack_model(xtrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], ytrain: Union[numpy.ndarray, pandas.core.frame.DataFrame], verbose: int = 0, n_jobs: int = 1, order: Tuple[str, str] = ('rf', 'lr'), lr_params: Union[Dict, NoneType] = None, rf_params: Union[Dict, NoneType] = None) -> sklearn.base.BaseEstimator
Expand source code
def train_stack_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
    n_jobs: int = 1,
    order: Tuple[str, str] = ("rf", "lr"),
    lr_params: Optional[Dict]=None,
    rf_params: Optional[Dict]=None
) -> BaseEstimator:

    rf_estimator = RandomForestRegressor(
        n_estimators=1_000,
        criterion="mse",
        n_jobs=n_jobs,
        random_state=123,
        warm_start=False,
        verbose=verbose,
    )
    lr_estimator = LinearRegression()

    # Initialize GLM
    if order == ("rf", "lr"):
        stacking_regressor = StackingRegressor(
            estimators=[("Random Forest", rf_estimator)], final_estimator=lr_estimator
        )
    elif order == ("lr", "rf"):
        stacking_regressor = StackingRegressor(
            estimators=[("Linear Regression", lr_estimator)],
            final_estimator=rf_estimator,
        )
    else:
        raise ValueError()

    mo_regressor = MultiOutputRegressor(stacking_regressor, n_jobs=1)
    # train GLM
    t0 = time.time()
    mo_regressor.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return mo_regressor