Module `src.features.build_features`

Expand source code

import numpy as np
from typing import List, Optional
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

# Datasets
from src.data.make_dataset import (
    DataLoader,
    load_standard_data,
    load_high_dim_data,
    load_labels,
)


class ProcessParams:
    """Default preprocessing settings, stored as plain class attributes.

    NOTE(review): functions in this module also read ``params.input_std``,
    ``params.pca_seed`` and ``params.std_ouputs``, none of which are defined
    here - confirm whether this class is meant to be passed to them.
    """

    # Number of principal components; ``run_input_preprocess`` reads
    # ``params.n_components`` to size each PCA.
    n_components = 5
    # Fraction of the training data held out for validation by ``run_split``.
    valid_split = 0.2
    # NOTE(review): ``run_input_preprocess`` branches on ``params.input_std``,
    # not on this attribute - confirm which name is intended.
    standardize = "before"
    # Not read by any function visible in this module.
    seed = 123
    # Passed as ``random_state`` to ``train_test_split`` in ``run_split``.
    bootstrap_seed = 123


class CycleTransform(BaseEstimator, TransformerMixin):
    """Encode periodic time columns as (sin, cos) pairs.

    For each requested time column found in the input:

    1. Expresses the value as a fraction of its cycle length
    2. Converts that fraction to an angle in radians
    3. Applies the sine and cosine transformation
    4. Drops the original column

    Cycle lengths are 365 for 'doy' (days in a year), 12 for 'month' and
    24 for 'hour'. Months are shifted by one so that month 1 maps to
    angle 0; 'doy' and 'hour' are used as given.

    Parameters
    ----------
    time_types : list of str, e.g. ['doy', 'month', 'hour']
        The time columns to convert to a cycle. Names that are not
        columns of the input are skipped.

    Example
    -------
    >> times = ['doy']
    >> X = CycleTransform(times).fit_transform(X)

    >> times = ['doy', 'month']
    >> X = CycleTransform(times).fit_transform(X)
    """

    def __init__(self, time_types: List[str] = ["doy"]):
        # Stored verbatim and never mutated, which is what scikit-learn
        # expects of constructor parameters.
        self.time_types = time_types

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn compatibility."""
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Replace the selected time columns by their sin/cos encoding.

        Parameters
        ----------
        X : pd.DataFrame
            A dataframe with the values. Only the columns 'doy', 'month'
            and 'hour' are recognised.

        y : pd.DataFrame, Optional
            Does nothing. Only for compatibility reasons.

        Returns
        -------
        df : pd.DataFrame
            A new dataframe with the converted values. The input dataframe
            is not modified.
        """
        # (column name, cycle length, offset removed before scaling)
        cycles = (("doy", 365.0, 0), ("month", 12, 1), ("hour", 24.0, 0))
        two_pi = 2 * np.pi

        present = set(X.columns)

        # Work on a copy: assigning the new columns directly on X would leak
        # them into the caller's dataframe.
        out = X.copy()

        for name, period, offset in cycles:
            if name not in self.time_types or name not in present:
                continue

            angle = (out[name] - offset) * two_pi / period
            out[f"{name}_sin"] = np.sin(angle)
            out[f"{name}_cos"] = np.cos(angle)

            # the encoded pair replaces the raw column
            out = out.drop(name, axis=1)

        return out


class GeoCartTransform(BaseEstimator, TransformerMixin):
    """Transforms geo coordinates (lat, lon) to cartesian coordinates
    (x, y, z) on the unit sphere.

    Example
    -------
    >> df = GeoCartTransform().fit_transform(df)
    """

    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Nothing is learned; present for scikit-learn compatibility."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replace the 'lat' and 'lon' columns by 'x', 'y' and 'z'.

        Parameters
        ----------
        X : pd.DataFrame
            A dataframe with the geo coordinates in degrees. It needs to
            have the columns ['lat', 'lon'].

        Returns
        -------
        df : pd.DataFrame
            A new dataframe with the converted values; the input is not
            modified. If 'lat' or 'lon' is missing, a message is printed
            and the input is returned as-is.
        """
        if "lat" not in X.columns or "lon" not in X.columns:
            print("lat,lon columns not present in X.")
            return X

        deg2rad = np.pi / 180.0

        # Convert on local series: scaling X["lat"] / X["lon"] in place would
        # silently turn the caller's coordinates into radians.
        lat = X["lat"] * deg2rad
        lon = X["lon"] * deg2rad

        # drop original columns (returns a new dataframe)
        out = X.drop(["lat", "lon"], axis=1)

        # From Geo coords to cartesian coords
        out["x"] = np.cos(lat) * np.cos(lon)
        out["y"] = np.cos(lat) * np.sin(lon)
        out["z"] = np.sin(lat)

        return out


def get_geodataframe(dataframe: pd.DataFrame) -> gpd.GeoDataFrame:
    """Wrap a pandas.DataFrame in a geopandas.GeoDataFrame.

    One point geometry is built per row from the 'lon' and 'lat' columns
    (in that order) and attached as the geometry column, which makes
    plotting a lot easier.
    """
    # one (lon, lat) point per row
    lon_lat_pairs = zip(dataframe["lon"], dataframe["lat"])
    points = list(map(Point, lon_lat_pairs))

    # coordinate reference system handed to geopandas
    # NOTE(review): the {"init": ...} form may be deprecated in newer
    # pyproj/geopandas releases - confirm before upgrading.
    reference_system = {"init": "epsg:4326"}

    return gpd.GeoDataFrame(dataframe, crs=reference_system, geometry=points)


def geo_2_cartesian(df: pd.DataFrame) -> pd.DataFrame:
    """Transforms geo coordinates (lat, lon) to cartesian coordinates
    (x, y, z) on the unit sphere.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe with the geo coordinates in degrees. It needs to
        have the columns ['lat', 'lon'].

    Returns
    -------
    df : pd.DataFrame
        A new dataframe where 'lat' and 'lon' are replaced by 'x', 'y'
        and 'z'; the input is not modified. If 'lat' or 'lon' is missing,
        a message is printed and the input is returned as-is.

    Example
    -------
    >> df = geo_2_cartesian(df)
    """
    if "lat" not in df.columns or "lon" not in df.columns:
        print("lat,lon columns not present in df.")
        return df

    deg2rad = np.pi / 180.0

    # Convert on local series: scaling df["lat"] / df["lon"] in place would
    # silently turn the caller's coordinates into radians.
    lat = df["lat"] * deg2rad
    lon = df["lon"] * deg2rad

    # drop original columns (returns a new dataframe)
    out = df.drop(["lat", "lon"], axis=1)

    # From Geo coords to cartesian coords
    out["x"] = np.cos(lat) * np.cos(lon)
    out["y"] = np.cos(lat) * np.sin(lon)
    out["z"] = np.sin(lat)

    return out


def times_2_cycles(df: pd.DataFrame, time_types: List[str] = ["doy"]) -> pd.DataFrame:
    """Converts some times to a cyclic axis of x, y using sin and cos.

    For each requested time column found in the dataframe:

    1. Expresses the value as a fraction of its cycle length
    2. Converts that fraction to an angle in radians
    3. Applies the sin and cosine transformation
    4. Drops the original column

    Cycle lengths are 365 for 'doy' (days in a year), 12 for 'month' and
    24 for 'hour'. Months are shifted by one so that month 1 maps to
    angle 0; 'doy' and 'hour' are used as given.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe with the values. Only the columns 'doy', 'month' and
        'hour' are recognised.

    time_types : list of str, e.g. ['doy', 'month', 'hour']
        The time columns to convert to a cycle. Names that are not
        columns of ``df`` are skipped. The default list is only read,
        never modified.

    Returns
    -------
    df : pd.DataFrame
        A new dataframe with the converted values. The input dataframe
        is not modified.

    Example
    -------
    >> times = ['doy']
    >> df = times_2_cycles(df, times)

    >> times = ['doy', 'month']
    >> df = times_2_cycles(df, times)
    """
    # (column name, cycle length, offset removed before scaling)
    cycles = (("doy", 365.0, 0), ("month", 12, 1), ("hour", 24.0, 0))
    two_pi = 2 * np.pi

    present = set(df.columns)

    # Work on a copy: assigning the new columns directly on df would leak
    # them into the caller's dataframe.
    out = df.copy()

    for name, period, offset in cycles:
        if name not in time_types or name not in present:
            continue

        angle = (out[name] - offset) * two_pi / period
        out[f"{name}_sin"] = np.sin(angle)
        out[f"{name}_cos"] = np.cos(angle)

        # the encoded pair replaces the raw column
        out = out.drop(name, axis=1)

    return out


def run_input_preprocess(params, dataset):
    """Fit the input feature transformer on the training set and apply it.

    The transformer is a ``ColumnTransformer`` that, per column group
    returned by ``DataLoader().load_columns()``:

    * encodes the "time" columns with ``CycleTransform``
    * converts the "location" columns with ``GeoCartTransform``
    * reduces "temperature", "density", "salinity" and "spicy" with one
      PCA each
    * standardizes the "core" columns when ``params.input_std == "before"``;
      with ``"after"`` they are passed through unchanged

    Parameters
    ----------
    params : object
        Needs the attributes ``n_components``, ``pca_seed`` and
        ``input_std`` ("before" or "after").

    dataset : dict
        Needs the keys "Xtrain" and "Xtest". It is updated in place.

    Returns
    -------
    dataset : dict
        The same object with "Xtrain" and "Xtest" transformed, plus
        "input_pre_trans" (the fitted transformer) and "new_columns"
        (labels for the transformed columns).

    Raises
    ------
    ValueError
        If ``params.input_std`` is neither "before" nor "after".
    """
    # get columns
    columns = DataLoader().load_columns()

    pca_groups = ("temperature", "density", "salinity", "spicy")

    # Labels follow the order in which the transformers emit their columns;
    # CycleTransform writes the sin column before the cos column.
    # NOTE(review): assumes the "time" group only yields 'doy' - confirm.
    new_columns = [
        *["doy_sin", "doy_cos"],
        *["x", "y", "z"],
        *(
            f"{group}_pc{icomponent+1}"
            for group in pca_groups
            for icomponent in range(params.n_components)
        ),
        *columns["core"],
    ]

    # Report the attribute that is actually being checked.
    if params.input_std not in ("before", "after"):
        raise ValueError(f"Unrecognized input_std param: {params.input_std}")

    def _pca_step(group):
        # one independent PCA per variable group
        reducer = PCA(n_components=params.n_components, random_state=params.pca_seed)
        return (group, reducer, columns[group])

    # define transformer: the steps shared by both modes
    steps = [
        ("time", CycleTransform(columns["time"]), columns["time"]),
        ("location", GeoCartTransform(), columns["location"]),
        *(_pca_step(group) for group in pca_groups),
    ]
    if params.input_std == "before":
        # standardize the core variables as part of this stage
        steps.append(
            ("core", StandardScaler(with_mean=True, with_std=True), columns["core"])
        )

    X_pre_transformer = ColumnTransformer(steps, remainder="passthrough")

    # transform data: fit once on the training set, reuse on the test set
    dataset["Xtrain"] = X_pre_transformer.fit_transform(dataset["Xtrain"])
    dataset["Xtest"] = X_pre_transformer.transform(dataset["Xtest"])
    dataset["input_pre_trans"] = X_pre_transformer
    dataset["new_columns"] = new_columns
    return dataset


def run_input_postprocess(params, dataset):
    """Standardize the input splits with statistics from the training set.

    A ``StandardScaler`` is fitted on ``dataset["Xtrain"]`` and then applied
    to "Xtest" and "Xvalid". The fitted scaler is stored under
    "input_post_trans". ``params`` is not used. The dataset is updated in
    place and returned.
    """
    scaler = StandardScaler(with_mean=True, with_std=True)

    # learn mean / std on the training inputs only
    dataset["Xtrain"] = scaler.fit_transform(dataset["Xtrain"])

    # the remaining splits reuse the training statistics
    for split in ("Xtest", "Xvalid"):
        dataset[split] = scaler.transform(dataset[split])

    dataset["input_post_trans"] = scaler
    return dataset


def run_output_preprocess(params, dataset):
    """Pass-through stage for the outputs.

    No transformation is applied at the moment: ``params`` is ignored and
    the very same ``dataset`` object is handed back.
    """
    return dataset


def _pow10(x):
    """Inverse of ``np.log10``.

    Kept at module level (not nested in ``run_output_postprocess``) so that
    the pipeline stored in the dataset can be pickled.
    """
    return 10 ** x


def run_output_postprocess(params, dataset):
    """Fit the output transformer on the training labels and apply it.

    Parameters
    ----------
    params : object
        Needs the attribute ``std_ouputs``. When it equals True the labels
        are log10-transformed and then standardized; when it equals False
        they are only standardized.

    dataset : dict
        Needs the keys "ytrain", "ytest" and "yvalid"; "ytrain" must expose
        ``.columns``. It is updated in place.

    Returns
    -------
    dataset : dict
        The same object with the three label sets replaced by dataframes
        carrying ytrain's column labels, plus "out_post_trans" (the fitted
        pipeline).

    Raises
    ------
    ValueError
        If ``params.std_ouputs`` equals neither True nor False.
    """
    # Equality (not identity) is kept on purpose so that values which
    # compare equal to the booleans keep being accepted.
    if params.std_ouputs == True:  # noqa: E712
        dataset["out_post_trans"] = Pipeline(
            [
                ("log", FunctionTransformer(func=np.log10, inverse_func=_pow10)),
                ("scale", StandardScaler()),
            ]
        )
    elif params.std_ouputs == False:  # noqa: E712
        dataset["out_post_trans"] = Pipeline([("scale", StandardScaler())])
    else:
        raise ValueError(f"Unrecognized params.std_ouputs: {params.std_ouputs}")

    columns = dataset["ytrain"].columns

    # fit on the training labels only
    dataset["ytrain"] = pd.DataFrame(
        dataset["out_post_trans"].fit_transform(dataset["ytrain"]), columns=columns
    )
    # the other splits reuse the fitted pipeline
    dataset["ytest"] = pd.DataFrame(
        dataset["out_post_trans"].transform(dataset["ytest"]), columns=columns
    )
    dataset["yvalid"] = pd.DataFrame(
        dataset["out_post_trans"].transform(dataset["yvalid"]), columns=columns
    )

    return dataset


def run_split(params, dataset):
    """Carve a validation set out of the training data.

    ``dataset["Xtrain"]`` and ``dataset["ytrain"]`` are split with
    ``train_test_split``; the training share is ``1 - params.valid_split``
    and ``params.bootstrap_seed`` is the random state. The dataset is
    updated in place ("Xtrain", "Xvalid", "ytrain", "yvalid") and returned.
    """
    pieces = train_test_split(
        dataset["Xtrain"],
        dataset["ytrain"],
        train_size=1 - params.valid_split,
        random_state=params.bootstrap_seed,
    )

    # train_test_split returns (X_a, X_b, y_a, y_b) in this order
    for key, piece in zip(("Xtrain", "Xvalid", "ytrain", "yvalid"), pieces):
        dataset[key] = piece
    return dataset

Functions

def geo_2_cartesian(df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame

Transforms geo coordinates (lat, lon) to cartesian coordinates (x, y, z).

Parameters

df : pd.DataFrame — A dataframe with the geo coordinate values in degrees. It must contain the columns ['lat', 'lon'].

Returns

df : pd.DataFrame: A dataframe with the converted values.

Example

df = geo_2_cartesian(df)

Expand source code

def geo_2_cartesian(df: pd.DataFrame) -> pd.DataFrame:
    """Transforms geo coordinates (lat, lon) to cartesian coordinates
    (x, y, z) on the unit sphere.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe with the geo coordinates in degrees. It needs to
        have the columns ['lat', 'lon'].

    Returns
    -------
    df : pd.DataFrame
        A new dataframe where 'lat' and 'lon' are replaced by 'x', 'y'
        and 'z'; the input is not modified. If 'lat' or 'lon' is missing,
        a message is printed and the input is returned as-is.

    Example
    -------
    >> df = geo_2_cartesian(df)
    """
    if "lat" not in df.columns or "lon" not in df.columns:
        print("lat,lon columns not present in df.")
        return df

    deg2rad = np.pi / 180.0

    # Convert on local series: scaling df["lat"] / df["lon"] in place would
    # silently turn the caller's coordinates into radians.
    lat = df["lat"] * deg2rad
    lon = df["lon"] * deg2rad

    # drop original columns (returns a new dataframe)
    out = df.drop(["lat", "lon"], axis=1)

    # From Geo coords to cartesian coords
    out["x"] = np.cos(lat) * np.cos(lon)
    out["y"] = np.cos(lat) * np.sin(lon)
    out["z"] = np.sin(lat)

    return out

def get_geodataframe(dataframe: pandas.core.frame.DataFrame) -> geopandas.geodataframe.GeoDataFrame

This function converts the dataset from a pandas.DataFrame to a geopandas.GeoDataFrame, which has a dedicated geometry column built from the 'lon' and 'lat' columns. This makes plotting a lot easier.

Expand source code

def get_geodataframe(dataframe: pd.DataFrame) -> gpd.GeoDataFrame:
    """Wrap a pandas.DataFrame in a geopandas.GeoDataFrame.

    One point geometry is built per row from the 'lon' and 'lat' columns
    (in that order) and attached as the geometry column, which makes
    plotting a lot easier.
    """
    # one (lon, lat) point per row
    lon_lat_pairs = zip(dataframe["lon"], dataframe["lat"])
    points = list(map(Point, lon_lat_pairs))

    # coordinate reference system handed to geopandas
    # NOTE(review): the {"init": ...} form may be deprecated in newer
    # pyproj/geopandas releases - confirm before upgrading.
    reference_system = {"init": "epsg:4326"}

    return gpd.GeoDataFrame(dataframe, crs=reference_system, geometry=points)

def run_input_postprocess(params, dataset)

Expand source code

def run_input_postprocess(params, dataset):
    """Standardize the input splits with statistics from the training set.

    A ``StandardScaler`` is fitted on ``dataset["Xtrain"]`` and then applied
    to "Xtest" and "Xvalid". The fitted scaler is stored under
    "input_post_trans". ``params`` is not used. The dataset is updated in
    place and returned.
    """
    scaler = StandardScaler(with_mean=True, with_std=True)

    # learn mean / std on the training inputs only
    dataset["Xtrain"] = scaler.fit_transform(dataset["Xtrain"])

    # the remaining splits reuse the training statistics
    for split in ("Xtest", "Xvalid"):
        dataset[split] = scaler.transform(dataset[split])

    dataset["input_post_trans"] = scaler
    return dataset

def run_input_preprocess(params, dataset)

Expand source code

def run_input_preprocess(params, dataset):
    """Fit the input feature transformer on the training set and apply it.

    The transformer is a ``ColumnTransformer`` that, per column group
    returned by ``DataLoader().load_columns()``:

    * encodes the "time" columns with ``CycleTransform``
    * converts the "location" columns with ``GeoCartTransform``
    * reduces "temperature", "density", "salinity" and "spicy" with one
      PCA each
    * standardizes the "core" columns when ``params.input_std == "before"``;
      with ``"after"`` they are passed through unchanged

    Parameters
    ----------
    params : object
        Needs the attributes ``n_components``, ``pca_seed`` and
        ``input_std`` ("before" or "after").

    dataset : dict
        Needs the keys "Xtrain" and "Xtest". It is updated in place.

    Returns
    -------
    dataset : dict
        The same object with "Xtrain" and "Xtest" transformed, plus
        "input_pre_trans" (the fitted transformer) and "new_columns"
        (labels for the transformed columns).

    Raises
    ------
    ValueError
        If ``params.input_std`` is neither "before" nor "after".
    """
    # get columns
    columns = DataLoader().load_columns()

    pca_groups = ("temperature", "density", "salinity", "spicy")

    # Labels follow the order in which the transformers emit their columns;
    # CycleTransform writes the sin column before the cos column.
    # NOTE(review): assumes the "time" group only yields 'doy' - confirm.
    new_columns = [
        *["doy_sin", "doy_cos"],
        *["x", "y", "z"],
        *(
            f"{group}_pc{icomponent+1}"
            for group in pca_groups
            for icomponent in range(params.n_components)
        ),
        *columns["core"],
    ]

    # Report the attribute that is actually being checked.
    if params.input_std not in ("before", "after"):
        raise ValueError(f"Unrecognized input_std param: {params.input_std}")

    def _pca_step(group):
        # one independent PCA per variable group
        reducer = PCA(n_components=params.n_components, random_state=params.pca_seed)
        return (group, reducer, columns[group])

    # define transformer: the steps shared by both modes
    steps = [
        ("time", CycleTransform(columns["time"]), columns["time"]),
        ("location", GeoCartTransform(), columns["location"]),
        *(_pca_step(group) for group in pca_groups),
    ]
    if params.input_std == "before":
        # standardize the core variables as part of this stage
        steps.append(
            ("core", StandardScaler(with_mean=True, with_std=True), columns["core"])
        )

    X_pre_transformer = ColumnTransformer(steps, remainder="passthrough")

    # transform data: fit once on the training set, reuse on the test set
    dataset["Xtrain"] = X_pre_transformer.fit_transform(dataset["Xtrain"])
    dataset["Xtest"] = X_pre_transformer.transform(dataset["Xtest"])
    dataset["input_pre_trans"] = X_pre_transformer
    dataset["new_columns"] = new_columns
    return dataset

def run_output_postprocess(params, dataset)

Expand source code

def _pow10(x):
    """Inverse of ``np.log10``.

    Kept at module level (not nested in ``run_output_postprocess``) so that
    the pipeline stored in the dataset can be pickled.
    """
    return 10 ** x


def run_output_postprocess(params, dataset):
    """Fit the output transformer on the training labels and apply it.

    Parameters
    ----------
    params : object
        Needs the attribute ``std_ouputs``. When it equals True the labels
        are log10-transformed and then standardized; when it equals False
        they are only standardized.

    dataset : dict
        Needs the keys "ytrain", "ytest" and "yvalid"; "ytrain" must expose
        ``.columns``. It is updated in place.

    Returns
    -------
    dataset : dict
        The same object with the three label sets replaced by dataframes
        carrying ytrain's column labels, plus "out_post_trans" (the fitted
        pipeline).

    Raises
    ------
    ValueError
        If ``params.std_ouputs`` equals neither True nor False.
    """
    # Equality (not identity) is kept on purpose so that values which
    # compare equal to the booleans keep being accepted.
    if params.std_ouputs == True:  # noqa: E712
        dataset["out_post_trans"] = Pipeline(
            [
                ("log", FunctionTransformer(func=np.log10, inverse_func=_pow10)),
                ("scale", StandardScaler()),
            ]
        )
    elif params.std_ouputs == False:  # noqa: E712
        dataset["out_post_trans"] = Pipeline([("scale", StandardScaler())])
    else:
        raise ValueError(f"Unrecognized params.std_ouputs: {params.std_ouputs}")

    columns = dataset["ytrain"].columns

    # fit on the training labels only
    dataset["ytrain"] = pd.DataFrame(
        dataset["out_post_trans"].fit_transform(dataset["ytrain"]), columns=columns
    )
    # the other splits reuse the fitted pipeline
    dataset["ytest"] = pd.DataFrame(
        dataset["out_post_trans"].transform(dataset["ytest"]), columns=columns
    )
    dataset["yvalid"] = pd.DataFrame(
        dataset["out_post_trans"].transform(dataset["yvalid"]), columns=columns
    )

    return dataset

def run_output_preprocess(params, dataset)

Expand source code

def run_output_preprocess(params, dataset):
    """Pass-through stage for the outputs.

    No transformation is applied at the moment: ``params`` is ignored and
    the very same ``dataset`` object is handed back.
    """
    return dataset

def run_split(params, dataset)

Expand source code

def run_split(params, dataset):
    """Carve a validation set out of the training data.

    ``dataset["Xtrain"]`` and ``dataset["ytrain"]`` are split with
    ``train_test_split``; the training share is ``1 - params.valid_split``
    and ``params.bootstrap_seed`` is the random state. The dataset is
    updated in place ("Xtrain", "Xvalid", "ytrain", "yvalid") and returned.
    """
    pieces = train_test_split(
        dataset["Xtrain"],
        dataset["ytrain"],
        train_size=1 - params.valid_split,
        random_state=params.bootstrap_seed,
    )

    # train_test_split returns (X_a, X_b, y_a, y_b) in this order
    for key, piece in zip(("Xtrain", "Xvalid", "ytrain", "yvalid"), pieces):
        dataset[key] = piece
    return dataset

def times_2_cycles(df: pandas.core.frame.DataFrame, time_types: List[str] = ['doy']) -> pandas.core.frame.DataFrame

Converts some times to a cyclic axis of x, y using sin and cos.

Converts the times to radians
Normalizes by the maximum of the time cycle
Applies the sin and cosine transformation
Drops original columns

Parameters

df : pd.DataFrame A dataframe with the values. The columns need to be one of the following ['doy', 'month', 'hour']

time_types : list of str, e.g. ['doy', 'month', 'hour'] — The time columns to convert to a cycle. 'doy' is scaled by a 365-day year, 'month' by 12 months, and 'hour' by a 24-hour day.

Returns

df : pd.DataFrame: A dataframe with the converted values.

Example

times = ['doy'] df = times_2_cycles(df, times)

times = ['doy', 'month'] df = times_2_cycles(df, times)

Expand source code

def times_2_cycles(df: pd.DataFrame, time_types: List[str] = ["doy"]) -> pd.DataFrame:
    """Converts some times to a cyclic axis of x, y using sin and cos.

    For each requested time column found in the dataframe:

    1. Expresses the value as a fraction of its cycle length
    2. Converts that fraction to an angle in radians
    3. Applies the sin and cosine transformation
    4. Drops the original column

    Cycle lengths are 365 for 'doy' (days in a year), 12 for 'month' and
    24 for 'hour'. Months are shifted by one so that month 1 maps to
    angle 0; 'doy' and 'hour' are used as given.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe with the values. Only the columns 'doy', 'month' and
        'hour' are recognised.

    time_types : list of str, e.g. ['doy', 'month', 'hour']
        The time columns to convert to a cycle. Names that are not
        columns of ``df`` are skipped. The default list is only read,
        never modified.

    Returns
    -------
    df : pd.DataFrame
        A new dataframe with the converted values. The input dataframe
        is not modified.

    Example
    -------
    >> times = ['doy']
    >> df = times_2_cycles(df, times)

    >> times = ['doy', 'month']
    >> df = times_2_cycles(df, times)
    """
    # (column name, cycle length, offset removed before scaling)
    cycles = (("doy", 365.0, 0), ("month", 12, 1), ("hour", 24.0, 0))
    two_pi = 2 * np.pi

    present = set(df.columns)

    # Work on a copy: assigning the new columns directly on df would leak
    # them into the caller's dataframe.
    out = df.copy()

    for name, period, offset in cycles:
        if name not in time_types or name not in present:
            continue

        angle = (out[name] - offset) * two_pi / period
        out[f"{name}_sin"] = np.sin(angle)
        out[f"{name}_cos"] = np.cos(angle)

        # the encoded pair replaces the raw column
        out = out.drop(name, axis=1)

    return out

Classes

class CycleTransform (time_types: List[str] = ['doy'])

Converts some times to a cyclic axis of x, y using sin and cos.

Converts the times to radians
Normalizes by the maximum of the time cycle
Applies the sin and cosine transformation
Drops original columns

Parameters

time_types : list of str, e.g. ['doy', 'month', 'hour'] — The time columns to convert to a cycle. 'doy' is scaled by a 365-day year, 'month' by 12 months, and 'hour' by a 24-hour day.

Example

times = ['doy'] X = CycleTransform(times).fit_transform(X)

times = ['doy', 'month'] X = CycleTransform(times).fit_transform(X)

Expand source code

class CycleTransform(BaseEstimator, TransformerMixin):
    """Encode periodic time columns as (sin, cos) pairs.

    For each requested time column found in the input:

    1. Expresses the value as a fraction of its cycle length
    2. Converts that fraction to an angle in radians
    3. Applies the sine and cosine transformation
    4. Drops the original column

    Cycle lengths are 365 for 'doy' (days in a year), 12 for 'month' and
    24 for 'hour'. Months are shifted by one so that month 1 maps to
    angle 0; 'doy' and 'hour' are used as given.

    Parameters
    ----------
    time_types : list of str, e.g. ['doy', 'month', 'hour']
        The time columns to convert to a cycle. Names that are not
        columns of the input are skipped.

    Example
    -------
    >> times = ['doy']
    >> X = CycleTransform(times).fit_transform(X)

    >> times = ['doy', 'month']
    >> X = CycleTransform(times).fit_transform(X)
    """

    def __init__(self, time_types: List[str] = ["doy"]):
        # Stored verbatim and never mutated, which is what scikit-learn
        # expects of constructor parameters.
        self.time_types = time_types

    def fit(self, X, y=None):
        """Nothing is learned; present for scikit-learn compatibility."""
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Replace the selected time columns by their sin/cos encoding.

        Parameters
        ----------
        X : pd.DataFrame
            A dataframe with the values. Only the columns 'doy', 'month'
            and 'hour' are recognised.

        y : pd.DataFrame, Optional
            Does nothing. Only for compatibility reasons.

        Returns
        -------
        df : pd.DataFrame
            A new dataframe with the converted values. The input dataframe
            is not modified.
        """
        # (column name, cycle length, offset removed before scaling)
        cycles = (("doy", 365.0, 0), ("month", 12, 1), ("hour", 24.0, 0))
        two_pi = 2 * np.pi

        present = set(X.columns)

        # Work on a copy: assigning the new columns directly on X would leak
        # them into the caller's dataframe.
        out = X.copy()

        for name, period, offset in cycles:
            if name not in self.time_types or name not in present:
                continue

            angle = (out[name] - offset) * two_pi / period
            out[f"{name}_sin"] = np.sin(angle)
            out[f"{name}_cos"] = np.cos(angle)

            # the encoded pair replaces the raw column
            out = out.drop(name, axis=1)

        return out

Ancestors

sklearn.base.BaseEstimator
sklearn.base.TransformerMixin

Methods

def fit(self, X, y=None)

For compatibility reasons.

Expand source code

def fit(self, X, y=None):
    """No-op fit kept for compatibility: nothing is learned from ``X`` or
    ``y`` and the transformer itself is returned."""
    return self

def transform(self, X: pandas.core.frame.DataFrame, y: Union[pandas.core.frame.DataFrame, NoneType] = None)

Parameters

X : pd.DataFrame A dataframe with the values. The columns need to be one of the following ['doy', 'month', 'hour']

y : pd.DataFrame, Optional Does nothing. Only for compatibility reasons.

Returns

df : pd.DataFrame: A dataframe with the converted values.

Expand source code

def transform(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
    """Replace the selected time columns by their sin/cos encoding.

    Cycle lengths are 365 for 'doy' (days in a year), 12 for 'month' and
    24 for 'hour'. Months are shifted by one so that month 1 maps to
    angle 0; 'doy' and 'hour' are used as given. Only columns that are
    both listed in ``self.time_types`` and present in ``X`` are converted.

    Parameters
    ----------
    X : pd.DataFrame
        A dataframe with the values. Only the columns 'doy', 'month'
        and 'hour' are recognised.

    y : pd.DataFrame, Optional
        Does nothing. Only for compatibility reasons.

    Returns
    -------
    df : pd.DataFrame
        A new dataframe with the converted values. The input dataframe
        is not modified.
    """
    # (column name, cycle length, offset removed before scaling)
    cycles = (("doy", 365.0, 0), ("month", 12, 1), ("hour", 24.0, 0))
    two_pi = 2 * np.pi

    present = set(X.columns)

    # Work on a copy: assigning the new columns directly on X would leak
    # them into the caller's dataframe.
    out = X.copy()

    for name, period, offset in cycles:
        if name not in self.time_types or name not in present:
            continue

        angle = (out[name] - offset) * two_pi / period
        out[f"{name}_sin"] = np.sin(angle)
        out[f"{name}_cos"] = np.cos(angle)

        # the encoded pair replaces the raw column
        out = out.drop(name, axis=1)

    return out

class GeoCartTransform

Transforms geo coordinates (lat, lon) to cartesian coordinates (x, y, z).

Example

df = GeoCartTransform().fit_transform(df)

Expand source code

class GeoCartTransform(BaseEstimator, TransformerMixin):
    """Transforms geo coordinates (lat, lon) to cartesian coordinates
    (x, y, z) on the unit sphere.

    Example
    -------
    >> df = GeoCartTransform().fit_transform(df)
    """

    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Nothing is learned; present for scikit-learn compatibility."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replace the 'lat' and 'lon' columns by 'x', 'y' and 'z'.

        Parameters
        ----------
        X : pd.DataFrame
            A dataframe with the geo coordinates in degrees. It needs to
            have the columns ['lat', 'lon'].

        Returns
        -------
        df : pd.DataFrame
            A new dataframe with the converted values; the input is not
            modified. If 'lat' or 'lon' is missing, a message is printed
            and the input is returned as-is.
        """
        if "lat" not in X.columns or "lon" not in X.columns:
            print("lat,lon columns not present in X.")
            return X

        deg2rad = np.pi / 180.0

        # Convert on local series: scaling X["lat"] / X["lon"] in place would
        # silently turn the caller's coordinates into radians.
        lat = X["lat"] * deg2rad
        lon = X["lon"] * deg2rad

        # drop original columns (returns a new dataframe)
        out = X.drop(["lat", "lon"], axis=1)

        # From Geo coords to cartesian coords
        out["x"] = np.cos(lat) * np.cos(lon)
        out["y"] = np.cos(lat) * np.sin(lon)
        out["z"] = np.sin(lat)

        return out

Ancestors

sklearn.base.BaseEstimator
sklearn.base.TransformerMixin

Methods

def fit(self, X: pandas.core.frame.DataFrame, y: Union[pandas.core.frame.DataFrame, NoneType] = None)

Expand source code

def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
    """No-op fit kept for compatibility: nothing is learned from ``X`` or
    ``y`` and the transformer itself is returned."""
    return self

def transform(self, X: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame

Parameters

X : pd.DataFrame — A dataframe with the geo coordinate values in degrees. It must contain the columns ['lat', 'lon'].

Returns

df : pd.DataFrame: A dataframe with the converted values.

Expand source code

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """Replace the 'lat' and 'lon' columns by cartesian 'x', 'y' and 'z'
    coordinates on the unit sphere.

    Parameters
    ----------
    X : pd.DataFrame
        A dataframe with the geo coordinates in degrees. It needs to
        have the columns ['lat', 'lon'].

    Returns
    -------
    df : pd.DataFrame
        A new dataframe with the converted values; the input is not
        modified. If 'lat' or 'lon' is missing, a message is printed
        and the input is returned as-is.
    """
    if "lat" not in X.columns or "lon" not in X.columns:
        print("lat,lon columns not present in X.")
        return X

    deg2rad = np.pi / 180.0

    # Convert on local series: scaling X["lat"] / X["lon"] in place would
    # silently turn the caller's coordinates into radians.
    lat = X["lat"] * deg2rad
    lon = X["lon"] * deg2rad

    # drop original columns (returns a new dataframe)
    out = X.drop(["lat", "lon"], axis=1)

    # From Geo coords to cartesian coords
    out["x"] = np.cos(lat) * np.cos(lon)
    out["y"] = np.cos(lat) * np.sin(lon)
    out["z"] = np.sin(lat)

    return out

class ProcessParams

Expand source code

class ProcessParams:
    """Default preprocessing settings, stored as plain class attributes.

    NOTE(review): functions in this module also read ``params.input_std``,
    ``params.pca_seed`` and ``params.std_ouputs``, none of which are defined
    here - confirm whether this class is meant to be passed to them.
    """

    # Number of principal components; ``run_input_preprocess`` reads
    # ``params.n_components`` to size each PCA.
    n_components = 5
    # Fraction of the training data held out for validation by ``run_split``.
    valid_split = 0.2
    # NOTE(review): ``run_input_preprocess`` branches on ``params.input_std``,
    # not on this attribute - confirm which name is intended.
    standardize = "before"
    # Not read by any function visible in this module.
    seed = 123
    # Passed as ``random_state`` to ``train_test_split`` in ``run_split``.
    bootstrap_seed = 123

Class variables

var bootstrap_seed
var n_components
var seed
var standardize
var valid_split