Module src.features.pca_features

Expand source code
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator
from typing import Tuple, Union, List, Optional
import pandas as pd
import numpy as np


def transform_all(
    dfs: List[pd.DataFrame],
    pca_model: Optional[List[pd.DataFrame]] = None,
    n_components: int = 10,
    random_state: int = 123,
) -> Tuple[pd.DataFrame, BaseEstimator]:
    """Applies a PCA transform on all of the variables
    concatenated."""
    X = pd.concat(dfs, axis=1)

    # perform PCA transformation
    if pca_model == None:
        pca_model = PCA(n_components=n_components, random_state=random_state)

        # fit PCA model
        X = pca_model.fit_transform(X)

        # meta data
        columns = [f"pc_{icomponent}" for icomponent in range(n_components)]
        X = pd.DataFrame(X, columns=columns)
        pca_model.columns = columns

        return X, pca_model
    else:
        X = pca_model.transform(X)
        if hasattr(pca_model, "columns"):
            X = pd.DataFrame(X, columns=pca_model.columns)
        return X


def transform_individual(
    dfs: List[pd.DataFrame],
    n_components: int = 10,
    random_state: int = 123,
    columns: Optional[List[str]] = None,
    **kwargs: Tuple[int, str, bool, float],
) -> Tuple[pd.DataFrame, BaseEstimator]:
    """Applies a PCA transform on the list of dataframes concatenated.
    
    Parameters
    ----------
    dfs: List[pd.DataFrame]
        a list of pandas dataframes to perform the PCA transformation

    random_state: int, default=123
        the random state for the PCA transformations

    columns : List[str]
        the suffix added to the column names

    kwargs: Tuple[int, str, bool, float]
        some kwargs for the PCA transformation

    Returns
    -------
    X: List[pd.DataFrame]
        a list of pandas dataframes
    

    """
    # # get column names
    # columns = [df.columns for df in dfs]
    # perform PCA transformation
    pca_models = [
        PCA(n_components=n_components, random_state=random_state, **kwargs)
    ] * len(dfs)

    # fit PCA model
    dfs = [pca_model.fit_transform(df) for df, pca_model in zip(dfs, pca_models)]

    # add metadata
    if columns is not None:
        dfs = [
            idf.add_suffix(f"{iname}_pc{pc_comp+1}")
            for pc_comp, (idf, iname) in enumerate(zip(dfs, columns))
        ]

    return dfs, pca_models

Functions

def transform_all(dfs: List[pandas.core.frame.DataFrame], pca_model: Union[List[pandas.core.frame.DataFrame], NoneType] = None, n_components: int = 10, random_state: int = 123) -> Tuple[pandas.core.frame.DataFrame, sklearn.base.BaseEstimator]

Applies a PCA transform on all of the variables concatenated.

Expand source code
def transform_all(
    dfs: List[pd.DataFrame],
    pca_model: Optional[List[pd.DataFrame]] = None,
    n_components: int = 10,
    random_state: int = 123,
) -> Tuple[pd.DataFrame, BaseEstimator]:
    """Applies a PCA transform on all of the variables
    concatenated."""
    X = pd.concat(dfs, axis=1)

    # perform PCA transformation
    if pca_model == None:
        pca_model = PCA(n_components=n_components, random_state=random_state)

        # fit PCA model
        X = pca_model.fit_transform(X)

        # meta data
        columns = [f"pc_{icomponent}" for icomponent in range(n_components)]
        X = pd.DataFrame(X, columns=columns)
        pca_model.columns = columns

        return X, pca_model
    else:
        X = pca_model.transform(X)
        if hasattr(pca_model, "columns"):
            X = pd.DataFrame(X, columns=pca_model.columns)
        return X
def transform_individual(dfs: List[pandas.core.frame.DataFrame], n_components: int = 10, random_state: int = 123, columns: Union[List[str], NoneType] = None, **kwargs: Tuple[int, str, bool, float]) -> Tuple[pandas.core.frame.DataFrame, sklearn.base.BaseEstimator]

Applies a PCA transform on the list of dataframes concatenated.

Parameters

dfs : List[pd.DataFrame]
a list of pandas dataframes to perform the PCA transformation
random_state : int, default=123
the random state for the PCA transformations
columns : List[str]
the suffix added to the column names
kwargs : Tuple[int, str, bool, float]
some kwargs for the PCA transformation

Returns

X : List[pd.DataFrame]
a list of pandas dataframes
Expand source code
def transform_individual(
    dfs: List[pd.DataFrame],
    n_components: int = 10,
    random_state: int = 123,
    columns: Optional[List[str]] = None,
    **kwargs: Tuple[int, str, bool, float],
) -> Tuple[pd.DataFrame, BaseEstimator]:
    """Applies a PCA transform on the list of dataframes concatenated.
    
    Parameters
    ----------
    dfs: List[pd.DataFrame]
        a list of pandas dataframes to perform the PCA transformation

    random_state: int, default=123
        the random state for the PCA transformations

    columns : List[str]
        the suffix added to the column names

    kwargs: Tuple[int, str, bool, float]
        some kwargs for the PCA transformation

    Returns
    -------
    X: List[pd.DataFrame]
        a list of pandas dataframes
    

    """
    # # get column names
    # columns = [df.columns for df in dfs]
    # perform PCA transformation
    pca_models = [
        PCA(n_components=n_components, random_state=random_state, **kwargs)
    ] * len(dfs)

    # fit PCA model
    dfs = [pca_model.fit_transform(df) for df, pca_model in zip(dfs, pca_models)]

    # add metadata
    if columns is not None:
        dfs = [
            idf.add_suffix(f"{iname}_pc{pc_comp+1}")
            for pc_comp, (idf, iname) in enumerate(zip(dfs, columns))
        ]

    return dfs, pca_models