Source code for teras._src.preprocessing.data_transformers.gain

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from teras._src.preprocessing.data_transformers.data_transformer import DataTransformer
from teras._src.typing import FeaturesNamesType
import json


[docs] class GAINDataTransformer(DataTransformer): """ GAINDataTransformer class that performs the required transformations on the raw dataset required by the GAIN architecture. Args: categorical_features: list, List of categorical features names in the dataset. Categorical features are encoded by ordinal encoder method. And then MinMax normalization is applied. continuous_features: list, List of numerical features names in the dataset. Numerical features are encoded using MinMax normalization. """
[docs] def __init__(self, categorical_features: FeaturesNamesType = None, continuous_features: FeaturesNamesType = None ): super().__init__() self.categorical_features = categorical_features self.continuous_features = continuous_features self._encoder = OrdinalEncoder() self._min_vals = None self._max_vals = None self._ordered_features_names_all = None self._fitted = False
def fit(self, x: pd.DataFrame): # min-max normalize the entire dataset regardless of the features types. # For categorical, first need to encode values ordinally x_temp = x.copy() if self.categorical_features is not None: self._encoder.fit(x[self.categorical_features]) x_temp[self.categorical_features] = self._encoder.transform( x[self.categorical_features]) self._min_vals = np.nanmin(x_temp, axis=0) self._max_vals = np.nanmax(x_temp, axis=0) self._fitted = True def transform(self, x: pd.DataFrame): """ Transforms the data (applying normalization etc) and returns a tensorflow dataset. It also stores the metadata of features that is used in the reverse transformation step. Args: x: Data to transform. Must be a pandas DataFrame. Returns: Transformed data. """ if not self._fitted: raise AssertionError( "You haven't yet fitted the DataTransformer. " "You must call the `fit` method before you can call the " "`transform` method. ") if not isinstance(x, pd.DataFrame): raise ValueError( "Only pandas dataframe is supported by DataTransformer class." f" But data of type {type(x)} was passed. " f"Please convert it to pandas dataframe before passing.") self._ordered_features_names_all = x.columns if self.categorical_features is not None: x[self.categorical_features] = self._encoder.transform( x[self.categorical_features]) x = (x - self._min_vals) / self._max_vals return x def inverse_transform(self, x): """ Inverse Transforms the transformed data. Args: x: Transformed Data. Returns: Pandas dataframe of data in its original scale """ if not isinstance(x, pd.DataFrame): x = pd.DataFrame(x, columns=self._ordered_features_names_all) if not self._fitted: raise AssertionError( "You haven't yet fitted the DataTransformer. " "You must call the `fit` method before you can call the " "`inverse_transform` method. ") # min-max transformation was applied to the whole dataset even to the # ordinal encoded categorical features, so first, undo that # transformation and then inverse transform categorical x = (x * self._max_vals) + self._min_vals if self.categorical_features is not None: x[self.categorical_features] = self._encoder.inverse_transform( x[self.categorical_features]) return x def save(self, filename): """ Saves the fitted state of `DataTransformer` instance for portability, in the `json` format. Args: filename: Filename or file path ending in `.json` extension. """ args = { "categorical_features": self.categorical_features, "continuous_features": self.continuous_features } attrs = { "_min_vals": list(self._min_vals), "_max_vals": list(self._max_vals), "_ordered_features_names_all": self._ordered_features_names_all, "_fitted": self._fitted } encoder_attrs = { "categories_": list(self._encoder.categories_) } state = { "args": args, "attrs": attrs, "encoder_attrs": encoder_attrs, } with open(filename, "w") as f: json.dump(state, f) @classmethod def load(cls, filename): """ Loads the saved state of `DataTransformer` from the `json` file. Args: filename: Filename or file path ending in `.json` extension. Returns: An instance of `GAINDataTransformer` with state stored in the `filename` json file. """ with open(filename, "r") as f: state = json.load(f) c = cls(**state.pop("params")) for name, value in state.pop("attrs"): c.__setattr__(name, value) for name, value in state.pop("encoder_attrs"): c._encoder.__setattr__(name, value) return c