Source code for teras._src.preprocessing.data_samplers.tvae

try:
    import tensorflow as tf
except:
    raise ImportError(
        "You need tensorflow to use `TVAEataSampler`. "
        "Install it using `pip install tensorflow`"
    )
import numpy as np
from teras._src.preprocessing.data_samplers.ctgan import CTGANDataSampler as _BaseDataSampler
from teras._src.typing import FeaturesNamesType



[docs]
class TVAEDataSampler(_BaseDataSampler):
    """
    TVAEDataSampler class for `TVAE` architecture.
    It subclasses the `CTGANDataSampler` class from `CTGAN` architecture.

    The two classes share much functionality since `TVAE` and `CTGAN`
    are proposed in the same paper and almost all preprocessing
    for both is same.
    There are, however, are a few differences in the `get_dataset`
    and `generator` methods, hence this new subclassed class.

    Reference(s):
        https://arxiv.org/abs/1907.00503
        https://github.com/sdv-dev/CTGAN/

    Args:
        metadata: dict, A dictionary of metadata computed during data
            transformation. You can access it from the `.get_metadata()` of
            `TVAEDataTransformer` instance.
        categorical_features: list, List of categorical features names.
            CTGAN requires dataset to have at least one categorical feature,
            if your dataset doesn't contain any categorical features,
            consider using some other generative model.
        continuous_features: list, List of continuous features names
        batch_size: int, default 512,
            Batch size to use for the dataset.
        seed: int, Seed for random ops.
    """

[docs]
    def __init__(self,
                 metadata: dict,
                 categorical_features: FeaturesNamesType = None,
                 continuous_features: FeaturesNamesType = None,
                 batch_size: int = 512,
                 seed: int = 1337,
                 ):
        super().__init__(metadata=metadata,
                         categorical_features=categorical_features,
                         continuous_features=continuous_features,
                         batch_size=batch_size,
                         seed=seed)


    def get_dataset(self,
                    x_transformed,
                    x_original=None):
        """
        Args:
            x_transformed: Dataset transformed using `TVAEDataTransformer` class
            x_original: Original Dataset - a pandas DataFrame.
                It is used for computing categorical values' probabilities
                for later sampling.
        Returns:
            Returns a tensorflow dataset that utilizes the `generator` method
            to create batches of data. This way user can just pass the dataset
            object to the fit method of the model and each batch generated
            will satisfy all out requirements of sampling
        """
        self.num_samples, self.data_dim = x_transformed.shape
        # adapting the approach from the official implementation
        # to sample evenly across the categories to combat imbalance
        row_idx_raw = [x_original.groupby(feature).groups
                       for feature in self.categorical_features]
        self.row_idx_by_categories = [
            [values.to_list() for values in feat.values()]
            for feat in row_idx_raw]

        dataset = tf.data.Dataset.from_generator(
            self.generator,
            output_signature=(
                tf.TensorSpec(
                    shape=(self.batch_size, tf.shape(x_transformed)[1]),
                    name="data_batch")
                              ),
            args=(x_transformed,),
        )
        return dataset

    def generator(self, x_transformed):
        """
        Used to create a tensorflow dataset.

        Args:
            x_transformed: Dataset transformed by the `TVAEDataTransformer`
                class.

        Returns:
            A batch of data
        """
        num_steps_per_epoch = self.num_samples // self.batch_size
        for _ in range(num_steps_per_epoch):
            selected_cat_features_idx = self._np_rng.choice(
                np.arange(self._num_categorical_features),
                size=self.batch_size
            )
            # NOTE: We've precomputed the probabilities in the DataTransformer
            # class for each feature already to speed things up.
            selected_features_categories_probs = self._features_categories_probs[selected_cat_features_idx]

            # Choose random values idx for features
            selected_cat_values_idx = np.array([
                self._np_rng.choice(np.arange(len(probs)),
                                    p=probs)
                for probs in selected_features_categories_probs]
            ).astype(np.int32)

            # the official implementation uses actual indices during the
            # sample_cond_vector method but uses the shuffled version in
            # sampling data, so we're gonna do just that.
            shuffled_idx = np.arange(self.batch_size)
            self._np_rng.shuffle(shuffled_idx)

            shuffled_cat_features_idx = selected_cat_features_idx[shuffled_idx]
            shuffled_values_idx = selected_cat_values_idx[shuffled_idx]

            sample_idx = []
            for feat_id, val_id in zip(shuffled_cat_features_idx,
                                       shuffled_values_idx):
                s_id = self._np_rng.choice(
                    self.row_idx_by_categories[np.squeeze(feat_id)][
                        np.squeeze(val_id)])
                sample_idx.append(np.squeeze(s_id))

            yield x_transformed[sample_idx]