# Source code for teras._src.utils

from keras import ops
import pandas as pd
import numpy as np
from warnings import warn
from teras._src.typing import DataFrameOrNdArray
from teras._src.api_export import teras_export



[docs]
@teras_export("teras.utils.compute_cardinalities")
def compute_cardinalities(x, categorical_idx: list,
                          ordinal_encoded: bool = True):
    """
    Compute cardinalities for features in the given dataset/dataframe.
    For numerical features, 0 is used as a placeholder.

    Args:
        x: Input dataset or dataframe. A pandas DataFrame, a 2d numpy
            array, or any other 2d input that
            `keras.ops.convert_to_numpy` can convert.
        categorical_idx: list, a list of indices of categorical features
            in the given dataset.
        ordinal_encoded: `bool`, Whether the categorical values have been
            ordinal encoded. Defaults to True.
            When True, the cardinality of a feature is its maximum value
            plus one; otherwise it is the number of unique values.

    Returns:
        A 1d integer numpy array of cardinalities of all features.
        For numerical features, a value of 0 is used.

    Raises:
        ValueError: If `ordinal_encoded` is True and the maximum of a
            categorical feature is NaN (it cannot be turned into an
            integer cardinality).
    """
    if isinstance(x, pd.DataFrame):
        x = x.values
    if not isinstance(x, np.ndarray):
        # Convert once up front rather than once per categorical column.
        x = ops.convert_to_numpy(x)

    # A set keeps the membership test inside the loop cheap.
    categorical_idx = set(categorical_idx)

    cardinalities = []
    for idx in range(x.shape[1]):
        if idx not in categorical_idx:
            # it's a numerical feature, in which case we append 0
            cardinalities.append(0)
            continue
        feature = x[:, idx]
        if ordinal_encoded:
            # `int(...)` keeps the result integral even when the data is
            # stored as floats (e.g. values taken from a mixed dataframe).
            num_categories = int(np.max(feature)) + 1
        else:
            num_categories = len(np.unique(feature))
        cardinalities.append(num_categories)
    # Explicit dtype so the result is an integer array for every input,
    # including the case of zero features.
    return np.array(cardinalities, dtype=np.int64)




[docs]
@teras_export("teras.utils.get_metadata_for_embedding")
def get_metadata_for_embedding(dataframe: pd.DataFrame,
                               categorical_features=None,
                               numerical_features=None):
    """
    Utility function that create metadata for features in a given dataframe
    required by the Categorical and Numerical embedding layers in Teras.
    For numerical features, it maps each feature name to feature index.
    For categorical features, it maps each feature name to a tuple of
    feature index and vocabulary of words in that categorical feature.
    This metadata is usually required by the architectures that create
    embeddings of Numerical or Categorical features,
    such as TabTransformer, TabNet, FT-Transformer, etc.

    Args:
        dataframe: Input dataframe
        categorical_features: List of names of categorical features in the
            input dataset
        numerical_features: List of names of numerical features in the
            input dataset

    Returns:
        A dictionary with two keys, "categorical" and "numerical", each
        holding a sub-dictionary:
        {feature_name: (feature_idx, vocabulary)} for feature in categorical features,
        where vocabulary is the sorted list of unique values of the feature.
        {feature_name: feature_idx} for feature in numerical features.
        A feature listed in both arguments is treated as categorical.

    Raises:
        ValueError: If both `categorical_features` and `numerical_features`
            are None, or if any specified feature is not a column of
            `dataframe`.
    """
    # TODO:
    #   Add support for TensorFlow datasets and PyTorch DataLoaders/Datasets
    if categorical_features is None and numerical_features is None:
        raise ValueError(
            "Both `categorical_features` and `numerical_features` cannot "
            "be None at the same time. "
            "You must pass value for at least one of them. "
            f"Received, `categorical_features`: {categorical_features}, "
            f"`numerical_features`: {numerical_features}")
    categorical_features = [] if categorical_features is None else categorical_features
    numerical_features = [] if numerical_features is None else numerical_features

    # Sets are built once and reused for both the validation below and the
    # per-column membership tests.
    categorical_names = set(categorical_features)
    numerical_names = set(numerical_features)

    # Verify all specified features are present in the dataframe
    not_found_in_dataframe = (numerical_names.union(categorical_names)
                              - set(dataframe.columns))
    if not_found_in_dataframe:
        raise ValueError(
            f"Following specified features not found in the dataframe, "
            f"{not_found_in_dataframe}")

    categorical_features_metadata = {}
    numerical_features_metadata = {}
    for idx, col in enumerate(dataframe.columns):
        if col in categorical_names:
            vocabulary = sorted(dataframe[col].unique())
            categorical_features_metadata[col] = (idx, vocabulary)
        elif col in numerical_names:
            numerical_features_metadata[col] = idx

    return {
        "categorical": categorical_features_metadata,
        "numerical": numerical_features_metadata,
    }




[docs]
@teras_export("teras.utils.convert_tf_dict_to_array_tensor")
def convert_tf_dict_to_array_tensor(dict_tensor):
    """
    Converts a batch of data taken from tensorflow dictionary format
    dataset to array format.

    Every value of the dictionary gets a new axis inserted at position 1
    and the results are concatenated along that axis, following the
    dictionary's iteration order.

    Args:
        dict_tensor: A batch of data taken from tensorflow dictionary
            format dataset.

    Returns:
        Array format data.
        If `dict_tensor` is not a dictionary, a warning is issued and
        `dict_tensor` is returned unchanged.
    """
    if not isinstance(dict_tensor, dict):
        # stacklevel=2 attributes the warning to the caller's line.
        warn("Given tensor is not in dictionary format. "
             "Hence no processing will be applied. \n"
             f"Expected type: {dict}, Received type: {type(dict_tensor)}",
             stacklevel=2)
        # "No processing" means handing the input back, not dropping it.
        return dict_tensor

    columns = [ops.expand_dims(feature, axis=1)
               for feature in dict_tensor.values()]
    return ops.concatenate(columns, axis=1)




[docs]
@teras_export("teras.utils.inject_missing_values")
def inject_missing_values(x: "DataFrameOrNdArray",
                          miss_rate=0.1
                          ):
    """
    Injects missing (np.nan) values in the given dataframe or ndarray.

    The input is never modified; a copy is returned. Since `np.nan` cannot
    be stored in integer or boolean arrays, data of those dtypes is
    returned as float64.

    Args:
        x: A pandas dataframe or ndarray.
        miss_rate: The fraction of missing values that should be introduced.
            Should be between 0-1. Defaults to 0.1
            Each entry is dropped independently with this probability, so
            the realised fraction varies around `miss_rate`.

    Returns:
        Data with missing values, of the same kind as the input
        (dataframe in, dataframe out with the same index and columns;
        ndarray in, ndarray out).

    Example:
        ```python
        data = np.arange(1000).reshape(50, 20)
        data = inject_missing_values(data, miss_rate=0.2)
        ```
    """
    is_dataframe = isinstance(x, pd.DataFrame)

    if is_dataframe:
        # Ask for an explicit copy: the array must be writable and must not
        # share memory with the caller's dataframe.
        values = x.to_numpy(copy=True)
    else:
        values = x.copy()

    if values.dtype.kind in "biu":
        # bool / signed int / unsigned int cannot represent NaN.
        values = values.astype(np.float64)

    # 1 -> keep the entry, 0 -> replace it with NaN.
    mask = np.random.binomial(1, 1 - miss_rate, size=x.shape)
    values[mask == 0] = np.nan

    if is_dataframe:
        return pd.DataFrame(values, index=x.index, columns=x.columns)
    return values




[docs]
@teras_export("teras.utils.generate_fake_gemstone_data")
def generate_fake_gemstone_data(num_samples: int = 16):
    """
    Generate fake gemstone like data of specified num_samples.

    Every column holds random integers drawn with `np.random.randint`
    from `[0, high)` and stored as float32. The columns and their
    exclusive upper bounds are: "cut" (3), "color" (5), "clarity" (4),
    "depth" (100) and "table" (100).

    Args:
        num_samples:
            Number of samples to generate

    Returns:
        A pandas DataFrame of fake gemstone like data.
    """
    # Exclusive upper bound per column. Columns are drawn in this order, so
    # the output for a given `np.random.seed` stays the same.
    upper_bounds = {
        "cut": 3,
        "color": 5,
        "clarity": 4,
        "depth": 100,
        "table": 100,
    }
    fake_gem_df = pd.DataFrame({
        name: np.random.randint(low=0, high=high, size=(num_samples,))
        for name, high in upper_bounds.items()
    })
    return fake_gem_df.astype(np.float32)




[docs]
@teras_export("teras.utils.clean_reloaded_config_data")
def clean_reloaded_config_data(x):
    """
    Cleans reloaded dictionary/list config data in the `from_config` method.

    A dictionary whose "config" entry is itself a dictionary containing a
    "value" key is replaced by that value. Every other dictionary or list
    is walked recursively and its items are cleaned in place.
    Anything that is neither a dict nor a list is returned untouched.

    Args:
        x: dict or list to clean.

    Returns:
        The cleaned data. For a dict or list that is not itself a wrapper,
        this is the same (mutated) object that was passed in.
    """
    if isinstance(x, dict):
        wrapped = x.get("config")
        # Only unwrap real `{"config": {"value": ...}}` wrappers; any other
        # dictionary that merely has a "config" key is cleaned recursively
        # instead of failing on the missing "value".
        if isinstance(wrapped, dict) and "value" in wrapped:
            return wrapped["value"]
        for key, value in x.items():
            x[key] = clean_reloaded_config_data(value)
        return x
    if isinstance(x, list):
        for i, value in enumerate(x):
            x[i] = clean_reloaded_config_data(value)
        return x
    return x