Source code for teras._src.utils

from keras import ops
import pandas as pd
import numpy as np
from warnings import warn
from teras._src.typing import DataFrameOrNdArray
from teras._src.api_export import teras_export


[docs] @teras_export("teras.utils.compute_cardinalities") def compute_cardinalities(x, categorical_idx: list, ordinal_encoded: bool = True): """ Compute cardinalities for features in the given dataset/dataframe. For numerical features, 0 is used as a placeholder. Args: x: Input dataset or dataframe. categorical_idx: list, a list of indices of categorical features in the given dataset. ordinal_encoded: `bool`, Whether the categorical values have been ordinal encoded. Defaults to True. Returns: A 1d numpy array of cardinalities of all features. For numerical features, a value of 0 is used. """ if isinstance(x, pd.DataFrame): x = x.values cardinalities = np.array([], dtype=np.uint16) for idx in range(ops.shape(x)[1]): if idx in categorical_idx: feature = ops.convert_to_numpy(x[:, idx]) if ordinal_encoded: num_categories = np.max(feature) + 1 else: num_categories = len(np.unique(feature)) cardinalities = np.append(cardinalities, num_categories) else: # it's a numerical feature, in which case we append 0 cardinalities = np.append(cardinalities, 0) return cardinalities
[docs] @teras_export("teras.utils.get_metadata_for_embedding") def get_metadata_for_embedding(dataframe: pd.DataFrame, categorical_features=None, numerical_features=None): # TODO: # Add support for TensorFlow datasets and PyTorch DataLoaders/Datasets """ Utility function that create metadata for features in a given dataframe required by the Categorical and Numerical embedding layers in Teras. For numerical features, it maps each feature name to feature index. For categorical features, it maps each feature name to a tuple of feature index and vocabulary of words in that categorical feature. This metadata is usually required by the architectures that create embeddings of Numerical or Categorical features, such as TabTransformer, TabNet, FT-Transformer, etc. Args: dataframe: Input dataframe categorical_features: List of names of categorical features in the input dataset numerical_features: List of names of categorical features in the input dataset Returns: A dictionary which contains sub-dictionaries for categorical and numerical features where categorical dictionary is a mapping of categorical feature names to a tuple of feature indices and the lists of unique values (vocabulary) in them, while numerical dictionary is a mapping of numerical feature names to their indices {feature_name: (feature_idx, vocabulary)} for feature in categorical features. {feature_name: feature_idx} for feature in numerical features. """ if categorical_features is None and numerical_features is None: raise ValueError( "Both `categorical_features` and `numerical_features` cannot " "be None at the same time. " "You must pass value for at least one of them. " f"Received, `categorical_features`: {categorical_features}, " f"`numerical_features`: {numerical_features}") categorical_features = [] if categorical_features is None else categorical_features numerical_features = [] if numerical_features is None else numerical_features features_meta_data = {} categorical_features_metadata = {} numerical_features_metadata = {} # Verify all specified features are present in the dataframe specified_columns = set(numerical_features).union(set(categorical_features)) not_found_in_dataframe = specified_columns - set(dataframe.columns) if len(not_found_in_dataframe) > 0: raise ValueError( f"Following specified features not found in the dataframe, " f"{not_found_in_dataframe}") for idx, col in enumerate(dataframe.columns): if categorical_features is not None and col in categorical_features: vocabulary = sorted(list(dataframe[col].unique())) categorical_features_metadata.update({col: (idx, vocabulary)}) elif numerical_features is not None and col in numerical_features: numerical_features_metadata.update({col: idx}) features_meta_data["categorical"] = categorical_features_metadata features_meta_data["numerical"] = numerical_features_metadata return features_meta_data
[docs] @teras_export("teras.utils.convert_tf_dict_to_array_tensor") def convert_tf_dict_to_array_tensor(dict_tensor): """ Converts a batch of data taken from tensorflow dictionary format dataset to array format. Args: dict_tensor: A batch of data taken from tensorflow dictionary format dataset. Returns: Array format data. """ if not isinstance(dict_tensor, dict): warn("Given tensor is not in dictionary format." "Hence no processing will be applied. \n" f"Expected type: {dict}, Received type: {type(dict_tensor)}") return feature_names = dict_tensor.keys() array_tensor = [ops.expand_dims(dict_tensor[feature_name], axis=1) for feature_name in feature_names] array_tensor = ops.concatenate(array_tensor, axis=1) return array_tensor
[docs] @teras_export("teras.utils.inject_missing_values") def inject_missing_values(x: DataFrameOrNdArray, miss_rate=0.1 ): """ Injects missing (np.nan) values in the given dataframe or ndarray. Args: x: A pandas dataframe or ndarray. miss_rate: The fraction of missing values that should be introduced. Should be between 0-1. Defaults to 0.1 Returns: Data with missing values. Example: ```python data = np.arange(1000).reshape(50, 20) data = inject_missing_values(data, miss_rate=0.2, return_dataframe=False) ``` """ x_with_missing_data = x.copy() is_dataframe = isinstance(x_with_missing_data, pd.DataFrame) if is_dataframe: x_with_missing_data = x_with_missing_data.values mask = np.random.binomial(1, 1-miss_rate, size=x.shape) x_with_missing_data[mask == 0] = np.nan if is_dataframe: x_with_missing_data = pd.DataFrame(x_with_missing_data, columns=x.columns) return x_with_missing_data
[docs] @teras_export("teras.utils.inject_missing_values") def generate_fake_gemstone_data(num_samples: int = 16): """ Generate fake gemstone like data of specified num_samples. Args: num_samples: Number of samples to generate Returns: A pandas DataFrame of fake gemstone like data. """ fake_gem_df = pd.DataFrame({ "cut": np.random.randint(low=0, high=3, size=(num_samples,)), "color": np.random.randint(low=0, high=5, size=(num_samples,)), "clarity": np.random.randint(low=0, high=4, size=(num_samples,)), "depth": np.random.randint(low=0, high=100, size=(num_samples,)), "table": np.random.randint(low=0, high=100, size=(num_samples,)) }) fake_gem_df = fake_gem_df.astype(np.float32) return fake_gem_df
[docs] @teras_export("teras.utils.clean_reloaded_config_data") def clean_reloaded_config_data(x): """ Cleans reloaded dictionary/list config data in the `from_config` method. Args: x: dict or list to clean. """ if not isinstance(x, (dict, list)): return x if isinstance(x, dict): if "config" in x.keys(): return x["config"]["value"] for key, value in x.items(): x[key] = clean_reloaded_config_data(value) return x if isinstance(x, list): for i, value in enumerate(x): x[i] = clean_reloaded_config_data(value) return x