Source code for teras._src.layers.ft_transformer.feature_tokenizer
import keras
import numpy as np
from teras._src.api_export import teras_export
from keras import ops
from keras.backend import floatx
[docs]
@teras_export("teras.layers.FTTransformerFeatureTokenizer")
class FTTransformerFeatureTokenizer(keras.layers.Layer):
"""
Feature Tokenizer layer based on FT-Transformer architecture
proposed in the "Revisiting Deep Learning Models for Tabular Data"
paper.
Reference(s):
https://arxiv.org/abs/2106.11959
Args:
cardinalities: list, a list cardinalities of all the features
in the dataset in the same order as the features' occurrence.
For numerical features, use any value <=0 as indicator at
the corresponding index.
You can use the `compute_cardinalities` function from
`teras.utils` package for this purpose.
embedding_dim: int, dimensionality of the embeddings
Shapes:
Input Shape: `(batch_size, num_features)`
Output Shape: `(batch_size, num_features, embedding_dim)`
"""
[docs]
def __init__(self,
cardinalities: list,
embedding_dim: int,
**kwargs):
super().__init__(**kwargs)
self.cardinalities = cardinalities
self.embedding_dim = embedding_dim
self._cardinalities_arr = np.array(cardinalities)
self._continuous_idx = np.flatnonzero(
self._cardinalities_arr == 0)
self._categorical_idx = np.flatnonzero(
self._cardinalities_arr != 0)
# we add an extra token for missing value
num_special_tokens = 1
self._total_tokens = sum(self.cardinalities) + num_special_tokens
categorical_cardinalities = self._cardinalities_arr[self._categorical_idx]
categorical_cardinalities = ops.pad(
categorical_cardinalities,
(1, 0),
constant_values=num_special_tokens,
)
self._category_offset = ops.cumsum(categorical_cardinalities)[:-1]
self._category_offset = ops.cast(self._category_offset, floatx())
def build(self, input_shape=None):
self.categorical_embeddings = self.add_weight(
shape=(self._total_tokens, self.embedding_dim),
initializer="random_normal",
trainable=True,
)
num_continuous_features = len(self._continuous_idx)
self.continuous_embeddings = self.add_weight(
shape=(num_continuous_features,
self.embedding_dim),
trainable=True,
)
self.bias = self.add_weight(
shape=(len(self.cardinalities),),
initializer="random_normal",
trainable=True,
)
def call(self, inputs):
categorical = ops.take(inputs,
indices=self._categorical_idx,
axis=1)
categorical += self._category_offset
continuous = ops.take(inputs,
indices=self._continuous_idx,
axis=1)
categorical = ops.cast(categorical, "int32")
categorical = ops.take(self.categorical_embeddings,
indices=categorical,
axis=0)
continuous = (ops.expand_dims(continuous, -1) *
ops.expand_dims(self.continuous_embeddings, 0))
out = ops.concatenate([continuous, categorical], axis=1)
out += ops.reshape(self.bias, (1, ops.shape(self.bias)[0], 1))
return out
def compute_output_shape(self, input_shape):
return input_shape + (self.embedding_dim,)
def get_config(self):
config = super().get_config()
config.update({
"cardinalities": self.cardinalities,
"embedding_dim": self.embedding_dim
})
return config