Source code for teras._src.layers.transformer.feedforward

import keras
from teras._src.api_export import teras_export
from teras._src.typing import ActivationType


@teras_export("teras.layers.TransformerFeedForward")
class TransformerFeedForward(keras.layers.Layer):
    """
    Transformer Feed Forward layer as proposed in the original
    Transformers architecture, in the paper "Attention is all you need",
    with a slight addition of an optional `Dropout` layer.

    The layer applies, in order: a `Dense` projection to `hidden_dim`
    with `activation`, a `Dropout` with rate `dropout`, and a `Dense`
    projection back to `embedding_dim` with no activation.

    Reference(s):
        https://arxiv.org/abs/1706.03762

    Args:
        embedding_dim: int, dimensionality of embeddings being used in
            the model. It is also the output dimensionality of this
            layer.
        hidden_dim: int, hidden dimensionality to use.
            By default, it is four times the `embedding_dim`.
        activation: str or callable, activation function to use for the
            inner linear layer. Defaults to "relu".
        dropout: float, dropout rate to use for the dropout layer that
            is applied in between the two linear layers.
            Defaults to 0., because the original transformer
            architecture doesn't employ a `Dropout` layer.
    """
    def __init__(self,
                 embedding_dim: int,
                 hidden_dim: int = None,
                 activation: ActivationType = "relu",
                 dropout: float = 0.,
                 **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        # Fall back to the 4x expansion when no hidden size is given.
        self.hidden_dim = (embedding_dim * 4
                           if hidden_dim is None
                           else hidden_dim)
        self.activation = activation
        self.dropout = dropout

        self.inner = keras.layers.Dense(self.hidden_dim,
                                        activation=self.activation,
                                        name="feedforward_inner")
        self.outer = keras.layers.Dense(self.embedding_dim,
                                        name="feedforward_outer")
        self.dropout_layer = keras.layers.Dropout(
            self.dropout,
            name="feedforward_dropout")

    def build(self, input_shape):
        """Build the two dense sub-layers from `input_shape`."""
        # The shape may be handed over as a list (or another sequence
        # type); normalise it so the tuple concatenation below cannot
        # raise a `TypeError`.
        input_shape = tuple(input_shape)
        self.inner.build(input_shape)
        # The outer layer consumes the inner layer's output, whose last
        # axis has size `hidden_dim`.
        hidden_shape = input_shape[:-1] + (self.hidden_dim,)
        self.outer.build(hidden_shape)

    def call(self, inputs):
        """Apply inner dense -> dropout -> outer dense to `inputs`."""
        x = self.inner(inputs)
        x = self.dropout_layer(x)
        return self.outer(x)

    def get_config(self):
        """Return the constructor arguments as a serializable config."""
        config = super().get_config()
        config.update({
            "embedding_dim": self.embedding_dim,
            "hidden_dim": self.hidden_dim,
            # `activation` may be a callable, which cannot be stored in
            # a config as-is; resolve and serialize it the Keras way so
            # that the result can be passed back to the constructor.
            "activation": keras.activations.serialize(
                keras.activations.get(self.activation)),
            "dropout": self.dropout,
        })
        return config