Source code for encoders.transformer_encoder
# This code is heavily based on the code from MLPerf
# https://github.com/mlperf/reference/tree/master/translation/tensorflow
# /transformer
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
import tensorflow as tf
from six.moves import range
from open_seq2seq.encoders import Encoder
from open_seq2seq.parts.transformer import attention_layer, ffn_layer, utils, \
embedding_layer
from open_seq2seq.parts.transformer.common import PrePostProcessingWrapper, \
LayerNormalization, Transformer_BatchNorm
class TransformerEncoder(Encoder):
"""Transformer model encoder"""
  @staticmethod
def get_required_params():
"""Static method with description of required parameters.
Returns:
dict:
Dictionary containing all the parameters that **have to** be
included into the ``params`` parameter of the
class :meth:`__init__` method.
"""
return dict(Encoder.get_required_params(), **{
"encoder_layers": int,
"hidden_size": int,
"num_heads": int,
"attention_dropout": float,
"filter_size": int,
"src_vocab_size": int,
"relu_dropout": float,
"layer_postprocess_dropout": float,
"remove_padding": bool,
})
  @staticmethod
def get_optional_params():
"""Static method with description of optional parameters.
Returns:
dict:
Dictionary containing all the parameters that **can** be
included into the ``params`` parameter of the
class :meth:`__init__` method.
"""
return dict(Encoder.get_optional_params(), **{
'regularizer': None, # any valid TensorFlow regularizer
'regularizer_params': dict,
'initializer': None, # any valid TensorFlow initializer
'initializer_params': dict,
'pad_embeddings_2_eight': bool,
'norm_params': dict,
})
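  # Illustrative only: a hypothetical ``encoder_params`` dictionary covering the
  # required keys above plus one optional key (values are placeholders, not
  # recommended settings):
  #
  #   encoder_params = {
  #       "encoder_layers": 6,
  #       "hidden_size": 512,
  #       "num_heads": 8,
  #       "attention_dropout": 0.1,
  #       "filter_size": 2048,
  #       "src_vocab_size": 32768,
  #       "relu_dropout": 0.1,
  #       "layer_postprocess_dropout": 0.1,
  #       "remove_padding": True,
  #       "norm_params": {"type": "layernorm_L2"},
  #   }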
  def __init__(self, params, model, name="transformer_encoder", mode='train'):
super(TransformerEncoder, self).__init__(
params, model, name=name, mode=mode,
)
self.layers = []
self.output_normalization = None
self._mode = mode
self.embedding_softmax_layer = None
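    # Normalization applied to the encoder output: "batch_norm" selects
    # Transformer_BatchNorm in _encode(); anything else falls back to
    # LayerNormalization (default type is "layernorm_L2").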
self.norm_params = self.params.get("norm_params", {"type": "layernorm_L2"})
self.regularizer = self.params.get("regularizer", None)
    if self.regularizer is not None:
      self.regularizer_params = params.get("regularizer_params", {'scale': 0.0})
      # Instantiate the regularizer only when a positive scale is given.
      self.regularizer = self.regularizer(self.regularizer_params['scale']) \
          if self.regularizer_params['scale'] > 0.0 else None
def _call(self, encoder_inputs, attention_bias, inputs_padding):
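    """Runs the inputs through the stack of self-attention / feed-forward
    sub-layers and applies the final output normalization.

    Args:
      encoder_inputs: embedded inputs of shape [batch_size, length, hidden_size].
      attention_bias: bias added to the attention logits to mask padded positions.
      inputs_padding: padding information forwarded to the feed-forward layers
        (may be None when "remove_padding" is False).
    """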
for n, layer in enumerate(self.layers):
# Run inputs through the sublayers.
self_attention_layer = layer[0]
feed_forward_network = layer[1]
with tf.variable_scope("layer_%d" % n):
with tf.variable_scope("self_attention"):
encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
with tf.variable_scope("ffn"):
encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
return self.output_normalization(encoder_inputs)
def _encode(self, input_dict):
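    """Builds the encoder graph on the first call and encodes the source batch.

    Args:
      input_dict: dictionary containing 'source_tensors' as
        [source_sequence, src_length].

    Returns:
      dict with 'outputs', 'inputs_attention_bias', 'state', 'src_lengths',
      'embedding_softmax_layer' and 'encoder_input'.
    """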
training = (self.mode == "train")
if len(self.layers) == 0:
# prepare encoder graph
self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
self.params["src_vocab_size"], self.params["hidden_size"],
pad_vocab_to_eight=self.params.get('pad_embeddings_2_eight', False),
)
for _ in range(self.params['encoder_layers']):
# Create sublayers for each layer.
self_attention_layer = attention_layer.SelfAttention(
hidden_size=self.params["hidden_size"],
num_heads=self.params["num_heads"],
attention_dropout=self.params["attention_dropout"],
train=training,
regularizer=self.regularizer
)
feed_forward_network = ffn_layer.FeedFowardNetwork(
hidden_size=self.params["hidden_size"],
filter_size=self.params["filter_size"],
relu_dropout=self.params["relu_dropout"],
train=training,
regularizer=self.regularizer
)
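        # Each sub-layer is wrapped so that normalization, dropout and the
        # residual connection are handled by PrePostProcessingWrapper.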
self.layers.append([
PrePostProcessingWrapper(self_attention_layer, self.params,
training),
PrePostProcessingWrapper(feed_forward_network, self.params,
training)
])
# final normalization layer.
print("Encoder:", self.norm_params["type"], self.mode)
if self.norm_params["type"] =="batch_norm":
self.output_normalization = Transformer_BatchNorm(
training=training,
params=self.norm_params)
else:
self.output_normalization = LayerNormalization(
hidden_size=self.params["hidden_size"], params=self.norm_params)
# actual encoder part
with tf.name_scope("encode"):
inputs = input_dict['source_tensors'][0]
# Prepare inputs to the layer stack by adding positional encodings and
# applying dropout.
embedded_inputs = self.embedding_softmax_layer(inputs)
if self.params["remove_padding"]:
inputs_padding = utils.get_padding(inputs)
#inputs_padding = utils.get_padding(inputs,dtype=self._params["dtype"])
else:
inputs_padding = None
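      # Bias with large negative values at padded positions; it is added to the
      # attention logits so that padding tokens are effectively ignored.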
inputs_attention_bias = utils.get_padding_bias(inputs)
# inputs_attention_bias = utils.get_padding_bias(inputs, dtype=self._params["dtype"])
with tf.name_scope("add_pos_encoding"):
length = tf.shape(embedded_inputs)[1]
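        # Sinusoidal position encoding (sin/cos signals at geometrically spaced
        # frequencies), cast to the embedding dtype before being added.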
pos_encoding = utils.get_position_encoding(
length, self.params["hidden_size"],
)
encoder_inputs = embedded_inputs + tf.cast(x=pos_encoding,
dtype=embedded_inputs.dtype)
if self.mode == "train":
        encoder_inputs = tf.nn.dropout(
            encoder_inputs,
            keep_prob=1.0 - self.params["layer_postprocess_dropout"],
        )
encoded = self._call(encoder_inputs, inputs_attention_bias,
inputs_padding)
return {'outputs': encoded,
'inputs_attention_bias': inputs_attention_bias,
'state': None,
'src_lengths': input_dict['source_tensors'][1],
'embedding_softmax_layer': self.embedding_softmax_layer,
'encoder_input': inputs}
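
# Usage note (a sketch, assuming the usual OpenSeq2Seq configuration layout):
# a config file typically sets ``"encoder": TransformerEncoder`` and passes a
# dictionary like the illustrative ``encoder_params`` example above as
# ``"encoder_params"`` inside its ``base_params``.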