Source code for parts.convs2s.attention_wn_layer

"""Implementation of the attention layer for convs2s.
Inspired from https://github.com/tobyyouup/conv_seq2seq"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import tensorflow as tf
import math
from open_seq2seq.parts.convs2s.ffn_wn_layer import FeedFowardNetworkNormalized


class AttentionLayerNormalized(tf.layers.Layer):
  """Attention layer for convs2s with weight normalization."""
  def __init__(self,
               in_dim,
               embed_size,
               layer_id,
               add_res,
               mode,
               scaling_factor=math.sqrt(0.5),
               normalization_type="weight_norm",
               regularizer=None,
               init_var=None):
    """Initializes the attention layer.
    It uses weight normalization for the linear projections
    (Salimans & Kingma, 2016): w = g * v / ||v||

    Args:
      in_dim: int last dimension of the inputs
      embed_size: int target embedding size
      layer_id: int the id of the current convolution layer
      add_res: bool whether a residual connection should be added or not
      mode: str current mode
      scaling_factor: float scaling applied after adding the target
        embeddings and after the residual connection
      normalization_type: str type of normalization to use
      regularizer: optional regularizer for the projection weights
      init_var: optional initial variance for the projection weights
    """
    super(AttentionLayerNormalized, self).__init__()
    self.add_res = add_res
    self.scaling_factor = scaling_factor
    self.regularizer = regularizer

    with tf.variable_scope("attention_layer_" + str(layer_id)):
      # linear projection layer to project the attention input
      # to the target embedding space
      self.tgt_embed_proj = FeedFowardNetworkNormalized(
          in_dim,
          embed_size,
          dropout=1.0,
          var_scope_name="att_linear_mapping_tgt_embed",
          mode=mode,
          normalization_type=normalization_type,
          regularizer=self.regularizer,
          init_var=init_var)

      # linear projection layer to project back to the input space
      self.out_proj = FeedFowardNetworkNormalized(
          embed_size,
          in_dim,
          dropout=1.0,
          var_scope_name="att_linear_mapping_out",
          mode=mode,
          normalization_type=normalization_type,
          regularizer=self.regularizer,
          init_var=init_var)
  def call(self, input, target_embed, encoder_output_a, encoder_output_b,
           input_attention_bias):
    """Calculates the attention vectors.

    Args:
      input: A float32 tensor with shape [batch_size, length, in_dim]
      target_embed: A float32 tensor with shape [batch_size, length, embed_size]
        containing the target embeddings
      encoder_output_a: A float32 tensor with shape
        [batch_size, src_length, embed_size] containing the first encoder
        outputs, used as the keys
      encoder_output_b: A float32 tensor with shape
        [batch_size, src_length, embed_size] containing the second encoder
        outputs, used as the values
      input_attention_bias: A float32 tensor with shape
        [batch_size, length, 1] containing the bias used to mask the paddings

    Returns:
      float32 tensor with shape [batch_size, length, in_dim].
    """
    h_proj = self.tgt_embed_proj(input)
    d_proj = (h_proj + target_embed) * self.scaling_factor
    att_score = tf.matmul(d_proj, encoder_output_a, transpose_b=True)

    # Masking needs to be done in float32. Added to support mixed-precision training.
    att_score = tf.cast(x=att_score, dtype=tf.float32)
    # mask out the paddings
    if input_attention_bias is not None:
      att_score = att_score + input_attention_bias
    att_score = tf.nn.softmax(att_score)
    # Cast back to original type
    att_score = tf.cast(x=att_score, dtype=encoder_output_b.dtype)

    # scale the attended values by src_length * sqrt(1 / src_length),
    # i.e. sqrt(src_length), as in the original convs2s paper
    length = tf.cast(tf.shape(encoder_output_b), encoder_output_b.dtype)
    output = tf.matmul(att_score, encoder_output_b) * \
             length[1] * tf.cast(tf.sqrt(1.0 / length[1]),
                                 dtype=encoder_output_b.dtype)
    output = self.out_proj(output)

    if self.add_res:
      output = (output + input) * self.scaling_factor

    return output
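
For reference, a minimal usage sketch follows. The tensor shapes, the "train" mode string, and the random placeholder inputs are illustrative assumptions only; in OpenSeq2Seq this layer is wired up inside the convs2s decoder, which supplies the real encoder outputs and target embeddings.

# Minimal usage sketch (assumptions: shapes, mode="train", random inputs).
import tensorflow as tf
from open_seq2seq.parts.convs2s.attention_wn_layer import AttentionLayerNormalized

batch_size, tgt_length, src_length = 8, 20, 30
conv_dim, embed_size = 512, 256

attention = AttentionLayerNormalized(
    in_dim=conv_dim,
    embed_size=embed_size,
    layer_id=0,
    add_res=True,
    mode="train",
)

# output of the current decoder convolutional block
decoder_state = tf.random_normal([batch_size, tgt_length, conv_dim])
# target embeddings, already projected to embed_size
target_embed = tf.random_normal([batch_size, tgt_length, embed_size])
# encoder outputs used as keys and values (both embed_size wide)
encoder_keys = tf.random_normal([batch_size, src_length, embed_size])
encoder_values = tf.random_normal([batch_size, src_length, embed_size])

# context vector with shape [batch_size, tgt_length, conv_dim]
context = attention(decoder_state, target_embed, encoder_keys,
                    encoder_values, input_attention_bias=None)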