# Source code for encoders.cnn_encoder

# Copyright (c) 2018 NVIDIA Corporation
"""
This module contains classes and functions to build "general" convolutional
neural networks from the description of arbitrary "layers".
"""
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import copy

import tensorflow as tf

try:
  from inspect import signature
except ImportError:
  from funcsigs import signature

from open_seq2seq.utils.utils import deco_print
from .encoder import Encoder


def build_layer(inputs, layer, layer_params, data_format,
                regularizer, training, verbose=True):
  """Build a layer from the layer function and its parameters.

  This function will automatically add a regularizer parameter to
  ``layer_params`` if the layer supports regularization. To check this, it
  looks for the "regularizer", "kernel_regularizer" and "gamma_regularizer"
  names in the ``layer`` call signature and passes the regularizer object as
  a value for every such parameter that is supported and not already set.
  Based on the same "checking signature" technique, "data_format" and
  "training" parameters are added when supported. Finally, the "axis"
  parameter is set to ``1 if data_format == 'channels_first' else 3`` when
  supported — this is required for automatically building batch
  normalization layers.

  Args:
    inputs: input Tensor that will be passed to the layer. Note that the
        layer has to accept input as the first parameter.
    layer: layer function or class with ``__call__`` method defined.
    layer_params (dict): parameters passed to the ``layer``. Not mutated;
        a deep copy is taken before injecting extra arguments.
    data_format (string): data format ("channels_first" or "channels_last")
        that will be passed as an additional argument when supported.
    regularizer: regularizer instance that will be passed as an additional
        argument when supported.
    training (bool): whether the layer is built in training mode. Will be
        passed as an additional argument when supported.
    verbose (bool): whether to print information about built layers.

  Returns:
    Tensor with layer output.
  """
  # Deep-copy so the caller's config dict is never mutated.
  layer_params_cp = copy.deepcopy(layer_params)
  # Inspect the callable's signature once instead of once per parameter
  # (the original re-computed signature(layer) for every check).
  accepted_params = signature(layer).parameters

  for reg_name in ['regularizer', 'kernel_regularizer', 'gamma_regularizer']:
    # No break: if a layer accepts several of these names, all are set,
    # matching the original behavior.
    if reg_name not in layer_params_cp and reg_name in accepted_params:
      layer_params_cp.update({reg_name: regularizer})

  if 'data_format' not in layer_params_cp and \
     'data_format' in accepted_params:
    layer_params_cp.update({'data_format': data_format})

  # necessary to check axis for correct batch normalization processing
  if 'axis' not in layer_params_cp and \
     'axis' in accepted_params:
    layer_params_cp.update({'axis': 1 if data_format == 'channels_first'
                            else 3})

  if 'training' not in layer_params_cp and \
     'training' in accepted_params:
    layer_params_cp.update({'training': training})

  outputs = layer(inputs, **layer_params_cp)

  if verbose:
    # TensorFlow ops expose a nice public name via _tf_api_names;
    # fall back to printing the callable itself.
    if hasattr(layer, '_tf_api_names'):
      layer_name = layer._tf_api_names[0]
    else:
      layer_name = layer
    deco_print("Building layer: {}(inputs, {})".format(
        layer_name,
        ", ".join("{}={}".format(key, value)
                  for key, value in layer_params_cp.items())
    ))
  return outputs
class CNNEncoder(Encoder):
  """General CNN encoder that can be used to construct
  various different models.
  """

  @staticmethod
  def get_required_params():
    return dict(Encoder.get_required_params(), **{
        'cnn_layers': list,
    })

  @staticmethod
  def get_optional_params():
    return dict(Encoder.get_optional_params(), **{
        'data_format': ['channels_first', 'channels_last'],
        'fc_layers': list,
    })

  def __init__(self, params, model, name="cnn_encoder", mode='train'):
    """CNN Encoder constructor.

    See parent class for arguments description.

    Config parameters:

    * **cnn_layers** (list) --- list with the description of "convolutional"
      layers. For example::

        "cnn_layers": [
            (tf.layers.conv2d, {
                'filters': 64, 'kernel_size': (11, 11),
                'strides': (4, 4), 'padding': 'VALID',
                'activation': tf.nn.relu,
            }),
            (tf.layers.max_pooling2d, {
                'pool_size': (3, 3), 'strides': (2, 2),
            }),
            (tf.layers.conv2d, {
                'filters': 192, 'kernel_size': (5, 5),
                'strides': (1, 1), 'padding': 'SAME',
            }),
            (tf.layers.batch_normalization,
             {'momentum': 0.9, 'epsilon': 0.0001}),
            (tf.nn.relu, {}),
        ]

      Note that you don't need to provide "regularizer", "training",
      "data_format" and "axis" parameters since they will be
      automatically added. "axis" will be derived from "data_format"
      and will be ``1 if data_format == "channels_first" else 3``.
    * **fc_layers** (list) --- list with the description of
      "fully-connected" layers. The only difference from convolutional
      layers is that the input will be automatically reshaped to 2D
      (batch size x num features). For example::

        'fc_layers': [
            (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
            (tf.layers.dropout, {'rate': 0.5}),
            (tf.layers.dense, {'units': 4096, 'activation': tf.nn.relu}),
            (tf.layers.dropout, {'rate': 0.5}),
        ],

      Note that you don't need to provide "regularizer", "training",
      "data_format" and "axis" parameters since they will be
      automatically added.
    * **data_format** (string) --- could be either "channels_first" or
      "channels_last". Defaults to "channels_first".
    """
    super(CNNEncoder, self).__init__(params, model, name, mode)

  def _encode(self, input_dict):
    """Apply the configured cnn_layers (and optional fc_layers) to the input.

    Args:
      input_dict (dict): ``input_dict['source_tensors'][0]`` is expected to
          be the input image batch in NHWC layout — assumed from the
          transposes below; TODO confirm against the data layer.

    Returns:
      dict: ``{'outputs': x}`` where ``x`` is either the fc-layers output
      (2D) or the spatially average-pooled conv output.
    """
    regularizer = self.params.get('regularizer', None)
    data_format = self.params.get('data_format', 'channels_first')

    x = input_dict['source_tensors'][0]
    if data_format == 'channels_first':
      # incoming tensor is channels-last; move channels to dim 1
      x = tf.transpose(x, [0, 3, 1, 2])

    for layer, layer_params in self.params['cnn_layers']:
      x = build_layer(x, layer, layer_params, data_format,
                      regularizer, self.mode == 'train')

    if data_format == 'channels_first':
      # convert back to channels-last before flattening / pooling
      x = tf.transpose(x, [0, 2, 3, 1])

    fc_layers = self.params.get('fc_layers', [])

    # if fully connected layers exist, flattening the output and applying them
    if fc_layers:
      input_shape = x.get_shape().as_list()
      # flatten every non-batch dimension (generalizes the original
      # hard-coded 4-D product; identical result for 4-D conv outputs)
      num_inputs = 1
      for dim in input_shape[1:]:
        num_inputs *= dim
      x = tf.reshape(x, [-1, num_inputs])
      for layer, layer_params in fc_layers:
        x = build_layer(x, layer, layer_params, data_format,
                        regularizer, self.mode == 'train')
    else:
      # if there are no fully connected layers, doing average pooling
      # over the spatial dimensions (global average pooling)
      x = tf.reduce_mean(x, [1, 2])

    return {'outputs': x}