Source code for optimizers.lr_policies

# Copyright (c) 2017 NVIDIA Corporation
"""
Module containing various learning rate policies. A learning rate policy can
be any function that takes arbitrary arguments from the config (with an
additional ``global_step`` variable provided automatically) and returns the
learning rate value for the current step.
"""
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import math
import tensorflow as tf
from tensorflow.python.framework import ops
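
# A minimal sketch of the policy contract described in the module docstring:
# a policy is any function that takes its parameters from the config (plus the
# automatically supplied ``global_step``) and returns a scalar learning rate.
# The function below is purely illustrative; its name and default value are
# assumptions, not part of the actual policy set.
def example_halving_lr(global_step, learning_rate, halve_every=1000):
  """Illustrative policy: halves ``learning_rate`` every ``halve_every`` steps."""
  num_halvings = tf.cast(global_step // halve_every, tf.float32)
  return learning_rate * tf.pow(0.5, num_halvings)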


def fixed_lr(global_step, learning_rate):
  """Fixed learning rate policy.
  This function always returns ``learning_rate``, ignoring ``global_step``
  value.

  Args:
    global_step: global step TensorFlow tensor (ignored for this policy).
    learning_rate (float): fixed learning rate to use.

  Returns:
    learning rate at step ``global_step``.
  """
  return learning_rate


def piecewise_constant(global_step, learning_rate, boundaries,
                       decay_rates, steps_per_epoch=None):
  """Piecewise constant learning rate decay.
  When defined in the config, only ``boundaries`` and ``decay_rates`` need to
  be provided (other parameters are automatically populated by
  :class:`Model<models.model.Model>` class). ``boundaries`` are treated as
  epochs if ``num_epochs`` is provided in the config, otherwise treated as
  steps.

  Args:
    global_step: global step TensorFlow tensor.
    learning_rate (float): initial learning rate to use.
    boundaries (list): could be defined either in steps
        (if ``steps_per_epoch=None``) or in epochs
        (if ``steps_per_epoch`` parameter is defined).
    decay_rates: multipliers of the initial learning rate for each boundary.
    steps_per_epoch: number of batches in one training epoch. If provided,
        boundaries are treated as epochs, otherwise as steps.

  Returns:
    learning rate at step ``global_step``.
  """
  if steps_per_epoch is not None:
    boundaries = [steps_per_epoch * epoch for epoch in boundaries]
  decay_rates = [1.0] + decay_rates
  vals = [learning_rate * decay for decay in decay_rates]
  return tf.train.piecewise_constant(global_step, boundaries, vals)


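# A minimal usage sketch for ``piecewise_constant`` (illustrative only; the
# boundaries, decay rates and epoch size below are arbitrary assumptions).
def _example_piecewise_constant_usage():
  global_step = tf.train.get_or_create_global_step()
  # Use 0.1 for the first 10 epochs, 0.01 until epoch 20 and 0.001 afterwards,
  # with 500 training batches per epoch.
  return piecewise_constant(
      global_step,
      learning_rate=0.1,
      boundaries=[10, 20],
      decay_rates=[0.1, 0.01],
      steps_per_epoch=500,
  )

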
def exp_decay(global_step, learning_rate, decay_steps, decay_rate,
              use_staircase_decay, begin_decay_at=0, min_lr=0.0):
  """Exponential decay learning rate policy.
  This function is equivalent to ``tensorflow.train.exponential_decay`` with
  some additional functionality. Namely, it adds the ``begin_decay_at``
  parameter and the ``min_lr`` parameter, which are the first step to start
  decaying the learning rate and the minimal value of the learning rate
  correspondingly.

  Args:
    global_step: global step TensorFlow tensor.
    learning_rate (float): initial learning rate to use.
    decay_steps (int): number of steps to apply decay for.
    decay_rate (float): the rate of the decay.
    use_staircase_decay (bool): whether to use staircase decay.
    begin_decay_at (int): the first step to start decaying learning rate.
    min_lr (float): minimal value of the learning rate.

  Returns:
    learning rate at step ``global_step``.
  """
  new_lr = tf.cond(
      global_step < begin_decay_at,
      lambda: learning_rate,
      lambda: tf.train.exponential_decay(
          learning_rate,
          global_step - begin_decay_at,
          decay_steps,
          decay_rate,
          staircase=use_staircase_decay,
      ),
      name="learning_rate",
  )
  final_lr = tf.maximum(min_lr, new_lr)
  return final_lr


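# A minimal usage sketch for ``exp_decay`` (illustrative only; all
# hyperparameter values below are arbitrary assumptions).
def _example_exp_decay_usage():
  global_step = tf.train.get_or_create_global_step()
  # Start from 1e-3 and decay by 0.9 every 1000 steps once step 2000 is
  # reached, never going below 1e-5.
  return exp_decay(
      global_step,
      learning_rate=1e-3,
      decay_steps=1000,
      decay_rate=0.9,
      use_staircase_decay=True,
      begin_decay_at=2000,
      min_lr=1e-5,
  )

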
def poly_decay(global_step, learning_rate, decay_steps, power=1.0,
               begin_decay_at=0, min_lr=0.0, warmup_steps=0):
  """Polynomial decay learning rate policy.
  This function is equivalent to ``tensorflow.train.polynomial_decay`` with
  some additional functionality. Namely, it adds the ``begin_decay_at``
  parameter, which is the first step to start decaying the learning rate, and
  an optional linear warm-up controlled by ``warmup_steps``.

  Args:
    global_step: global step TensorFlow tensor.
    learning_rate (float): initial learning rate to use.
    decay_steps (int): number of steps to apply decay for.
    power (float): power for polynomial decay.
    begin_decay_at (int): the first step to start decaying learning rate.
    min_lr (float): minimal value of the learning rate
        (same as ``end_learning_rate`` TensorFlow parameter).
    warmup_steps (int): number of steps over which the learning rate is
        linearly scaled up from zero to ``learning_rate``.

  Returns:
    learning rate at step ``global_step``.
  """
  begin_decay_at = max(warmup_steps, begin_decay_at)
  if warmup_steps > 0:
    # Linear warm-up: scale the learning rate by global_step / warmup_steps.
    learning_rate = tf.cond(
        global_step < warmup_steps,
        lambda: (learning_rate * tf.cast(global_step, tf.float32) /
                 tf.cast(warmup_steps, tf.float32)),
        lambda: learning_rate,
    )
  lr = tf.cond(
      global_step < begin_decay_at,
      lambda: learning_rate,
      lambda: tf.train.polynomial_decay(
          learning_rate,
          global_step=global_step - begin_decay_at,
          decay_steps=decay_steps,
          end_learning_rate=min_lr,
          power=power,
      ),
      name="learning_rate",
  )
  return lr


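# A minimal usage sketch for ``poly_decay`` with a linear warm-up (illustrative
# only; the values below are arbitrary assumptions).
def _example_poly_decay_usage():
  global_step = tf.train.get_or_create_global_step()
  # Warm up linearly for 200 steps, then decay from 1e-2 to 1e-4 over
  # 10000 steps with power 0.5.
  return poly_decay(
      global_step,
      learning_rate=1e-2,
      decay_steps=10000,
      power=0.5,
      min_lr=1e-4,
      warmup_steps=200,
  )

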
def transformer_policy(global_step, learning_rate, d_model, warmup_steps,
                       max_lr=None, coefficient=1.0, dtype=tf.float32):
  """Transformer's learning rate policy from
  https://arxiv.org/pdf/1706.03762.pdf with a hat (``max_lr``)
  (also called the "noam" learning rate decay scheme).

  Args:
    global_step: global step TensorFlow tensor.
    learning_rate (float): initial learning rate to use.
    d_model (int): model dimensionality.
    warmup_steps (int): number of warm-up steps.
    max_lr (float): maximal learning rate, i.e. hat.
    coefficient (float): optimizer adjustment.
        Recommended 0.002 if using "Adam" else 1.0.
    dtype: dtype for this policy.

  Returns:
    learning rate at step ``global_step``.
  """
  step_num = tf.cast(global_step, dtype=dtype)
  ws = tf.cast(warmup_steps, dtype=dtype)

  decay = coefficient * d_model ** -0.5 * tf.minimum(
      (step_num + 1) * ws ** -1.5,
      (step_num + 1) ** -0.5,
  )
  new_lr = decay * learning_rate
  if max_lr is not None:
    return tf.minimum(max_lr, new_lr)
  return new_lr


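# A minimal usage sketch for ``transformer_policy`` (illustrative only;
# ``d_model``, ``warmup_steps`` and ``max_lr`` below are arbitrary assumptions).
def _example_transformer_policy_usage():
  global_step = tf.train.get_or_create_global_step()
  # "Noam" schedule for a 512-dimensional model with 4000 warm-up steps,
  # capped at 1e-3.
  return transformer_policy(
      global_step,
      learning_rate=2.0,
      d_model=512,
      warmup_steps=4000,
      max_lr=1e-3,
  )

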
def inv_poly_decay(global_step, learning_rate, decay_steps, min_lr,
                   power=1.0, begin_decay_at=0, warmup_steps=0,
                   name="learning_rate"):
  """Inverse poly decay learning rate policy.

  lr = initial_lr / (1 + decay * t) ^ power

  This function is similar to ``tensorflow.train.inv_time_decay`` with some
  additional functionality. Namely, it adds:

    * ``min_lr`` - end (minimal) value of the learning rate;
    * ``power`` - power of the decay;
    * ``begin_decay_at`` - the first step to start decaying learning rate.

  Args:
    global_step: global step TensorFlow tensor.
    learning_rate (float): initial learning rate to use.
    decay_steps (int): number of steps to apply decay for.
    min_lr (float): minimal value of the learning rate
        (same as ``end_learning_rate`` TensorFlow parameter).
    power (float): power for inv_time_decay.
    begin_decay_at (int): the first step to start decaying learning rate.

  Returns:
    learning rate at step ``global_step``.
  """
  min_lr = max(min_lr, 1e-8)
  min_lr = min(min_lr, learning_rate)
  if power <= 0.:
    raise ValueError("Inv poly decay requires power > 0.")
  if global_step is None:
    raise ValueError("Inv poly decay requires global_step")

  with ops.name_scope(name, "InvDecay", [learning_rate, global_step]) as name:
    # Choose the decay scale so that the learning rate reaches min_lr
    # after decay_steps steps.
    scale = (math.pow(learning_rate / min_lr, 1. / power) - 1.) / decay_steps
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    decay_steps = tf.cast(decay_steps, tf.float32)
    global_step = tf.cast(global_step, tf.float32)

    denom = tf.pow(1. + scale * global_step, power)
    lr = tf.div(learning_rate, denom, name=name)
  return lr
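

# A minimal usage sketch for ``inv_poly_decay`` (illustrative only; the values
# below are arbitrary assumptions).
def _example_inv_poly_decay_usage():
  global_step = tf.train.get_or_create_global_step()
  # Decay from 1e-2 down to 1e-4 over 20000 steps with power 2.
  return inv_poly_decay(
      global_step,
      learning_rate=1e-2,
      decay_steps=20000,
      min_lr=1e-4,
      power=2.0,
  )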