# Source code for optimizers.automatic_loss_scaler

# Copyright (c) 2018 NVIDIA Corporation
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import tensorflow as tf

from open_seq2seq.utils.utils import check_params

class AutomaticLossScaler(object):
    """Front end that picks a loss-scaling strategy by name and forwards to it.

    The algorithm name is lower-cased and stripped of surrounding whitespace
    before matching. ``'backoff'`` selects ``BackoffScaler`` and ``'logmax'``
    selects ``LogMaxScaler``; any other name is rejected.
    """

    SUPPORTED_ALGOS = ['backoff', 'logmax']

    def __init__(self, algorithm='Backoff', params=None):
        """Create the concrete scaler for ``algorithm``.

        Args:
            algorithm: name of the scaling strategy (case-insensitive,
                surrounding whitespace ignored).
            params: passed through unchanged to the selected scaler's
                constructor; may be ``None``.

        Raises:
            ValueError: if the normalised name is neither ``'backoff'`` nor
                ``'logmax'``.
        """
        algorithm = algorithm.lower().strip()
        # Reject unknown names up front so the happy path below stays flat.
        if algorithm not in ('backoff', 'logmax'):
            raise ValueError('Unknown scaling algorithm: {}'.format(algorithm))
        if algorithm == 'backoff':
            self.scaler = BackoffScaler(params)
        else:
            self.scaler = LogMaxScaler(params)  # ppf(.999)

    def update_op(self, has_nan, amax):
        """Forward to the selected scaler's ``update_op`` and return its result."""
        return self.scaler.update_op(has_nan, amax)

    @property
    def loss_scale(self):
        """The ``loss_scale`` exposed by the selected scaler."""
        return self.scaler.loss_scale

    @staticmethod
    def check_grads(grads_and_vars):
        """Build ops summarising NaN presence and max magnitude of gradients.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs. Pairs
                whose gradient is ``None`` are skipped; for
                ``tf.IndexedSlices`` gradients only ``.values`` is inspected.

        Returns:
            A pair ``(has_nan, amax)``: ``has_nan`` is the logical OR of the
            per-gradient "contains NaN" flags and ``amax`` is the maximum of
            the per-gradient maximum absolute values.
        """
        nan_flags = []
        abs_maxima = []
        for gradient, _ in grads_and_vars:
            if gradient is None:
                continue
            is_sparse = isinstance(gradient, tf.IndexedSlices)
            values = gradient.values if is_sparse else gradient
            nan_flags.append(tf.reduce_any(tf.is_nan(values)))
            abs_maxima.append(tf.reduce_max(tf.abs(values)))
        return tf.reduce_any(nan_flags), tf.reduce_max(abs_maxima)
class BackoffScaler(object):
    """Loss scaler that shrinks the scale on overflow and grows it periodically.

    The scale starts at ``scale_max``. When an update sees an overflow, the
    scale is divided by ``step_factor``. Otherwise it is multiplied by
    ``step_factor`` whenever the number of iterations since the last recorded
    overflow is a multiple of ``step_window``. Either way the new value is
    clipped to ``[scale_min, scale_max]``.
    """

    def __init__(self, params):
        """Validate ``params`` and create the state variables.

        Args:
            params: dict of optional settings, or ``None`` for all defaults.
                Recognised keys and defaults: ``scale_min`` (1.0),
                ``scale_max`` (2**14), ``step_factor`` (2.0),
                ``step_window`` (2000).
        """
        params = {} if params is None else params
        check_params(
            config=params,
            required_dict={},
            optional_dict={
                'scale_min': float,
                'scale_max': float,
                'step_factor': float,
                'step_window': int
            },
        )
        self.scale_min = params.get('scale_min', 1.0)
        self.scale_max = params.get('scale_max', 2.**14)
        self.step_factor = params.get('step_factor', 2.0)
        self.step_window = params.get('step_window', 2000)

        # Variables are created in this fixed order: iteration counter,
        # iteration of the last overflow (-1 = none yet), current scale.
        self.iteration = tf.Variable(
            initial_value=0, trainable=False, dtype=tf.int64)
        self.last_overflow_iteration = tf.Variable(
            initial_value=-1, trainable=False, dtype=tf.int64)
        self.scale = tf.Variable(
            initial_value=self.scale_max, trainable=False)

    def update_op(self, has_nan, amax):
        """Build the op that adjusts the scale and advances the iteration.

        Args:
            has_nan: boolean tensor; true when a NaN was detected.
            amax: tensor tested with ``tf.is_inf`` to detect an infinite
                maximum.

        Returns:
            A tensor carrying the result of incrementing ``self.iteration``,
            with a control dependency on the scale update.
        """
        def shrink_on_overflow():
            # Divide the scale and remember at which iteration this happened.
            reduced = tf.clip_by_value(self.scale / self.step_factor,
                                       self.scale_min, self.scale_max)
            store_scale = tf.assign(self.scale, reduced)
            mark_overflow = tf.assign(self.last_overflow_iteration,
                                      self.iteration)
            with tf.control_dependencies([store_scale, mark_overflow]):
                return tf.identity(self.scale)

        def grow_scale():
            enlarged = tf.clip_by_value(self.scale * self.step_factor,
                                        self.scale_min, self.scale_max)
            return tf.assign(self.scale, enlarged)

        def keep_scale():
            return self.scale

        def grow_if_window_elapsed():
            steps_since_overflow = self.iteration - self.last_overflow_iteration
            window_elapsed = tf.equal(
                steps_since_overflow % self.step_window, 0)
            return tf.cond(window_elapsed, grow_scale, keep_scale)

        # NOTE(review): the increment op is created outside the
        # control-dependency scope below, so only the returned identity (not
        # the increment itself) is ordered after the scale update -- confirm
        # this is intended.
        step_increment = tf.assign_add(self.iteration, 1)
        overflowed = tf.logical_or(has_nan, tf.is_inf(amax))
        scale_step = tf.cond(overflowed,
                             shrink_on_overflow,
                             grow_if_window_elapsed)
        with tf.control_dependencies([scale_step]):
            return tf.identity(step_increment)

    @property
    def loss_scale(self):
        """The variable holding the current loss scale."""
        return self.scale
class LogMaxScaler(object):
    """Loss scaler driven by running statistics of log2 of the gradient max.

    Each update folds ``x = log2(amax)`` into bias-corrected exponential
    moving averages: a fast mean (``beta1``) and a slow mean and second
    moment (``beta2``). The scale is then chosen so that
    ``mean + overflow_std_dev * std`` lands on ``log_max``, and clipped to
    ``[scale_min, scale_max]``.
    """

    def __init__(self, params):
        """Validate ``params`` and create the state variables.

        Args:
            params: dict of optional settings, or ``None`` for all defaults.
                Recognised keys and defaults: ``scale_min`` (1.0),
                ``scale_max`` (2**14), ``log_max`` (16.), ``beta1`` (0.99),
                ``beta2`` (0.999), ``overflow_std_dev`` (3.09).
        """
        if params is None:
            params = {}
        check_params(
            config=params,
            required_dict={},
            optional_dict={
                'scale_min': float,
                'scale_max': float,
                'log_max': float,
                'beta1': float,
                'beta2': float,
                'overflow_std_dev': float
            },
        )
        self.scale_min = params.get('scale_min', 1.0)
        self.scale_max = params.get('scale_max', 2.**14)
        self.log_max = params.get('log_max', 16.)
        self.beta1 = params.get('beta1', 0.99)
        self.beta2 = params.get('beta2', 0.999)
        # NOTE(review): 3.09 presumably is the standard-normal ppf(.999)
        # -- confirm.
        self.overflow_std_dev = params.get('overflow_std_dev', 3.09)

        self.iteration = tf.Variable(
            initial_value=0, trainable=False, dtype=tf.int64)
        self.scale = tf.Variable(initial_value=1.0, trainable=False)
        # Fast (beta1) running mean of x.
        self.x_hat = tf.Variable(
            initial_value=0, trainable=False, dtype=tf.float32)
        # Slow (beta2) running mean of x and of x**2.
        self.slow_x_hat = tf.Variable(
            initial_value=0, trainable=False, dtype=tf.float32)
        self.xsquared_hat = tf.Variable(
            initial_value=0, trainable=False, dtype=tf.float32)
        # Running products beta1**t and beta2**t used for bias correction.
        self.b1_correction = tf.Variable(
            initial_value=1., trainable=False, dtype=tf.float32)
        self.b2_correction = tf.Variable(
            initial_value=1., trainable=False, dtype=tf.float32)

    def update_op(self, has_nan, amax):
        """Build the op that refreshes the statistics and the loss scale.

        NB: assumes that ``amax`` has already been downscaled.

        Args:
            has_nan: boolean tensor; true when a NaN was detected.
            amax: tensor holding the maximum absolute gradient value.

        Returns:
            A tensor carrying the result of incrementing ``self.iteration``,
            with a control dependency on the scale assignment.
        """
        is_nonfinite = tf.logical_or(has_nan, tf.is_inf(amax))
        # NOTE(review): the overflow branch substitutes 2**log_max although x
        # is otherwise a log2 value -- confirm this is the intended penalty.
        # NOTE(review): amax == 0 would make log2(amax) -inf -- confirm
        # callers rule that out.
        x = tf.cond(is_nonfinite,
                    lambda: tf.pow(2., self.log_max),
                    lambda: tf.log(amax) / tf.log(tf.constant(2.)))

        # Fast moving average with bias correction.
        x_hat_assn = tf.assign(
            self.x_hat, self.beta1 * self.x_hat + (1 - self.beta1) * x)
        b1_corr_assn = tf.assign(
            self.b1_correction, self.b1_correction * self.beta1)
        with tf.control_dependencies([x_hat_assn, b1_corr_assn]):
            mu = self.x_hat.read_value() / \
                (1 - self.b1_correction.read_value())

        # Slow moving averages of x and x**2 with bias correction.
        slow_x_hat_assn = tf.assign(
            self.slow_x_hat,
            self.beta2 * self.slow_x_hat + (1 - self.beta2) * x)
        xsquared_hat_assn = tf.assign(
            self.xsquared_hat,
            self.beta2 * self.xsquared_hat + (1 - self.beta2) * (x * x),
        )
        b2_corr_assn = tf.assign(
            self.b2_correction, self.b2_correction * self.beta2)
        with tf.control_dependencies(
                [slow_x_hat_assn, xsquared_hat_assn, b2_corr_assn]):
            e_xsquared = self.xsquared_hat.read_value() / \
                (1 - self.b2_correction.read_value())
            slow_mu = self.slow_x_hat.read_value() / \
                (1 - self.b2_correction.read_value())

        # Variance from the slow moments; clamp at zero before the sqrt to
        # absorb negative round-off.
        sigma2 = e_xsquared - (slow_mu * slow_mu)
        sigma = tf.sqrt(tf.maximum(sigma2, tf.constant(0.)))

        log_cutoff = sigma * self.overflow_std_dev + mu
        # Use the configured target exponent rather than a hard-coded 16 so
        # that the 'log_max' setting actually affects the proposed scale.
        log_difference = self.log_max - log_cutoff
        proposed_scale = tf.pow(2., log_difference)
        scale_update = tf.assign(
            self.scale,
            tf.clip_by_value(proposed_scale, self.scale_min, self.scale_max),
        )
        iter_update = tf.assign_add(self.iteration, 1)

        with tf.control_dependencies([scale_update]):
            return tf.identity(iter_update)

    @property
    def loss_scale(self):
        """The variable holding the current loss scale."""
        return self.scale