# Source code for utils.hooks

# Copyright (c) 2017 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import math
import os
import time

import tensorflow as tf

from open_seq2seq.utils.utils import deco_print, log_summaries_from_dict, \
                                     get_results_for_epoch


class BroadcastGlobalVariablesHook(tf.train.SessionRunHook):
  """
  SessionRunHook that will broadcast all global variables from root rank
  to all other processes during initialization.

  This is necessary to ensure consistent initialization of all workers when
  training is started with random weights or restored from a checkpoint.
  """

  def __init__(self, root_rank, device=''):
    """Construct a new BroadcastGlobalVariablesHook that will broadcast all
    global variables from root rank to all other processes during initialization.

    Args:
      root_rank:
        Rank that will send data, other ranks will receive data.
      device:
        Device to be used for broadcasting. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_BROADCAST.
    """
    super(BroadcastGlobalVariablesHook, self).__init__()
    self.root_rank = root_rank
    # Built lazily in begin(); None until the first graph is seen.
    self.bcast_op = None
    self.device = device

  def begin(self):
    """Create the grouped broadcast op for the current default graph.

    The op is rebuilt when it does not exist yet or when it belongs to a
    graph other than the current default graph.
    """
    def broadcast_global_variables(root_rank):
      # Imported here rather than at module level, so Horovod is only
      # required when this hook is actually used.
      from horovod.tensorflow.mpi_ops import broadcast
      ops = []
      for var in tf.global_variables():
        if var.dtype.base_dtype == tf.float16:
          # float16 variables are broadcast as float32 and cast back
          # (presumably because broadcast lacks float16 support -- TODO confirm).
          ops.append(tf.assign(var, tf.cast(broadcast(tf.cast(var, tf.float32),
                                                      root_rank), tf.float16)))
        else:
          ops.append(tf.assign(var, broadcast(var, root_rank)))
      return tf.group(*ops)

    if not self.bcast_op or self.bcast_op.graph != tf.get_default_graph():
      with tf.device(self.device):
        self.bcast_op = broadcast_global_variables(self.root_rank)

  def after_create_session(self, session, coord):
    """Run the broadcast op once the session has been created.

    Args:
      session: session in which the broadcast op is executed.
      coord: unused.
    """
    session.run(self.bcast_op)


class PrintSamplesHook(tf.train.SessionRunHook):
  """Session hook that prints training samples and prediction from time to time
  """
  def __init__(self, every_steps, model):
    """Set up the trigger timer and the tensors to fetch.

    Args:
      every_steps: step interval passed to ``tf.train.SecondOrStepTimer``.
      model: model object; its ``get_output_tensors(0)``,
        ``get_data_layer(0).input_tensors``, ``maybe_print_logs`` and
        ``params`` are used by this hook.
    """
    super(PrintSamplesHook, self).__init__()
    self._timer = tf.train.SecondOrStepTimer(every_steps=every_steps)
    self._iter_count = 0
    self._global_step = None
    self._model = model
    # using only first GPU
    output_tensors = model.get_output_tensors(0)
    self._fetches = [
        model.get_data_layer(0).input_tensors,
        output_tensors,
    ]

  def begin(self):
    """Reset the step counter and look up the global step tensor."""
    self._iter_count = 0
    self._global_step = tf.train.get_global_step()

  def before_run(self, run_context):
    """Request inputs/outputs on trigger steps; always request global step.

    On non-trigger steps an empty fetch list is requested in place of the
    sample tensors, so ``after_run`` can tell the two cases apart.
    """
    if self._timer.should_trigger_for_step(self._iter_count):
      return tf.train.SessionRunArgs([self._fetches, self._global_step])
    return tf.train.SessionRunArgs([[], self._global_step])

  def after_run(self, run_context, run_values):
    """Print samples via the model and optionally log returned values.

    Args:
      run_context: unused.
      run_values: holds ``results`` as ``[fetched_values, global_step]``.
    """
    results, step = run_values.results
    self._iter_count = step

    # Empty results mean this step was not a trigger step.
    if not results:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    input_values, output_values = results
    dict_to_log = self._model.maybe_print_logs(input_values, output_values, step)
    # optionally logging to tensorboard any values
    # returned from maybe_print_logs
    if self._model.params['save_summaries_steps'] and dict_to_log:
      log_summaries_from_dict(
          dict_to_log,
          self._model.params['logdir'],
          step,
      )


class PrintLossAndTimeHook(tf.train.SessionRunHook):
  """Session hook that prints training loss and time per step from time to time
  """
  def __init__(self, every_steps, model, print_ppl=False):
    """Set up the trigger timer and the loss fetch.

    Args:
      every_steps: step interval passed to ``tf.train.SecondOrStepTimer``;
        also used as the divisor when computing the time per step.
      model: model object; its ``loss``, ``steps_in_epoch``, ``on_horovod``
        and ``hvd`` attributes are used by this hook.
      print_ppl: if True, the loss line additionally reports
        ``ppl = exp(loss)`` and ``bpc = loss / ln(2)``.
    """
    super(PrintLossAndTimeHook, self).__init__()
    self._timer = tf.train.SecondOrStepTimer(every_steps=every_steps)
    self._every_steps = every_steps
    self._iter_count = 0
    self._global_step = None
    self._model = model
    self._fetches = [model.loss]
    self._last_time = time.time()
    self._print_ppl = print_ppl

  def begin(self):
    """Reset the step counter and look up the global step tensor."""
    self._iter_count = 0
    self._global_step = tf.train.get_global_step()

  def before_run(self, run_context):
    """Request the loss on trigger steps; always request global step.

    On non-trigger steps an empty fetch list is requested in place of the
    loss, so ``after_run`` can tell the two cases apart.
    """
    if self._timer.should_trigger_for_step(self._iter_count):
      return tf.train.SessionRunArgs([self._fetches, self._global_step])
    return tf.train.SessionRunArgs([[], self._global_step])

  def after_run(self, run_context, run_values):
    """Print the step header, the training loss and the time per step.

    Args:
      run_context: unused.
      run_values: holds ``results`` as ``[fetched_values, global_step]``.
    """
    results, step = run_values.results
    self._iter_count = step

    # Empty results mean this step was not a trigger step.
    if not results:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if self._model.steps_in_epoch is None:
      deco_print("Global step {}:".format(step), end=" ")
    else:
      deco_print(
          "Epoch {}, global step {}:".format(
              step // self._model.steps_in_epoch, step),
          end=" ",
      )

    loss = results[0]
    # The loss is printed only outside Horovod or on Horovod rank 0.
    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      if self._print_ppl:
        deco_print("Train loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                   .format(loss, math.exp(loss),
                           loss/math.log(2)),
                   start="", end=", ")
      else:
        deco_print(
          "Train loss: {:.4f} ".format(loss),
          offset=4)

    # Wall-clock time since the previous report (or since construction for
    # the first one), averaged over every_steps steps; this assumes exactly
    # every_steps steps ran in between -- TODO confirm.
    tm = (time.time() - self._last_time) / self._every_steps
    m, s = divmod(tm, 60)
    h, m = divmod(m, 60)

    deco_print(
        "time per step = {}:{:02}:{:.3f}".format(int(h), int(m), s),
        start="",
    )
    self._last_time = time.time()


class RunEvaluationHook(tf.train.SessionRunHook):
  """Session hook that runs evaluation on a validation set
  """
  def __init__(self, every_steps, model, last_step=-1, print_ppl=False):
    """Set up the trigger timer and the saver used for best checkpoints.

    Args:
      every_steps: step interval passed to ``tf.train.SecondOrStepTimer``.
      model: model object; its ``params``, ``on_horovod``, ``hvd`` and
        ``finalize_evaluation`` are used by this hook, and it is passed to
        ``get_results_for_epoch``.
      last_step: evaluation is additionally forced when the global step
        equals ``last_step - 1``.
      print_ppl: if True, perplexity (``exp(loss)``) and bits per character
        (``loss / ln(2)``) are printed and added to the logged values.
    """
    super(RunEvaluationHook, self).__init__()
    self._timer = tf.train.SecondOrStepTimer(every_steps=every_steps)
    self._iter_count = 0
    self._global_step = None
    self._model = model
    self._triggered = False
    self._last_step = last_step
    self._eval_saver = tf.train.Saver(
        save_relative_paths=True,
        max_to_keep=self._model.params['num_checkpoints']
    )
    # Large initial value so that the first evaluation loss below it
    # counts as the best so far.
    self._best_eval_loss = 1e9
    self._print_ppl = print_ppl

  def begin(self):
    """Reset the step counter and look up the global step tensor."""
    self._iter_count = 0
    self._global_step = tf.train.get_global_step()

  def before_run(self, run_context):
    """Record whether the timer triggers for this step; fetch global step."""
    self._triggered = self._timer.should_trigger_for_step(self._iter_count)
    return tf.train.SessionRunArgs([[], self._global_step])

  def after_run(self, run_context, run_values):
    """Run evaluation, report the loss and save the best model so far.

    Evaluation runs when the timer triggered for this step or when the
    global step equals ``last_step - 1``. ``get_results_for_epoch`` is called
    on every process; printing, checkpointing and summary logging happen
    only outside Horovod or on Horovod rank 0.

    Args:
      run_context: provides the session used for evaluation and saving.
      run_values: holds ``results`` as ``[[], global_step]``.
    """
    _, step = run_values.results
    self._iter_count = step

    if not self._triggered and step != self._last_step - 1:
      return
    self._timer.update_last_triggered_step(self._iter_count - 1)

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      deco_print("Running evaluation on a validation set:")

    results_per_batch, total_loss = get_results_for_epoch(
        self._model, run_context.session, mode="eval", compute_loss=True,
    )

    if not self._model.on_horovod or self._model.hvd.rank() == 0:
      if self._print_ppl:
        deco_print("Validation loss: {:.4f} | ppl = {:.4f} | bpc = {:.4f}"
                   .format(total_loss, math.exp(total_loss),
                           total_loss/math.log(2)), offset=4)
      else:
        deco_print(
          "Validation loss: {:.4f} ".format(total_loss),
          offset=4)

      dict_to_log = self._model.finalize_evaluation(results_per_batch, step)
      dict_to_log['eval_loss'] = total_loss

      if self._print_ppl:
        # Add bpc and ppl metrics to tensorboard
        dict_to_log['ppl'] = math.exp(total_loss)
        # bpc is the loss converted from nats to bits (loss / ln 2), the
        # same value that is printed above; it must not be exponentiated.
        dict_to_log['bpc'] = total_loss / math.log(2)

      # saving the best validation model
      if self._model.params['save_checkpoint_steps'] and \
         total_loss < self._best_eval_loss:
        self._best_eval_loss = total_loss
        self._eval_saver.save(
            run_context.session,
            os.path.join(self._model.params['logdir'], 'best_models',
                         'val_loss={:.4f}-step'.format(total_loss)),
            global_step=step + 1,
        )

      # optionally logging to tensorboard any values
      # returned from finalize_evaluation
      if self._model.params['save_summaries_steps']:
        log_summaries_from_dict(
            dict_to_log,
            self._model.params['logdir'],
            step,
        )