Source code for data.text2speech.text2speech

# Copyright (c) 2018 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import os
import six
import librosa
import numpy as np
import tensorflow as tf
import pandas as pd

from six import string_types

from open_seq2seq.data.data_layer import DataLayer
from open_seq2seq.data.utils import load_pre_existing_vocabulary
from .speech_utils import get_speech_features_from_file,\
                          inverse_mel, normalize, denormalize

[docs]class Text2SpeechDataLayer(DataLayer):
  """
  Text-to-speech data layer class
  """

[docs]  @staticmethod
  def get_required_params():
    return dict(
        DataLayer.get_required_params(), **{
            'dataset_location': str,
            'dataset': ['LJ', 'MAILABS'],
            'num_audio_features': None,
            'output_type': ['magnitude', 'mel', 'both'],
            'vocab_file': str,
            'dataset_files': list,
            'feature_normalize': bool,
        }
    )

[docs]  @staticmethod
  def get_optional_params():
    return dict(
        DataLayer.get_optional_params(), **{
            'pad_to': int,
            'mag_power': int,
            'pad_EOS': bool,
            'pad_value': float,
            'feature_normalize_mean': float,
            'feature_normalize_std': float,
            'trim': bool,
            'data_min': None,
            'duration_min': int,
            'duration_max': int,
            'mel_type': ['slaney', 'htk'],
            "exp_mag": bool,
            'style_input': [None, 'wav'],
            'n_samples_train': int,
            'n_samples_eval': int,
            'n_fft': int,
            'fmax': float,
            'max_normalization': bool,
            'use_cache': bool
        }
    )

[docs]  def __init__(self, params, model, num_workers=None, worker_id=None):
    """Text-to-speech data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **dataset** (str) --- The dataset to use. Currently 'LJ' for the LJSpeech
      1.1 dataset is supported.
    * **num_audio_features** (int) --- number of audio features to extract.
    * **output_type** (str) --- could be either "magnitude", or "mel".
    * **vocab_file** (str) --- path to vocabulary file.
    * **dataset_files** (list) --- list with paths to all dataset .csv files.
      File is assumed to be separated by "|".
    * **dataset_location** (string) --- string with path to directory where wavs
      are stored.
    * **feature_normalize** (bool) --- whether to normlize the data with a
      preset mean and std
    * **feature_normalize_mean** (bool) --- used for feature normalize.
      Defaults to 0.
    * **feature_normalize_std** (bool) --- used for feature normalize.
      Defaults to 1.
    * **mag_power** (int) --- the power to which the magnitude spectrogram is
      scaled to. Defaults to 1.
      1 for energy spectrogram
      2 for power spectrogram
      Defaults to 2.
    * **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
      the speech signal. Will pad at least 1 token regardless of pad_to value.
      Defaults to True.
    * **pad_value** (float) --- The value we pad the spectrogram with. Defaults
      to np.log(data_min).
    * **pad_to** (int) --- we pad such that the resulting datapoint is a
      multiple of pad_to.
      Defaults to 8.
    * **trim** (bool) --- Whether to trim silence via librosa or not. Defaults
      to False.
    * **data_min** (float) --- min clip value prior to taking the log. Defaults
      to 1e-5. Please change to 1e-2 if using htk mels.
    * **duration_min** (int) --- Minimum duration in steps for speech signal.
      All signals less than this will be cut from the training set. Defaults to
      0.
    * **duration_max** (int) --- Maximum duration in steps for speech signal.
      All signals greater than this will be cut from the training set. Defaults
      to 4000.
    * **mel_type** (str) --- One of ['slaney', 'htk']. Decides which algorithm to
      use to compute mel specs.
      Defaults to htk.
    * **style_input** (str) --- Can be either None or "wav". Must be set to "wav"
      for GST. Defaults to None.
    * **n_samples_train** (int) --- number of the shortest examples to use for training.
    * **n_samples_eval** (int) --- number of the shortest examples to use for evaluation.
    * **n_fft** (int) --- FFT window size.
    * **fmax** (float) --- highest frequency to use.
    * **max_normalization** (bool) --- whether to divide the final audio signal
      by its' absolute maximum.
    * **use_cache** (bool) --- whether to use cache.

    """
    super(Text2SpeechDataLayer, self).__init__(
        params,
        model,
        num_workers,
        worker_id
    )

    self.use_cache = self.params.get('use_cache', False)
    self._cache = {}

    names = ['wav_filename', 'raw_transcript', 'transcript']
    sep = '\x7c'
    header = None

    if self.params["dataset"] == "LJ":
      self._sampling_rate = 22050
      self._n_fft = self.params.get("n_fft", 1024)
    elif self.params["dataset"] == "MAILABS":
      self._sampling_rate = 16000
      self._n_fft = 800

    # Character level vocab
    self.params['char2idx'] = load_pre_existing_vocabulary(
        self.params['vocab_file'],
        min_idx=3,
        read_chars=True,
    )
    # Add the pad, start, and end chars
    self.params['char2idx']['<p>'] = 0
    self.params['char2idx']['<s>'] = 1
    self.params['char2idx']['</s>'] = 2
    self.params['idx2char'] = {i: w for w, i in self.params['char2idx'].items()}
    self.params['src_vocab_size'] = len(self.params['char2idx'])

    self.max_normalization = self.params.get('max_normalization', False)

    n_feats = self.params['num_audio_features']
    if "both" in self.params["output_type"]:
      self._both = True
      if self.params["feature_normalize"]:
        raise ValueError(
            "feature normalize is not currently enabled for both mode"
        )
      if not isinstance(n_feats, dict):
        raise ValueError(
            "num_audio_features must be a dictionary for both mode"
        )
      else:
        if ("mel" not in n_feats and
            "magnitude" not in n_feats):
          raise ValueError(
            "num_audio_features must contain mel and magnitude keys"
          )
        elif (not isinstance(n_feats["mel"], int) or
              not isinstance(n_feats["magnitude"], int)):
            raise ValueError(
                "num_audio_features must be a int"
            )
      n_mels = n_feats['mel']
      data_min = self.params.get("data_min", None)
      if data_min is not None:
        if not isinstance(data_min, dict):
          raise ValueError(
              "data_min must be a dictionary for both mode"
          )
        else:
          if "mel" not in data_min and "magnitude" not in data_min:
            raise ValueError(
              "data_min must contain mel and magnitude keys"
            )
          elif (not isinstance(data_min["mel"], float) or 
                not isinstance(data_min["magnitude"], float)):
            raise ValueError(
                "data_min must be a float"
            )
      self._exp_mag = self.params.get("exp_mag", True)
    else:
      if not isinstance(n_feats, int):
        raise ValueError(
            "num_audio_features must be a float for mel or magnitude mode"
        )
      if not isinstance(self.params.get("data_min",1.0), float):
        raise ValueError(
            "data_min must be a float for mel or magnitude mode"
        )
      self._both = False
      self._exp_mag = False
      n_mels = n_feats

    self._mel = "mel" in self.params["output_type"]

    if self._mel or self._both:
      htk = True
      norm = None
      if self.params.get('mel_type', 'htk') == 'slaney':
        htk = False
        norm = 1
      self._mel_basis = librosa.filters.mel(
          sr=self._sampling_rate,
          n_fft=self._n_fft,
          n_mels=n_mels,
          htk=htk,
          norm=norm,
          fmax=self.params.get('fmax', None)
      )
    else:
      self._mel_basis = None

    if self.params["interactive"]:
      return

    # Load csv files
    self._files = None
    for csvs in params['dataset_files']:
      files = pd.read_csv(
          csvs,
          encoding='utf-8',
          sep=sep,
          header=header,
          names=names,
          quoting=3
      )
      if self._files is None:
        self._files = files
      else:
        self._files = self._files.append(files)

    if self.params['mode'] == 'train' and 'n_samples_train' in self.params:
      indices = self._files['transcript'].str.len().sort_values().index
      self._files = self._files.reindex(indices)

      n_samples = self.params.get('n_samples_train')
      print('Using just the {} shortest samples'.format(n_samples))
      self._files = self._files.iloc[:n_samples]

    if self.params['mode'] == 'eval':
      indices = self._files['transcript'].str.len().sort_values().index
      self._files = self._files.reindex(indices)

      if 'n_samples_eval' in self.params:
        n_samples = self.params['n_samples_eval']
        self._files = self._files.iloc[:n_samples]

    if (self.params['mode'] != 'infer'
        or self.params.get("style_input", None) == "wav"):
      cols = ['wav_filename', 'transcript']
    else:
      cols = 'transcript'

    all_files = self._files.loc[:, cols].values
    self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

[docs]  def split_data(self, data):
    if self.params['mode'] != 'train' and self._num_workers is not None:
      size = len(data)
      start = size // self._num_workers * self._worker_id
      if self._worker_id == self._num_workers - 1:
        end = size
      else:
        end = size // self._num_workers * (self._worker_id + 1)
      return data[start:end]
    return data

  @property
  def iterator(self):
    return self._iterator

[docs]  def build_graph(self):
    with tf.device('/cpu:0'):
      """Builds data reading graph."""

      self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
      if self.params['shuffle']:
        self._dataset = self._dataset.shuffle(self._size)
      self._dataset = self._dataset.repeat()

      if self._both:
        num_audio_features = self.params['num_audio_features']['mel']
        num_audio_features += self.params['num_audio_features']['magnitude']
      else:
        num_audio_features = self.params['num_audio_features']

      if (self.params['mode'] != 'infer'
          or self.params.get("style_input", None) == "wav"):
        self._dataset = self._dataset.map(
            lambda line: tf.py_func(
                self._parse_audio_transcript_element,
                [line],
                [tf.int32, tf.int32, self.params['dtype'], self.params['dtype'],\
                 tf.int32],
                stateful=False,
            ),
            num_parallel_calls=8,
        )
        if (self.params.get("duration_max", None) or
            self.params.get("duration_max", None)):
          self._dataset = self._dataset.filter(
              lambda txt, txt_len, spec, stop, spec_len:
                  tf.logical_and(
                      tf.less_equal(
                          spec_len,
                          self.params.get("duration_max", 4000)
                      ),
                      tf.greater_equal(
                          spec_len,
                          self.params.get("duration_min", 0)
                      )
                  )
          )
        if self._both:
          default_pad_value = 0.
        else:
          default_pad_value = np.log(self.params.get("data_min", 1e-5))
        pad_value = self.params.get("pad_value", default_pad_value)
        if self.params["feature_normalize"]:
          pad_value = self._normalize(pad_value)
        self._dataset = self._dataset.padded_batch(
            self.params['batch_size'],
            padded_shapes=(
                [None], 1, [None, num_audio_features], [None], 1
            ),
            padding_values=(
                0, 0, tf.cast(pad_value, dtype=self.params['dtype']),
                tf.cast(1., dtype=self.params['dtype']), 0
            )
        )
      else:
        self._dataset = self._dataset.map(
            lambda line: tf.py_func(
                self._parse_transcript_element,
                [line],
                [tf.int32, tf.int32],
                stateful=False,
            ),
            num_parallel_calls=8,
        )
        self._dataset = self._dataset.padded_batch(
            self.params['batch_size'], padded_shapes=([None], 1)
        )

      self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\
                                    .make_initializable_iterator()

      if (self.params['mode'] != 'infer'
          or self.params.get("style_input", None) == "wav"):
        text, text_length, spec, stop_token_target, spec_length = self._iterator\
                                                                      .get_next()
        # need to explicitly set batch size dimension
        # (it is employed in the model)
        spec.set_shape(
            [self.params['batch_size'], None, num_audio_features]
        )
        stop_token_target.set_shape([self.params['batch_size'], None])
        spec_length = tf.reshape(spec_length, [self.params['batch_size']])
      else:
        text, text_length = self._iterator.get_next()
      text.set_shape([self.params['batch_size'], None])
      text_length = tf.reshape(text_length, [self.params['batch_size']])

      self._input_tensors = {}
      self._input_tensors["source_tensors"] = [text, text_length]
      if self.params.get("style_input", None) == "wav":
        # mag - not supported currently
        if not self._mel and not self._both:
          raise ValueError(
              "GST is currently only supported on mel and both output modes.")
        # mel
        mel_spec = spec
        if self._both:
          mel_spec, _ = tf.split(
              mel_spec,
              [self.params['num_audio_features']['mel'],
               self.params['num_audio_features']['magnitude']],
              axis=2
          )
        self._input_tensors["source_tensors"].extend([mel_spec, spec_length])
        # both
      if self.params['mode'] != 'infer':
        self._input_tensors['target_tensors'] = [
            spec, stop_token_target, spec_length
        ]

[docs]  def _parse_audio_transcript_element(self, element):
    """Parses tf.data element from TextLineDataset into audio and text.

    Args:
      element: tf.data element from TextLineDataset.

    Returns:
      tuple: text_input text as `np.array` of ids, text_input length,
      target audio features as `np.array`, stop token targets as `np.array`,
      length of target sequence.

    """
    audio_filename, transcript = element
    transcript = transcript.lower()
    if six.PY2:
      audio_filename = unicode(audio_filename, "utf-8")
      transcript = unicode(transcript, "utf-8")
    elif not isinstance(transcript, string_types):
      audio_filename = str(audio_filename, "utf-8")
      transcript = str(transcript, "utf-8")
    text_input = np.array(
        [self.params['char2idx'][c] for c in transcript]
    )
    pad_to = self.params.get('pad_to', 8)
    if self.params.get("pad_EOS", True):
      num_pad = pad_to - ((len(text_input) + 2) % pad_to)
      text_input = np.pad(
          text_input, ((1, 1)),
          "constant",
          constant_values=(
              (self.params['char2idx']["<s>"], self.params['char2idx']["</s>"])
          )
      )
      text_input = np.pad(
          text_input, ((0, num_pad)),
          "constant",
          constant_values=self.params['char2idx']["<p>"]
      )

    # Mainly used for GST
    if "wavs" in audio_filename:
      file_path = os.path.join(
          self.params['dataset_location'], audio_filename + ".wav"
      )
    # Default path for LJ and MAILABS
    else:
      file_path = os.path.join(
          self.params['dataset_location'], "wavs", audio_filename + ".wav"
      )
    if self._mel:
      features_type = "mel_htk"
      if self.params.get('mel_type', 'htk') == 'slaney':
        features_type = "mel_slaney"
    else:
      features_type = self.params['output_type']

    if self.use_cache and audio_filename in self._cache:
      spectrogram = self._cache[audio_filename]
    else:
      spectrogram = get_speech_features_from_file(
          file_path,
          self.params['num_audio_features'],
          features_type=features_type,
          n_fft=self._n_fft,
          mag_power=self.params.get('mag_power', 2),
          feature_normalize=self.params["feature_normalize"],
          mean=self.params.get("feature_normalize_mean", 0.),
          std=self.params.get("feature_normalize_std", 1.),
          trim=self.params.get("trim", False),
          data_min=self.params.get("data_min", 1e-5),
          mel_basis=self._mel_basis
      )

      if self.use_cache:
        self._cache[audio_filename] = spectrogram

    if self._both:
      mel_spectrogram, spectrogram = spectrogram
      if self._exp_mag:
        spectrogram = np.exp(spectrogram)

    stop_token_target = np.zeros(
        [len(spectrogram)], dtype=self.params['dtype'].as_numpy_dtype()
    )
    if self.params.get("pad_EOS", True):
      num_pad = pad_to - ((len(spectrogram) + 1) % pad_to) + 1

      data_min = self.params.get("data_min", 1e-5)
      if isinstance(data_min, dict):
        pad_value_mel = self.params.get("pad_value", np.log(data_min["mel"]))
        if self._exp_mag:
          pad_value_mag = self.params.get("pad_value", data_min["magnitude"])
        else:
          pad_value_mag = self.params.get("pad_value", np.log(data_min["magnitude"]))
      else:
        pad_value = self.params.get("pad_value", np.log(data_min))
        if self.params["feature_normalize"]:
          pad_value = self._normalize(pad_value)
          pad_value_mel = pad_value_mag = pad_value

      if self._both:
        mel_spectrogram = np.pad(
            mel_spectrogram,
            # ((8, num_pad), (0, 0)),
            ((0, num_pad), (0, 0)),
            "constant",
            constant_values=pad_value_mel
        )
        spectrogram = np.pad(
            spectrogram,
            # ((8, num_pad), (0, 0)),
            ((0, num_pad), (0, 0)),
            "constant",
            constant_values=pad_value_mag
        )
        spectrogram = np.concatenate((mel_spectrogram, spectrogram), axis=1)
      else:
        spectrogram = np.pad(
            spectrogram,
            # ((8, num_pad), (0, 0)),
            ((0, num_pad), (0, 0)),
            "constant",
            constant_values=pad_value
        )
      stop_token_target = np.pad(
          stop_token_target, ((0, num_pad)), "constant", constant_values=1
      )
    else:
      stop_token_target[-1] = 1.

    assert len(text_input) % pad_to == 0
    assert len(spectrogram) % pad_to == 0
    return np.int32(text_input), \
           np.int32([len(text_input)]), \
           spectrogram.astype(self.params['dtype'].as_numpy_dtype()), \
           stop_token_target.astype(self.params['dtype'].as_numpy_dtype()), \
           np.int32([len(spectrogram)])

[docs]  def _parse_transcript_element(self, transcript):
    """Parses text from file and returns array of text features.

    Args:
      transcript: the string to parse.

    Returns:
      tuple: target text as `np.array` of ids, target text length.
    """

    if six.PY2:
      transcript = unicode(transcript, "utf-8")
    elif not isinstance(transcript, string_types):
      transcript = str(transcript, "utf-8")
    transcript = transcript.lower()

    text_input = np.array(
        [self.params['char2idx'].get(c,3) for c in transcript]
    )
    pad_to = self.params.get('pad_to', 8)
    if self.params.get("pad_EOS", True):
      num_pad = pad_to - ((len(text_input) + 2) % pad_to)
      text_input = np.pad(
          text_input, ((1, 1)),
          "constant",
          constant_values=(
              (self.params['char2idx']["<s>"], self.params['char2idx']["</s>"])
          )
      )
      text_input = np.pad(
          text_input, ((0, num_pad)),
          "constant",
          constant_values=self.params['char2idx']["<p>"]
      )

    return np.int32(text_input), \
           np.int32([len(text_input)])

[docs]  def parse_text_output(self, text):
    text = "".join([self.params['idx2char'][k] for k in text])
    return text

[docs]  def create_interactive_placeholders(self):
    self._text = tf.placeholder(
        dtype=tf.int32,
        shape=[self.params["batch_size"], None]
    )
    self._text_length = tf.placeholder(
        dtype=tf.int32,
        shape=[self.params["batch_size"]]
    )

    self._input_tensors = {}
    self._input_tensors["source_tensors"] = [self._text, self._text_length]

[docs]  def create_feed_dict(self, model_in):
    """ Creates the feed dict for interactive infer

    Args:
      model_in (str): The string to be spoken.

    Returns:
      feed_dict (dict): Dictionary with values for the placeholders.
    """
    text = []
    text_length = []
    for line in model_in:
      if not isinstance(line, string_types):
        raise ValueError(
            "Text2Speech's interactive inference mode only supports string.",
            "Got {}". format(type(line))
        )
      text_a, text_length_a = self._parse_transcript_element(line)
      text.append(text_a)
      text_length.append(text_length_a)
    max_len = np.max(text_length)
    for i, line in enumerate(text):
      line = np.pad(
          line, ((0, max_len-len(line))),
          "constant", constant_values=self.params['char2idx']["<p>"]
      )
      text[i] = line

    text = np.reshape(text, [self.params["batch_size"], -1])
    text_length = np.reshape(text_length, [self.params["batch_size"]])

    feed_dict = {
        self._text: text,
        self._text_length: text_length,
    }
    return feed_dict

  @property
  def input_tensors(self):
    return self._input_tensors

  @property
  def sampling_rate(self):
    return self._sampling_rate

  @property
  def n_fft(self):
    return self._n_fft

[docs]  def get_size_in_samples(self):
    """Returns the number of audio files."""
    return len(self._files)

[docs]  def get_magnitude_spec(self, spectrogram, is_mel=False):
    """Returns an energy magnitude spectrogram. The processing depends on the
    data layer params.

    Args:
      spectrogram: output spec from model

    Returns:
      mag_spec: mag spec
    """
    spectrogram = spectrogram.astype(float)
    if self._mel or (is_mel and self._both):
      htk = True
      norm = None
      if self.params.get('mel_type', 'htk') == 'slaney':
        htk = False
        norm = 1
      n_feats = self.params['num_audio_features']
      if self._both:
        n_feats = n_feats["mel"]
      return inverse_mel(
          spectrogram,
          fs=self._sampling_rate,
          n_fft=self._n_fft,
          n_mels=n_feats,
          power=self.params.get('mag_power', 2),
          feature_normalize=self.params["feature_normalize"],
          mean=self.params.get("feature_normalize_mean", 0.),
          std=self.params.get("feature_normalize_std", 1.),
          mel_basis=self._mel_basis,
          htk=htk,
          norm=norm
      )
    # Else it is a mag spec
    else:
      if self.params["feature_normalize"]:
        spectrogram = self._denormalize(spectrogram)
      n_feats = self.params['num_audio_features']
      data_min = self.params.get("data_min", 1e-5)
      if self._both:
        n_feats = n_feats["magnitude"]
        if isinstance(data_min, dict):
          data_min = data_min["magnitude"]
        if not self._exp_mag:
          data_min = np.log(data_min)
      else:
        data_min = np.log(data_min)
      # Ensure that num_features is consistent with n_fft
      if n_feats < self._n_fft // 2 + 1:
        num_pad = (self._n_fft // 2 + 1) - spectrogram.shape[1]
        spectrogram = np.pad(
            spectrogram,
            ((0, 0), (0, num_pad)),
            "constant",
            constant_values=data_min
        )
      mag_spec = spectrogram * 1.0 / self.params.get('mag_power', 2)
      if not self._both and not self._exp_mag:
        mag_spec = np.exp(mag_spec)
      return mag_spec

  def _normalize(self, spectrogram):
    return normalize(
        spectrogram,
        mean=self.params.get("feature_normalize_mean", 0.),
        std=self.params.get("feature_normalize_std", 1.)
    )

  def _denormalize(self, spectrogram):
    return denormalize(
        spectrogram,
        mean=self.params.get("feature_normalize_mean", 0.),
        std=self.params.get("feature_normalize_std", 1.)
    )