Source code for models.text2speech

# Copyright (c) 2019 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import librosa
import matplotlib as mpl
import numpy as np
from scipy.io.wavfile import write
from six import BytesIO
from six.moves import range

mpl.use('Agg')
import matplotlib.pyplot as plt
import tensorflow as tf

from .encoder_decoder import EncoderDecoderModel


def plot_spectrograms(
    specs,
    titles,
    stop_token_pred,
    audio_length,
    logdir,
    train_step,
    stop_token_target=None,
    number=0,
    append=False,
    save_to_tensorboard=False
):
  """
  Helper function to create an image to be logged to disk or a tf.Summary to
  be logged to tensorboard.

  Args:
    specs (array): array of images to show
    titles (array): array of titles. Must match length of specs array
    stop_token_pred (np.array): np.array of size [time, 1] containing the stop
      token predictions from the model.
    audio_length (int): length of the predicted spectrogram
    logdir (str): dir to save image file if save_to_tensorboard is disabled.
    train_step (int): current training step
    stop_token_target (np.array): np.array of size [time, 1] containing the
      stop token target.
    number (int): Current sample number (used if evaluating more than 1 sample
      from a batch)
    append (str): Optional string to append to file name e.g. train, eval,
      infer
    save_to_tensorboard (bool): If False, the created image is saved to the
      logdir as a png file. If True, the function returns a tf.Summary object
      containing the image, to be logged to the current tensorboard file.

  Returns:
    tf.Summary or None
  """
  num_figs = len(specs) + 1
  fig, ax = plt.subplots(nrows=num_figs, figsize=(8, num_figs * 3))

  for i, (spec, title) in enumerate(zip(specs, titles)):
    spec = np.pad(spec, ((1, 1), (1, 1)), "constant", constant_values=0.)
    spec = spec.astype(float)
    colour = ax[i].imshow(
        spec.T, cmap='viridis', interpolation=None, aspect='auto'
    )
    ax[i].invert_yaxis()
    ax[i].set_title(title)
    fig.colorbar(colour, ax=ax[i])

  if stop_token_target is not None:
    stop_token_target = stop_token_target.astype(float)
    ax[-1].plot(stop_token_target, 'r.')

  stop_token_pred = stop_token_pred.astype(float)
  ax[-1].plot(stop_token_pred, 'g.')
  ax[-1].axvline(x=audio_length)
  ax[-1].set_xlim(0, len(specs[0]))
  ax[-1].set_title("stop token")

  plt.xlabel('time')
  plt.tight_layout()
  cb = fig.colorbar(colour, ax=ax[-1])
  cb.remove()

  if save_to_tensorboard:
    tag = "{}_image".format(append)
    iostream = BytesIO()
    fig.savefig(iostream, dpi=300)
    summary = tf.Summary.Image(
        encoded_image_string=iostream.getvalue(),
        height=int(fig.get_figheight() * 300),
        width=int(fig.get_figwidth() * 300)
    )
    summary = tf.Summary.Value(tag=tag, image=summary)
    plt.close(fig)
    return summary
  else:
    if append:
      name = '{}/Output_step{}_{}_{}.png'.format(
          logdir, train_step, number, append
      )
    else:
      name = '{}/Output_step{}_{}.png'.format(logdir, train_step, number)
    if logdir[0] != '/':
      name = "./" + name
    # save
    fig.savefig(name, dpi=300)
    plt.close(fig)
    return None
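
# A minimal usage sketch for plot_spectrograms (not part of the original
# module). The arrays below are random placeholders chosen only to show the
# expected shapes; with save_to_tensorboard=False the figure is saved as a
# png under the current directory.
#
#   spec = np.random.rand(100, 80)            # [time, features]
#   stop_pred = np.random.rand(100, 1)        # [time, 1]
#   plot_spectrograms(
#       [spec], ["demo spectrogram"], stop_pred, audio_length=90,
#       logdir=".", train_step=0, append="demo"
#   )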

def save_audio(
    magnitudes,
    logdir,
    step,
    sampling_rate,
    n_fft=1024,
    mode="train",
    number=0,
    save_format="tensorboard",
    power=1.5,
    gl_iters=50,
    verbose=True,
    max_normalization=False
):
  """
  Helper function to create a wav file to be logged to disk or a tf.Summary to
  be logged to tensorboard.

  Args:
    magnitudes (np.array): np.array of size [time, n_fft/2 + 1] containing the
      energy spectrogram.
    logdir (str): dir to save wav file if save_to_tensorboard is disabled.
    step (int): current training step
    sampling_rate (int): sampling rate in Hz of the audio to be saved.
    n_fft (int): number of filters for fft and ifft.
    mode (str): Optional string to append to file name e.g. train, eval, infer
    number (int): Current sample number (used if evaluating more than 1 sample
      from a batch)
    save_format: save_audio can either return the np.array containing the
      generated sound, log the wav file to the disk, or return a tensorboard
      summary object. Each method can be enabled by passing save_format as
      "np.array", "tensorboard", or "disk" respectively.

  Returns:
    tf.Summary or None
  """
  # Clip signal max and min
  if np.min(magnitudes) < 0 or np.max(magnitudes) > 255:
    if verbose:
      print("WARNING: {} audio was clipped at step {}".format(
          mode.capitalize(), step))
    magnitudes = np.clip(magnitudes, a_min=0, a_max=255)

  signal = griffin_lim(magnitudes.T ** power, n_iters=gl_iters, n_fft=n_fft)
  if max_normalization:
    signal /= np.max(np.abs(signal))

  if save_format == "np.array":
    return signal
  elif save_format == "tensorboard":
    tag = "{}_audio".format(mode)
    iostream = BytesIO()
    write(iostream, sampling_rate, signal)
    summary = tf.Summary.Audio(encoded_audio_string=iostream.getvalue())
    summary = tf.Summary.Value(tag=tag, audio=summary)
    return summary
  elif save_format == "disk":
    file_name = '{}/sample_step{}_{}_{}.wav'.format(logdir, step, number, mode)
    if logdir[0] != '/':
      file_name = "./" + file_name
    write(file_name, sampling_rate, signal)
    return None
  else:
    print((
        "WARN: The save format passed to save_audio was not understood. No "
        "sound files will be saved for the current step. "
        "Received '{}'. "
        "Expected one of 'np.array', 'tensorboard', or 'disk'"
    ).format(save_format))
    return None
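
# A minimal usage sketch for save_audio (not part of the original module).
# The magnitude array here is an illustrative placeholder; in the model it
# comes from get_magnitude_spec() and has shape [time, n_fft/2 + 1]. With
# save_format="np.array" the reconstructed signal is returned directly and
# nothing is written to logdir, so "." is just a placeholder.
#
#   mags = np.random.uniform(0., 5., size=(200, 513))
#   signal = save_audio(
#       mags, ".", step=0, sampling_rate=22050,
#       n_fft=1024, save_format="np.array"
#   )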

def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
  """
  Griffin-Lim algorithm to convert magnitude spectrograms to audio signals.
  """
  phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
  complex_spec = magnitudes * phase
  signal = librosa.istft(complex_spec)
  if not np.isfinite(signal).all():
    print("WARNING: audio was not finite, skipping audio saving")
    return np.array([0])

  for _ in range(n_iters):
    _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
  return signal
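
# A minimal usage sketch for griffin_lim (not part of the original module).
# The input is assumed to be a linear-scale magnitude spectrogram of shape
# [n_fft/2 + 1, time], which is what save_audio passes after transposing.
# The sine-tone signal below is purely illustrative.
#
#   sr = 22050
#   t = np.arange(sr) / sr
#   sine = 0.5 * np.sin(2 * np.pi * 440 * t)           # 1 s, 440 Hz tone
#   mags = np.abs(librosa.stft(sine, n_fft=1024))      # [513, time]
#   audio = griffin_lim(mags, n_iters=50, n_fft=1024)  # phase recovered
#   write("./griffin_lim_demo.wav", sr, audio.astype(np.float32))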

class Text2Speech(EncoderDecoderModel):
  """
  Text-to-speech model.
  """

  @staticmethod
  def get_required_params():
    return dict(
        EncoderDecoderModel.get_required_params(), **{
            "save_to_tensorboard": bool,
        }
    )

  def __init__(self, params, mode="train", hvd=None):
    super(Text2Speech, self).__init__(params, mode=mode, hvd=hvd)
    self._save_to_tensorboard = self.params["save_to_tensorboard"]

  def print_logs(self,
                 mode,
                 specs,
                 titles,
                 stop_token_pred,
                 stop_target,
                 audio_length,
                 step,
                 predicted_final_spec,
                 predicted_mag_spec=None):
    """
    Save audio files and plots.

    Args:
      mode: "train" or "eval".
      specs: spectrograms to plot.
      titles: spectrogram titles.
      stop_token_pred: stop token prediction.
      stop_target: stop target.
      audio_length: length of the audio.
      step: current step.
      predicted_final_spec: predicted mel spectrogram.
      predicted_mag_spec: predicted magnitude spectrogram.

    Returns:
      Dictionary to log.
    """
    dict_to_log = {}
    im_summary = plot_spectrograms(
        specs,
        titles,
        stop_token_pred,
        audio_length,
        self.params["logdir"],
        step,
        append=mode,
        save_to_tensorboard=self._save_to_tensorboard,
        stop_token_target=stop_target
    )
    dict_to_log['image'] = im_summary

    if audio_length < 3:
      return {}

    if self._save_to_tensorboard:
      save_format = "tensorboard"
    else:
      save_format = "disk"

    if predicted_mag_spec is not None:
      predicted_mag_spec = predicted_mag_spec[:audio_length - 1, :]
      if self.get_data_layer()._exp_mag is False:
        predicted_mag_spec = np.exp(predicted_mag_spec)
      predicted_mag_spec = self.get_data_layer().get_magnitude_spec(
          predicted_mag_spec)
      wav_summary = save_audio(
          predicted_mag_spec,
          self.params["logdir"],
          step,
          n_fft=self.get_data_layer().n_fft,
          sampling_rate=self.get_data_layer().sampling_rate,
          mode=mode + "_mag",
          save_format=save_format
      )
      dict_to_log['audio_mag'] = wav_summary

    predicted_final_spec = predicted_final_spec[:audio_length - 1, :]
    predicted_final_spec = self.get_data_layer().get_magnitude_spec(
        predicted_final_spec, is_mel=True
    )
    wav_summary = save_audio(
        predicted_final_spec,
        self.params["logdir"],
        step,
        n_fft=self.get_data_layer().n_fft,
        sampling_rate=self.get_data_layer().sampling_rate,
        mode=mode,
        save_format=save_format,
        max_normalization=self.get_data_layer().max_normalization
    )
    dict_to_log['audio'] = wav_summary

    if self._save_to_tensorboard:
      return dict_to_log
    return {}

  def infer(self, input_values, output_values):
    if self.on_horovod:
      raise ValueError("Inference is not supported on horovod")
    return [input_values, output_values]

  def evaluate(self, input_values, output_values):
    # Need to reduce amount of data sent for horovod
    # Use last element
    idx = -1
    output_values = [(item[idx]) for item in output_values]
    input_values = {
        key: [value[0][idx], value[1][idx]]
        for key, value in input_values.items()
    }
    return [input_values, output_values]

  def get_alignments(self, attention_mask):
    """
    Get attention alignment plots.

    Args:
      attention_mask: attention alignment.

    Returns:
      Specs and titles to plot.
    """
    raise NotImplementedError()

  def finalize_inference(self, results_per_batch, output_file):
    print("output_file is ignored for tts")
    print("results are logged to the logdir")
    batch_size = len(results_per_batch[0][0]["source_tensors"][0])
    for i, sample in enumerate(results_per_batch):
      output_values = sample[1]
      predicted_final_specs = output_values[1]
      attention_mask = output_values[2]
      stop_tokens = output_values[3]
      sequence_lengths = output_values[4]
      for j in range(len(predicted_final_specs)):
        predicted_final_spec = predicted_final_specs[j]
        attention_mask_sample = attention_mask[j]
        stop_tokens_sample = stop_tokens[j]

        specs = [predicted_final_spec]
        titles = ["final spectrogram"]
        audio_length = sequence_lengths[j]

        alignment_specs, alignment_titles = self.get_alignments(
            attention_mask_sample)
        specs += alignment_specs
        titles += alignment_titles

        if "mel" in self.get_data_layer().params["output_type"]:
          mag_spec = self.get_data_layer().get_magnitude_spec(
              predicted_final_spec)
          log_mag_spec = np.log(np.clip(mag_spec, a_min=1e-5, a_max=None))
          specs.append(log_mag_spec)
          titles.append("magnitude spectrogram")
        elif "both" in self.get_data_layer().params["output_type"]:
          mag_spec = self.get_data_layer().get_magnitude_spec(
              predicted_final_spec, is_mel=True)
          specs.append(mag_spec)
          titles.append("mag spectrogram from mel basis")
          specs.append(output_values[5][j])
          titles.append("mag spectrogram from proj layer")

        im_summary = plot_spectrograms(
            specs,
            titles,
            stop_tokens_sample,
            audio_length,
            self.params["logdir"],
            0,
            number=i * batch_size + j,
            append="infer"
        )

        if audio_length > 2:
          if "both" in self.get_data_layer().params["output_type"]:
            predicted_mag_spec = output_values[5][j][:audio_length - 1, :]
            wav_summary = save_audio(
                predicted_mag_spec,
                self.params["logdir"],
                0,
                n_fft=self.get_data_layer().n_fft,
                sampling_rate=self.get_data_layer().sampling_rate,
                mode="infer_mag",
                number=i * batch_size + j,
                save_format="disk",
                max_normalization=self.get_data_layer().max_normalization
            )
          predicted_final_spec = predicted_final_spec[:audio_length - 1, :]
          predicted_final_spec = self.get_data_layer().get_magnitude_spec(
              predicted_final_spec, is_mel=True)
          wav_summary = save_audio(
              predicted_final_spec,
              self.params["logdir"],
              0,
              n_fft=self.get_data_layer().n_fft,
              sampling_rate=self.get_data_layer().sampling_rate,
              mode="infer",
              number=i * batch_size + j,
              save_format="disk",
              max_normalization=self.get_data_layer().max_normalization
          )

  def finalize_evaluation(self, results_per_batch, training_step=None,
                          samples_count=1):
    sample = results_per_batch[0]
    input_values = sample[0]
    output_values = sample[1]
    y_sample, stop_target = input_values["target_tensors"]
    predicted_spec = output_values[0]
    predicted_final_spec = output_values[1]
    attention_mask = output_values[2]
    stop_token_pred = output_values[3]
    audio_length = output_values[4]

    max_length = np.max([
        y_sample.shape[0],
        predicted_final_spec.shape[0],
    ])
    predictions_pad = np.zeros(
        [max_length - np.shape(predicted_final_spec)[0],
         np.shape(predicted_final_spec)[-1]]
    )
    stop_token_pred_pad = np.zeros(
        [max_length - np.shape(predicted_final_spec)[0], 1]
    )
    spec_pad = np.zeros(
        [max_length - np.shape(y_sample)[0], np.shape(y_sample)[-1]]
    )
    stop_token_pad = np.zeros([max_length - np.shape(y_sample)[0]])

    predicted_spec = np.concatenate(
        [predicted_spec, predictions_pad], axis=0
    )
    predicted_final_spec = np.concatenate(
        [predicted_final_spec, predictions_pad], axis=0
    )
    stop_token_pred = np.concatenate(
        [stop_token_pred, stop_token_pred_pad], axis=0
    )
    y_sample = np.concatenate([y_sample, spec_pad], axis=0)
    stop_target = np.concatenate([stop_target, stop_token_pad], axis=0)

    specs = [
        y_sample,
        predicted_spec,
        predicted_final_spec
    ]
    titles = [
        "training data",
        "decoder results",
        "post net results"
    ]
    alignment_specs, alignment_titles = self.get_alignments(attention_mask)
    specs += alignment_specs
    titles += alignment_titles

    predicted_mag_spec = None
    if "both" in self.get_data_layer().params["output_type"]:
      n_feats = self.get_data_layer().params["num_audio_features"]
      predicted_mag_spec = output_values[5]
      mag_pred_pad = np.zeros(
          [max_length - np.shape(predicted_mag_spec)[0], n_feats["magnitude"]]
      )
      predicted_mag_spec = np.concatenate(
          [predicted_mag_spec, mag_pred_pad], axis=0
      )
      specs.append(predicted_mag_spec)
      titles.append("magnitude spectrogram")
      mel, mag = np.split(
          y_sample,
          [n_feats["mel"]],
          axis=1
      )
      specs.insert(0, mel)
      specs[1] = mag
      titles.insert(0, "target mel")
      titles[1] = "target mag"

    return self.print_logs(
        mode="eval",
        specs=specs,
        titles=titles,
        stop_token_pred=stop_token_pred,
        stop_target=stop_target[0],
        audio_length=audio_length,
        step=training_step,
        predicted_final_spec=predicted_final_spec,
        predicted_mag_spec=predicted_mag_spec
    )

  def maybe_print_logs(self, input_values, output_values, training_step):
    spec, stop_target, _ = input_values['target_tensors']
    predicted_decoder_spec = output_values[0]
    predicted_final_spec = output_values[1]
    attention_mask = output_values[2]
    stop_token_pred = output_values[3]
    y_sample = spec[0]
    stop_target = stop_target[0]
    predicted_spec = predicted_decoder_spec[0]
    predicted_final_spec = predicted_final_spec[0]
    alignment = attention_mask[0]
    stop_token_pred = stop_token_pred[0]
    audio_length = output_values[4][0]

    specs = [
        y_sample,
        predicted_spec,
        predicted_final_spec
    ]
    titles = [
        "training data",
        "decoder results",
        "post net results"
    ]
    alignment_specs, alignment_titles = self.get_alignments(alignment)
    specs += alignment_specs
    titles += alignment_titles

    predicted_mag_spec = None
    if "both" in self.get_data_layer().params["output_type"]:
      predicted_mag_spec = output_values[5][0]
      specs.append(predicted_mag_spec)
      titles.append("magnitude spectrogram")
      n_feats = self.get_data_layer().params["num_audio_features"]
      mel, mag = np.split(
          y_sample,
          [n_feats["mel"]],
          axis=1
      )
      specs.insert(0, mel)
      specs[1] = mag
      titles.insert(0, "target mel")
      titles[1] = "target mag"

    return self.print_logs(
        mode="train",
        specs=specs,
        titles=titles,
        stop_token_pred=stop_token_pred,
        stop_target=stop_target,
        audio_length=audio_length,
        step=training_step,
        predicted_final_spec=predicted_final_spec,
        predicted_mag_spec=predicted_mag_spec
    )