# Copyright (c) 2018 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
import os
import six
import librosa
import numpy as np
import tensorflow as tf
import pandas as pd
from six import string_types
from open_seq2seq.data.data_layer import DataLayer
from open_seq2seq.data.utils import load_pre_existing_vocabulary
from .speech_utils import get_speech_features_from_file,\
inverse_mel, normalize, denormalize
class Text2SpeechDataLayer(DataLayer):
"""
Text-to-speech data layer class
"""
@staticmethod
def get_required_params():
return dict(
DataLayer.get_required_params(), **{
'dataset_location': str,
'dataset': ['LJ', 'MAILABS'],
'num_audio_features': None,
'output_type': ['magnitude', 'mel', 'both'],
'vocab_file': str,
'dataset_files': list,
'feature_normalize': bool,
}
)
@staticmethod
def get_optional_params():
return dict(
DataLayer.get_optional_params(), **{
'pad_to': int,
'mag_power': int,
'pad_EOS': bool,
'pad_value': float,
'feature_normalize_mean': float,
'feature_normalize_std': float,
'trim': bool,
'data_min': None,
'duration_min': int,
'duration_max': int,
'mel_type': ['slaney', 'htk'],
"exp_mag": bool,
'style_input': [None, 'wav'],
'n_samples_train': int,
'n_samples_eval': int,
'n_fft': int,
'fmax': float,
'max_normalization': bool,
'use_cache': bool
}
)
def __init__(self, params, model, num_workers=None, worker_id=None):
"""Text-to-speech data layer constructor.
See parent class for arguments description.
Config parameters:
* **dataset** (str) --- The dataset to use. Either 'LJ' for the LJSpeech
1.1 dataset or 'MAILABS' for the M-AILABS dataset.
* **num_audio_features** (int) --- number of audio features to extract. Must
be a dict with 'mel' and 'magnitude' keys when output_type is "both".
* **output_type** (str) --- could be "magnitude", "mel", or "both".
* **vocab_file** (str) --- path to vocabulary file.
* **dataset_files** (list) --- list with paths to all dataset .csv files.
File is assumed to be separated by "|".
* **dataset_location** (string) --- string with path to directory where wavs
are stored.
* **feature_normalize** (bool) --- whether to normalize the data with a
preset mean and std
* **feature_normalize_mean** (float) --- mean used for feature normalization.
Defaults to 0.
* **feature_normalize_std** (float) --- std used for feature normalization.
Defaults to 1.
* **mag_power** (int) --- the power to which the magnitude spectrogram is
raised:
1 for energy spectrogram
2 for power spectrogram
Defaults to 2.
* **pad_EOS** (bool) --- whether to apply EOS tokens to both the text and
the speech signal. Will pad at least 1 token regardless of pad_to value.
Defaults to True.
* **pad_value** (float) --- The value we pad the spectrogram with. Defaults
to np.log(data_min).
* **pad_to** (int) --- we pad such that the resulting datapoint is a
multiple of pad_to.
Defaults to 8.
* **trim** (bool) --- Whether to trim silence via librosa or not. Defaults
to False.
* **data_min** (float) --- min clip value prior to taking the log. Defaults
to 1e-5. Please change to 1e-2 if using htk mels.
* **duration_min** (int) --- Minimum duration in steps for speech signal.
All signals less than this will be cut from the training set. Defaults to
0.
* **duration_max** (int) --- Maximum duration in steps for speech signal.
All signals greater than this will be cut from the training set. Defaults
to 4000.
* **mel_type** (str) --- One of ['slaney', 'htk']. Decides which algorithm to
use to compute mel specs.
Defaults to htk.
* **style_input** (str) --- Can be either None or "wav". Must be set to "wav"
for GST. Defaults to None.
* **n_samples_train** (int) --- number of the shortest examples to use for training.
* **n_samples_eval** (int) --- number of the shortest examples to use for evaluation.
* **n_fft** (int) --- FFT window size.
* **fmax** (float) --- highest frequency to use.
* **max_normalization** (bool) --- whether to divide the final audio signal
by its absolute maximum.
* **use_cache** (bool) --- whether to use cache.
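A minimal illustrative config (paths and feature sizes below are
placeholders, not tested defaults)::

    data_layer_params = {
        'dataset': 'LJ',
        'dataset_location': '/data/LJSpeech-1.1',
        'dataset_files': ['/data/LJSpeech-1.1/metadata.csv'],
        'vocab_file': '/data/vocab.txt',
        'num_audio_features': {'mel': 80, 'magnitude': 513},
        'output_type': 'both',
        'feature_normalize': False,
    }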
"""
super(Text2SpeechDataLayer, self).__init__(
params,
model,
num_workers,
worker_id
)
self.use_cache = self.params.get('use_cache', False)
self._cache = {}
names = ['wav_filename', 'raw_transcript', 'transcript']
sep = '\x7c'
header = None
if self.params["dataset"] == "LJ":
self._sampling_rate = 22050
self._n_fft = self.params.get("n_fft", 1024)
elif self.params["dataset"] == "MAILABS":
self._sampling_rate = 16000
self._n_fft = 800
# Character level vocab
self.params['char2idx'] = load_pre_existing_vocabulary(
self.params['vocab_file'],
min_idx=3,
read_chars=True,
)
# Add the pad, start, and end chars
self.params['char2idx']['<p>'] = 0
self.params['char2idx']['<s>'] = 1
self.params['char2idx']['</s>'] = 2
self.params['idx2char'] = {i: w for w, i in self.params['char2idx'].items()}
self.params['src_vocab_size'] = len(self.params['char2idx'])
self.max_normalization = self.params.get('max_normalization', False)
n_feats = self.params['num_audio_features']
if "both" in self.params["output_type"]:
self._both = True
if self.params["feature_normalize"]:
raise ValueError(
"feature normalize is not currently enabled for both mode"
)
if not isinstance(n_feats, dict):
raise ValueError(
"num_audio_features must be a dictionary for both mode"
)
else:
if ("mel" not in n_feats and
"magnitude" not in n_feats):
raise ValueError(
"num_audio_features must contain mel and magnitude keys"
)
elif (not isinstance(n_feats["mel"], int) or
not isinstance(n_feats["magnitude"], int)):
raise ValueError(
"num_audio_features must be a int"
)
n_mels = n_feats['mel']
data_min = self.params.get("data_min", None)
if data_min is not None:
if not isinstance(data_min, dict):
raise ValueError(
"data_min must be a dictionary for both mode"
)
else:
if "mel" not in data_min and "magnitude" not in data_min:
raise ValueError(
"data_min must contain mel and magnitude keys"
)
elif (not isinstance(data_min["mel"], float) or
not isinstance(data_min["magnitude"], float)):
raise ValueError(
"data_min must be a float"
)
self._exp_mag = self.params.get("exp_mag", True)
else:
if not isinstance(n_feats, int):
raise ValueError(
"num_audio_features must be a float for mel or magnitude mode"
)
if not isinstance(self.params.get("data_min",1.0), float):
raise ValueError(
"data_min must be a float for mel or magnitude mode"
)
self._both = False
self._exp_mag = False
n_mels = n_feats
self._mel = "mel" in self.params["output_type"]
if self._mel or self._both:
htk = True
norm = None
if self.params.get('mel_type', 'htk') == 'slaney':
htk = False
norm = 1
self._mel_basis = librosa.filters.mel(
sr=self._sampling_rate,
n_fft=self._n_fft,
n_mels=n_mels,
htk=htk,
norm=norm,
fmax=self.params.get('fmax', None)
)
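# The mel filterbank has shape [n_mels, 1 + n_fft // 2]; it is reused
# both for feature extraction and for inverting mel spectrograms in
# get_magnitude_spec.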
else:
self._mel_basis = None
if self.params["interactive"]:
return
# Load csv files
self._files = None
for csvs in params['dataset_files']:
files = pd.read_csv(
csvs,
encoding='utf-8',
sep=sep,
header=header,
names=names,
quoting=3
)
if self._files is None:
self._files = files
else:
self._files = self._files.append(files)
if self.params['mode'] == 'train' and 'n_samples_train' in self.params:
indices = self._files['transcript'].str.len().sort_values().index
self._files = self._files.reindex(indices)
n_samples = self.params.get('n_samples_train')
print('Using just the {} shortest samples'.format(n_samples))
self._files = self._files.iloc[:n_samples]
if self.params['mode'] == 'eval':
indices = self._files['transcript'].str.len().sort_values().index
self._files = self._files.reindex(indices)
if 'n_samples_eval' in self.params:
n_samples = self.params['n_samples_eval']
self._files = self._files.iloc[:n_samples]
if (self.params['mode'] != 'infer'
or self.params.get("style_input", None) == "wav"):
cols = ['wav_filename', 'transcript']
else:
cols = 'transcript'
all_files = self._files.loc[:, cols].values
self._files = self.split_data(all_files)
self._size = self.get_size_in_samples()
self._dataset = None
self._iterator = None
self._input_tensors = None
def split_data(self, data):
"""Shards data evenly across workers in eval/infer modes; in train
mode, or when worker info is unavailable, returns data unchanged."""
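# Illustrative sharding: with len(data) == 100 and 4 workers, worker 0
# gets data[0:25], worker 1 data[25:50], worker 2 data[50:75], and the
# last worker takes the remainder, data[75:100].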
if self.params['mode'] != 'train' and self._num_workers is not None:
size = len(data)
start = size // self._num_workers * self._worker_id
if self._worker_id == self._num_workers - 1:
end = size
else:
end = size // self._num_workers * (self._worker_id + 1)
return data[start:end]
return data
@property
def iterator(self):
return self._iterator
def build_graph(self):
"""Builds data reading graph."""
with tf.device('/cpu:0'):
self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
if self.params['shuffle']:
self._dataset = self._dataset.shuffle(self._size)
self._dataset = self._dataset.repeat()
if self._both:
num_audio_features = self.params['num_audio_features']['mel']
num_audio_features += self.params['num_audio_features']['magnitude']
else:
num_audio_features = self.params['num_audio_features']
if (self.params['mode'] != 'infer'
or self.params.get("style_input", None) == "wav"):
self._dataset = self._dataset.map(
lambda line: tf.py_func(
self._parse_audio_transcript_element,
[line],
[tf.int32, tf.int32, self.params['dtype'], self.params['dtype'],\
tf.int32],
stateful=False,
),
num_parallel_calls=8,
)
if (self.params.get("duration_max", None) or
self.params.get("duration_max", None)):
self._dataset = self._dataset.filter(
lambda txt, txt_len, spec, stop, spec_len:
tf.logical_and(
tf.less_equal(
spec_len,
self.params.get("duration_max", 4000)
),
tf.greater_equal(
spec_len,
self.params.get("duration_min", 0)
)
)
)
if self._both:
default_pad_value = 0.
else:
default_pad_value = np.log(self.params.get("data_min", 1e-5))
pad_value = self.params.get("pad_value", default_pad_value)
if self.params["feature_normalize"]:
pad_value = self._normalize(pad_value)
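# Pad each batch to its longest element: text ids are padded with the
# <p> id (0), spectrograms with pad_value, and stop-token targets
# with 1 (the "stop" label).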
self._dataset = self._dataset.padded_batch(
self.params['batch_size'],
padded_shapes=(
[None], 1, [None, num_audio_features], [None], 1
),
padding_values=(
0, 0, tf.cast(pad_value, dtype=self.params['dtype']),
tf.cast(1., dtype=self.params['dtype']), 0
)
)
else:
self._dataset = self._dataset.map(
lambda line: tf.py_func(
self._parse_transcript_element,
[line],
[tf.int32, tf.int32],
stateful=False,
),
num_parallel_calls=8,
)
self._dataset = self._dataset.padded_batch(
self.params['batch_size'], padded_shapes=([None], 1)
)
self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE)\
.make_initializable_iterator()
if (self.params['mode'] != 'infer'
or self.params.get("style_input", None) == "wav"):
text, text_length, spec, stop_token_target, spec_length = self._iterator\
.get_next()
# need to explicitly set batch size dimension
# (it is employed in the model)
spec.set_shape(
[self.params['batch_size'], None, num_audio_features]
)
stop_token_target.set_shape([self.params['batch_size'], None])
spec_length = tf.reshape(spec_length, [self.params['batch_size']])
else:
text, text_length = self._iterator.get_next()
text.set_shape([self.params['batch_size'], None])
text_length = tf.reshape(text_length, [self.params['batch_size']])
self._input_tensors = {}
self._input_tensors["source_tensors"] = [text, text_length]
if self.params.get("style_input", None) == "wav":
# mag - not supported currently
if not self._mel and not self._both:
raise ValueError(
"GST is currently only supported on mel and both output modes.")
# mel
mel_spec = spec
if self._both:
mel_spec, _ = tf.split(
mel_spec,
[self.params['num_audio_features']['mel'],
self.params['num_audio_features']['magnitude']],
axis=2
)
self._input_tensors["source_tensors"].extend([mel_spec, spec_length])
# both
if self.params['mode'] != 'infer':
self._input_tensors['target_tensors'] = [
spec, stop_token_target, spec_length
]
def _parse_audio_transcript_element(self, element):
"""Parses a tf.data element from the dataset into audio and text.
Args:
element: a [wav_filename, transcript] pair from the dataset.
Returns:
tuple: text_input text as `np.array` of ids, text_input length,
target audio features as `np.array`, stop token targets as `np.array`,
length of target sequence.
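Example (illustrative): with the default pad_to of 8 and pad_EOS=True,
a 13-character transcript becomes 15 ids once <s> and </s> are added,
and one <p> pad brings it to 16; a 30-frame spectrogram gets
num_pad = 8 - ((30 + 1) % 8) + 1 = 2 frames of padding, 32 in total.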
"""
audio_filename, transcript = element
transcript = transcript.lower()
if six.PY2:
audio_filename = unicode(audio_filename, "utf-8")
transcript = unicode(transcript, "utf-8")
elif not isinstance(transcript, string_types):
audio_filename = str(audio_filename, "utf-8")
transcript = str(transcript, "utf-8")
text_input = np.array(
[self.params['char2idx'][c] for c in transcript]
)
pad_to = self.params.get('pad_to', 8)
if self.params.get("pad_EOS", True):
num_pad = pad_to - ((len(text_input) + 2) % pad_to)
text_input = np.pad(
text_input, ((1, 1)),
"constant",
constant_values=(
(self.params['char2idx']["<s>"], self.params['char2idx']["</s>"])
)
)
text_input = np.pad(
text_input, ((0, num_pad)),
"constant",
constant_values=self.params['char2idx']["<p>"]
)
# Mainly used for GST
if "wavs" in audio_filename:
file_path = os.path.join(
self.params['dataset_location'], audio_filename + ".wav"
)
# Default path for LJ and MAILABS
else:
file_path = os.path.join(
self.params['dataset_location'], "wavs", audio_filename + ".wav"
)
if self._mel:
features_type = "mel_htk"
if self.params.get('mel_type', 'htk') == 'slaney':
features_type = "mel_slaney"
else:
features_type = self.params['output_type']
if self.use_cache and audio_filename in self._cache:
spectrogram = self._cache[audio_filename]
else:
spectrogram = get_speech_features_from_file(
file_path,
self.params['num_audio_features'],
features_type=features_type,
n_fft=self._n_fft,
mag_power=self.params.get('mag_power', 2),
feature_normalize=self.params["feature_normalize"],
mean=self.params.get("feature_normalize_mean", 0.),
std=self.params.get("feature_normalize_std", 1.),
trim=self.params.get("trim", False),
data_min=self.params.get("data_min", 1e-5),
mel_basis=self._mel_basis
)
if self.use_cache:
self._cache[audio_filename] = spectrogram
if self._both:
mel_spectrogram, spectrogram = spectrogram
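# In exp_mag mode the magnitude spectrogram is mapped back to linear
# scale here, so the padding below uses data_min["magnitude"] directly
# rather than its log.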
if self._exp_mag:
spectrogram = np.exp(spectrogram)
stop_token_target = np.zeros(
[len(spectrogram)], dtype=self.params['dtype'].as_numpy_dtype()
)
if self.params.get("pad_EOS", True):
num_pad = pad_to - ((len(spectrogram) + 1) % pad_to) + 1
data_min = self.params.get("data_min", 1e-5)
if isinstance(data_min, dict):
pad_value_mel = self.params.get("pad_value", np.log(data_min["mel"]))
if self._exp_mag:
pad_value_mag = self.params.get("pad_value", data_min["magnitude"])
else:
pad_value_mag = self.params.get("pad_value", np.log(data_min["magnitude"]))
else:
pad_value = self.params.get("pad_value", np.log(data_min))
if self.params["feature_normalize"]:
pad_value = self._normalize(pad_value)
pad_value_mel = pad_value_mag = pad_value
if self._both:
mel_spectrogram = np.pad(
mel_spectrogram,
((0, num_pad), (0, 0)),
"constant",
constant_values=pad_value_mel
)
spectrogram = np.pad(
spectrogram,
((0, num_pad), (0, 0)),
"constant",
constant_values=pad_value_mag
)
spectrogram = np.concatenate((mel_spectrogram, spectrogram), axis=1)
else:
spectrogram = np.pad(
spectrogram,
((0, num_pad), (0, 0)),
"constant",
constant_values=pad_value
)
stop_token_target = np.pad(
stop_token_target, ((0, num_pad)), "constant", constant_values=1
)
else:
stop_token_target[-1] = 1.
assert len(text_input) % pad_to == 0
assert len(spectrogram) % pad_to == 0
return np.int32(text_input), \
np.int32([len(text_input)]), \
spectrogram.astype(self.params['dtype'].as_numpy_dtype()), \
stop_token_target.astype(self.params['dtype'].as_numpy_dtype()), \
np.int32([len(spectrogram)])
def _parse_transcript_element(self, transcript):
"""Parses a transcript string and returns an array of text features.
Args:
transcript: the string to parse.
Returns:
tuple: target text as `np.array` of ids, target text length.
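Example (illustrative, assuming 'a' maps to id 3 and 'b' to id 4):
"ab" becomes [1, 3, 4, 2, 0, 0, 0, 0] once <s>/</s> are added and the
result is <p>-padded to a multiple of pad_to (default 8); the returned
length is 8. Unknown characters fall back to id 3.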
"""
if six.PY2:
transcript = unicode(transcript, "utf-8")
elif not isinstance(transcript, string_types):
transcript = str(transcript, "utf-8")
transcript = transcript.lower()
text_input = np.array(
[self.params['char2idx'].get(c, 3) for c in transcript]
)
pad_to = self.params.get('pad_to', 8)
if self.params.get("pad_EOS", True):
num_pad = pad_to - ((len(text_input) + 2) % pad_to)
text_input = np.pad(
text_input, ((1, 1)),
"constant",
constant_values=(
(self.params['char2idx']["<s>"], self.params['char2idx']["</s>"])
)
)
text_input = np.pad(
text_input, ((0, num_pad)),
"constant",
constant_values=self.params['char2idx']["<p>"]
)
return np.int32(text_input), \
np.int32([len(text_input)])
def parse_text_output(self, text):
text = "".join([self.params['idx2char'][k] for k in text])
return text
def create_interactive_placeholders(self):
self._text = tf.placeholder(
dtype=tf.int32,
shape=[self.params["batch_size"], None]
)
self._text_length = tf.placeholder(
dtype=tf.int32,
shape=[self.params["batch_size"]]
)
self._input_tensors = {}
self._input_tensors["source_tensors"] = [self._text, self._text_length]
def create_feed_dict(self, model_in):
""" Creates the feed dict for interactive infer
Args:
model_in (list): list of strings to be spoken.
Returns:
feed_dict (dict): Dictionary with values for the placeholders.
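Example (illustrative, assuming a constructed layer named
``data_layer``)::

    data_layer.create_interactive_placeholders()
    feed_dict = data_layer.create_feed_dict(["hello world"])
    # feed_dict maps the text placeholders to padded id arrays and
    # can be passed to session.run together with the model fetches.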
"""
text = []
text_length = []
for line in model_in:
if not isinstance(line, string_types):
raise ValueError(
"Text2Speech's interactive inference mode only supports strings. "
"Got {}".format(type(line))
)
text_a, text_length_a = self._parse_transcript_element(line)
text.append(text_a)
text_length.append(text_length_a)
max_len = np.max(text_length)
for i, line in enumerate(text):
line = np.pad(
line, ((0, max_len-len(line))),
"constant", constant_values=self.params['char2idx']["<p>"]
)
text[i] = line
text = np.reshape(text, [self.params["batch_size"], -1])
text_length = np.reshape(text_length, [self.params["batch_size"]])
feed_dict = {
self._text: text,
self._text_length: text_length,
}
return feed_dict
@property
def input_tensors(self):
return self._input_tensors
@property
def sampling_rate(self):
return self._sampling_rate
@property
def n_fft(self):
return self._n_fft
def get_size_in_samples(self):
"""Returns the number of audio files."""
return len(self._files)
def get_magnitude_spec(self, spectrogram, is_mel=False):
"""Returns an energy magnitude spectrogram. The processing depends on the
data layer params.
Args:
spectrogram: output spectrogram from the model.
is_mel (bool): whether ``spectrogram`` is a mel spectrogram.
Returns:
mag_spec: an energy magnitude spectrogram as `np.array`.
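Example (illustrative, assuming a mel model output ``mel_out``)::

    mag = data_layer.get_magnitude_spec(mel_out, is_mel=True)
    # `mag` can then be passed to a phase reconstruction method such
    # as Griffin-Lim to synthesize a waveform.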
"""
spectrogram = spectrogram.astype(float)
if self._mel or (is_mel and self._both):
htk = True
norm = None
if self.params.get('mel_type', 'htk') == 'slaney':
htk = False
norm = 1
n_feats = self.params['num_audio_features']
if self._both:
n_feats = n_feats["mel"]
return inverse_mel(
spectrogram,
fs=self._sampling_rate,
n_fft=self._n_fft,
n_mels=n_feats,
power=self.params.get('mag_power', 2),
feature_normalize=self.params["feature_normalize"],
mean=self.params.get("feature_normalize_mean", 0.),
std=self.params.get("feature_normalize_std", 1.),
mel_basis=self._mel_basis,
htk=htk,
norm=norm
)
# Else it is a mag spec
else:
if self.params["feature_normalize"]:
spectrogram = self._denormalize(spectrogram)
n_feats = self.params['num_audio_features']
data_min = self.params.get("data_min", 1e-5)
if self._both:
n_feats = n_feats["magnitude"]
if isinstance(data_min, dict):
data_min = data_min["magnitude"]
if not self._exp_mag:
data_min = np.log(data_min)
else:
data_min = np.log(data_min)
# Ensure that num_features is consistent with n_fft
if n_feats < self._n_fft // 2 + 1:
num_pad = (self._n_fft // 2 + 1) - spectrogram.shape[1]
spectrogram = np.pad(
spectrogram,
((0, 0), (0, num_pad)),
"constant",
constant_values=data_min
)
mag_spec = spectrogram * 1.0 / self.params.get('mag_power', 2)
if not self._both and not self._exp_mag:
mag_spec = np.exp(mag_spec)
return mag_spec
def _normalize(self, spectrogram):
return normalize(
spectrogram,
mean=self.params.get("feature_normalize_mean", 0.),
std=self.params.get("feature_normalize_std", 1.)
)
def _denormalize(self, spectrogram):
return denormalize(
spectrogram,
mean=self.params.get("feature_normalize_mean", 0.),
std=self.params.get("feature_normalize_std", 1.)
)