# Copyright (c) 2018 NVIDIA Corporation
import os
import six
import numpy as np
import tensorflow as tf
import pandas as pd
from open_seq2seq.data.data_layer import DataLayer
from open_seq2seq.data.text2speech.speech_utils import \
  get_speech_features_from_file


class WavenetDataLayer(DataLayer):
  """ Text to speech data layer class for Wavenet """

  @staticmethod
  def get_required_params():
    return dict(
        DataLayer.get_required_params(), **{
            "num_audio_features": int,
            "dataset_files": list
        }
    )

  @staticmethod
  def get_optional_params():
    return dict(
        DataLayer.get_optional_params(), **{
            "dataset_location": str
        }
    )
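
  # A hypothetical config sketch (values are placeholders): the keys below
  # mirror the required/optional params declared above, plus "mode",
  # "batch_size", "shuffle", and "dtype", which this layer reads from
  # self.params:
  #
  #   params = {
  #       "mode": "train",
  #       "batch_size": 2,
  #       "shuffle": True,
  #       "dtype": tf.float32,
  #       "num_audio_features": 80,
  #       "dataset_files": ["/data/LJSpeech/train.csv"],
  #       "dataset_location": "/data/LJSpeech/wavs",
  #   }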

  def __init__(self, params, model, num_workers=None, worker_id=None):
    """
    Wavenet data layer constructor.

    See parent class for arguments description.

    Config parameters:

    * **num_audio_features** (int) --- number of spectrogram audio features
    * **dataset_files** (list) --- list with paths to all dataset .csv files
    * **dataset_location** (str) --- path to the directory where the wav
      files are stored
    """
    super(WavenetDataLayer, self).__init__(
        params,
        model,
        num_workers,
        worker_id
    )

    if self.params.get("dataset_location", None) is None:
      raise ValueError(
          "dataset_location must be specified when using LJSpeech"
      )

    # LJSpeech-style metadata: one line per utterance of the form
    # "wav_filename|raw_transcript|transcript", so the separator is the
    # pipe character ("\x7c" == "|")
    names = ["wav_filename", "raw_transcript", "transcript"]
    sep = "\x7c"
    header = None

    self.sampling_rate = 22050
    self.n_fft = 1024

    self._files = None
    for csvs in params["dataset_files"]:
      files = pd.read_csv(
          csvs,
          encoding="utf-8",
          sep=sep,
          header=header,
          names=names,
          quoting=3
      )
      if self._files is None:
        self._files = files
      else:
        self._files = self._files.append(files)

    cols = "wav_filename"
    if self._files is not None:
      all_files = self._files.loc[:, cols].values
      self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._dataset = None
    self._iterator = None
    self._input_tensors = None

  @property
  def input_tensors(self):
    return self._input_tensors

  def get_size_in_samples(self):
    if self._files is not None:
      return len(self._files)
    else:
      return 0

  def split_data(self, data):
    if self.params['mode'] != 'train' and self._num_workers is not None:
      size = len(data)
      start = size // self._num_workers * self._worker_id
      if self._worker_id == self._num_workers - 1:
        end = size
      else:
        end = size // self._num_workers * (self._worker_id + 1)
      return data[start:end]
    return data
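
  # Worked example of the sharding above (non-train mode): with
  # len(data) == 10 and self._num_workers == 3, size // num_workers == 3,
  # so worker 0 gets data[0:3], worker 1 gets data[3:6], and the last
  # worker (id 2) picks up the remainder, data[6:10].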

  @property
  def iterator(self):
    return self._iterator

  def _parse_audio_element(self, element):
    """Parses a tf.data element (a wav filename) into audio features."""
    audio_filename = element
    if six.PY2:
      audio_filename = unicode(audio_filename, "utf-8")
    else:
      audio_filename = str(audio_filename, "utf-8")

    file_path = os.path.join(
        self.params["dataset_location"],
        audio_filename + ".wav"
    )
    audio, spectrogram = get_speech_features_from_file(
        file_path,
        self.params["num_audio_features"],
        features_type="mel",
        data_min=1e-5,
        return_raw_audio=True
    )

    # pad the spectrogram with one extra frame (filled with the data_min
    # value) so the upsampled conditioning signal covers the whole waveform
    spectrogram = np.pad(
        spectrogram,
        ((0, 1), (0, 0)),
        "constant",
        constant_values=1e-5
    )
    assert len(audio) < len(spectrogram) * 256, \
        "audio len: {}, spec*256 len: {}".format(
            len(audio), len(spectrogram) * 256
        )

    # zero-pad the audio so its length is an exact multiple of 256 samples
    num_pad = len(spectrogram) * 256 - len(audio)
    audio = np.pad(
        audio,
        (0, num_pad),
        "constant",
        constant_values=0
    )

    # upsample the spectrogram to match source length by repeating each value
    spectrogram = np.repeat(spectrogram, 256, axis=0)

    return audio.astype(self.params["dtype"].as_numpy_dtype()), \
        np.int32([len(audio)]), \
        spectrogram.astype(self.params["dtype"].as_numpy_dtype()), \
        np.int32([len(spectrogram)])
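
  # Shape arithmetic for _parse_audio_element: one spectrogram frame
  # corresponds to 256 audio samples (the hard-coded upsampling factor).
  # A spectrogram of T frames is padded to T + 1 frames, the raw audio is
  # zero-padded up to (T + 1) * 256 samples, and the spectrogram is repeated
  # 256x along time, so both outputs share the same length. E.g. T = 80
  # frames yields aligned tensors of length 81 * 256 = 20736.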

  def _parse_spectrogram_element(self, element):
    audio, au_length, spectrogram, spec_length = \
        self._parse_audio_element(element)
    return spectrogram, spec_length

  def create_interactive_placeholders(self):
    self._source = tf.placeholder(
        dtype=self.params["dtype"],
        shape=[self.params["batch_size"], None]
    )
    self._src_length = tf.placeholder(
        dtype=tf.int32,
        shape=[self.params["batch_size"]]
    )
    self._spec = tf.placeholder(
        dtype=self.params["dtype"],
        shape=[self.params["batch_size"], None,
               self.params["num_audio_features"]]
    )
    self._spec_length = tf.placeholder(
        dtype=tf.int32,
        shape=[self.params["batch_size"]]
    )
    self._spec_offset = tf.placeholder(
        dtype=tf.int32,
        shape=()
    )

    self._input_tensors = {}
    self._input_tensors["source_tensors"] = [
        self._source, self._src_length, self._spec, self._spec_length,
        self._spec_offset
    ]

  def create_feed_dict(self, model_in):
    """
    Creates the feed dict for interactive infer using a spectrogram.

    Args:
      model_in: tuple containing the source audio, the length of the source,
        the conditioning spectrogram, the length of the spectrogram, and the
        index of the receptive field window
    """
    source, src_length, spec, spec_length, spec_offset = model_in
    return {
        self._source: source,
        self._src_length: src_length,
        self._spec: spec,
        self._spec_length: spec_length,
        self._spec_offset: spec_offset
    }
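
  # A minimal sketch of feeding the interactive placeholders, assuming
  # batch_size == 1 and num_audio_features == 80 (all values below are
  # illustrative placeholders):
  #
  #   model_in = (
  #       np.zeros((1, 20736), dtype=np.float32),      # source audio
  #       np.array([20736], dtype=np.int32),           # source length
  #       np.zeros((1, 20736, 80), dtype=np.float32),  # upsampled spectrogram
  #       np.array([20736], dtype=np.int32),           # spectrogram length
  #       np.int32(0)                                  # receptive field offset
  #   )
  #   feed_dict = data_layer.create_feed_dict(model_in)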

  def build_graph(self):
    """Builds the data reading graph."""
    self._dataset = tf.data.Dataset.from_tensor_slices(self._files)
    if self.params["shuffle"]:
      self._dataset = self._dataset.shuffle(self._size)
    self._dataset = self._dataset.repeat()

    num_audio_features = self.params["num_audio_features"]

    if self.params["mode"] != "infer":
      self._dataset = self._dataset.map(
          lambda line: tf.py_func(
              self._parse_audio_element,
              [line],
              [self.params["dtype"], tf.int32, self.params["dtype"], tf.int32],
              stateful=False
          ),
          num_parallel_calls=8
      )
      self._dataset = self._dataset.padded_batch(
          self.params["batch_size"],
          padded_shapes=([None], 1, [None, num_audio_features], 1)
      )
    else:
      raise ValueError("Non-interactive infer is not supported")

    self._iterator = self._dataset.prefetch(tf.contrib.data.AUTOTUNE) \
        .make_initializable_iterator()

    if self.params["mode"] != "infer":
      source, src_length, spec, spec_length = self._iterator.get_next()
      spec.set_shape([self.params["batch_size"], None, num_audio_features])
      spec_length = tf.reshape(spec_length, [self.params["batch_size"]])
      source.set_shape([self.params["batch_size"], None])
      src_length = tf.reshape(src_length, [self.params["batch_size"]])

      self._input_tensors = {}
      self._input_tensors["source_tensors"] = [
          source, src_length, spec, spec_length
      ]
      self._input_tensors["target_tensors"] = [source, src_length]
    else:
      raise ValueError("Non-interactive infer is not supported")
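
# A hypothetical end-to-end usage sketch (params and the model object are
# placeholders; see __init__ and build_graph above):
#
#   data_layer = WavenetDataLayer(params, model)
#   data_layer.build_graph()
#   with tf.Session() as sess:
#     sess.run(data_layer.iterator.initializer)
#     fetched = sess.run(data_layer.input_tensors["source_tensors"])
#     audio, src_length, spec, spec_length = fetched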