Source code for data.speech2text.speech_commands

import os
import six
import numpy as np
import tensorflow as tf
import pandas as pd
import librosa

from open_seq2seq.data.data_layer import DataLayer
from open_seq2seq.data.text2speech.speech_utils import \
  get_speech_features_from_file

class SpeechCommandsDataLayer(DataLayer):

  @staticmethod
  def get_required_params():
    return dict(DataLayer.get_required_params(), **{
        "dataset_files": list,
        "dataset_location": str,
        "num_audio_features": int,
        "audio_length": int,
        "num_labels": int,
        "model_format": str
    })

  @staticmethod
  def get_optional_params():
    return dict(DataLayer.get_optional_params(), **{
        "cache_data": bool,
        "augment_data": bool
    })

  def split_data(self, data):
    """Splits the dataset evenly between workers when not training."""
    if self.params["mode"] != "train" and self._num_workers is not None:
      size = len(data)
      start = size // self._num_workers * self._worker_id
      if self._worker_id == self._num_workers - 1:
        end = size
      else:
        end = size // self._num_workers * (self._worker_id + 1)
      return data[start:end]
    return data

  @property
  def input_tensors(self):
    return self._input_tensors

  @property
  def iterator(self):
    return self._iterator

  def get_size_in_samples(self):
    if self._files is not None:
      return len(self._files)
    else:
      return 0

  def __init__(self, params, model, num_workers=None, worker_id=None):
    """ResNet Speech Commands data layer constructor.

    Config parameters:

    * **dataset_files** (list) --- list with paths to all dataset .csv files
    * **dataset_location** (str) --- string with path to directory where .wavs
      are stored
    * **num_audio_features** (int) --- number of spectrogram audio features and
      image length
    * **audio_length** (int) --- cropping length of spectrogram and image width
    * **num_labels** (int) --- number of classes in dataset
    * **model_format** (str) --- determines input format, should be one of
      "jasper" or "resnet"
    * **cache_data** (bool) --- cache the training data in the first epoch
    * **augment_data** (bool) --- add time stretch and noise to training data
    """
    super(SpeechCommandsDataLayer, self).__init__(params, model, num_workers,
                                                  worker_id)

    if self.params["mode"] == "infer":
      raise ValueError(
          "Inference is not supported on SpeechCommandsDataLayer")

    self._files = None
    for file in self.params["dataset_files"]:
      csv_file = pd.read_csv(
          os.path.join(self.params["dataset_location"], file),
          encoding="utf-8",
          sep=",",
          header=None,
          names=["label", "wav_filename"],
          dtype=str
      )
      if self._files is None:
        self._files = csv_file
      else:
        # pd.DataFrame.append is not in-place; keep the returned frame
        self._files = self._files.append(csv_file)

    cols = ["label", "wav_filename"]
    if self._files is not None:
      all_files = self._files.loc[:, cols].values
      self._files = self.split_data(all_files)

    self._size = self.get_size_in_samples()
    self._iterator = None
    self._input_tensors = None
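
  # Example configuration (hypothetical values, not part of the original
  # source): in an OpenSeq2Seq config file this layer is typically wired in
  # through "data_layer" / "data_layer_params", e.g.
  #
  #   "data_layer": SpeechCommandsDataLayer,
  #   "data_layer_params": {
  #       "dataset_files": ["v1-train.csv"],
  #       "dataset_location": "data/speech_commands_v0.01/",
  #       "num_audio_features": 128,
  #       "audio_length": 128,
  #       "num_labels": 12,
  #       "model_format": "resnet",
  #       "cache_data": True,
  #       "augment_data": True,
  #   },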

  def preprocess_image(self, image):
    """Crops or pads a spectrogram into a fixed dimension square image."""
    num_audio_features = self.params["num_audio_features"]
    audio_length = self.params["audio_length"]

    if image.shape[0] > audio_length:
      # randomly slice
      offset = np.random.randint(0, image.shape[0] - audio_length + 1)
      image = image[offset:offset + audio_length, :]
    else:
      # symmetrically pad with zeros
      pad_left = (audio_length - image.shape[0]) // 2
      pad_right = (audio_length - image.shape[0]) // 2
      if (audio_length - image.shape[0]) % 2 == 1:
        pad_right += 1
      image = np.pad(
          image,
          ((pad_left, pad_right), (0, 0)),
          "constant"
      )

    assert image.shape == (audio_length, num_audio_features)

    # add dummy dimension
    if self.params["model_format"] == "jasper":  # for batch norm
      image = np.expand_dims(image, 1)
    else:  # for channel
      image = np.expand_dims(image, -1)
    return image

  def parse_element(self, element):
    """Reads an audio file and returns the augmented spectrogram image."""
    label, audio_filename = element

    if six.PY2:
      audio_filename = unicode(audio_filename, "utf-8")
    else:
      audio_filename = str(audio_filename, "utf-8")

    file_path = os.path.join(
        self.params["dataset_location"],
        audio_filename
    )

    if self.params["mode"] == "train" and self.params.get("augment_data", False):
      augmentation = {
          "pitch_shift_steps": 2,
          "time_stretch_ratio": 0.2,
          "noise_level_min": -90,
          "noise_level_max": -46,
      }
    else:
      augmentation = None

    spectrogram = get_speech_features_from_file(
        file_path,
        self.params["num_audio_features"],
        features_type="mel",
        data_min=1e-5,
        augmentation=augmentation
    )

    image = self.preprocess_image(spectrogram)

    return image.astype(self.params["dtype"].as_numpy_dtype()), \
        np.int32(self.params["num_audio_features"]), np.int32(label)

  def build_graph(self):
    dataset = tf.data.Dataset.from_tensor_slices(self._files)

    cache_data = self.params.get("cache_data", False)
    if not cache_data:
      if self.params["shuffle"]:
        dataset = dataset.shuffle(self._size)

    dataset = dataset.map(
        lambda line: tf.py_func(
            self.parse_element,
            [line],
            [self.params["dtype"], tf.int32, tf.int32],
            stateful=False
        ),
        num_parallel_calls=8
    )

    if cache_data:
      dataset = dataset.cache()
      if self.params["shuffle"]:
        dataset = dataset.shuffle(self._size)

    if self.params["repeat"]:
      dataset = dataset.repeat()

    dataset = dataset.batch(self.params["batch_size"])
    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

    self._iterator = dataset.make_initializable_iterator()
    inputs, lengths, labels = self._iterator.get_next()

    if self.params["model_format"] == "jasper":
      inputs.set_shape([
          self.params["batch_size"],
          self.params["audio_length"],
          1,
          self.params["num_audio_features"],
      ])  # B T 1 C
      lengths.set_shape([self.params["batch_size"]])
      source_tensors = [inputs, lengths]
    else:
      inputs.set_shape([
          self.params["batch_size"],
          self.params["num_audio_features"],
          self.params["num_audio_features"],
          1
      ])  # B W L C
      source_tensors = [inputs]

    labels = tf.one_hot(labels, self.params["num_labels"])
    labels.set_shape([self.params["batch_size"], self.params["num_labels"]])

    self._input_tensors = {
        "source_tensors": source_tensors,
        "target_tensors": [labels]
    }
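
The crop-or-pad step in preprocess_image, and the dummy dimension added for each
model_format, can be illustrated with a minimal standalone sketch (NumPy only;
the function name and shapes below are hypothetical examples, not part of the
original source):

import numpy as np

def crop_or_pad_demo(spectrogram, audio_length, model_format="resnet"):
  # mirrors preprocess_image: random crop when too long, symmetric zero-pad otherwise
  if spectrogram.shape[0] > audio_length:
    offset = np.random.randint(0, spectrogram.shape[0] - audio_length + 1)
    spectrogram = spectrogram[offset:offset + audio_length, :]
  else:
    pad_left = (audio_length - spectrogram.shape[0]) // 2
    pad_right = audio_length - spectrogram.shape[0] - pad_left  # odd remainder goes right
    spectrogram = np.pad(spectrogram, ((pad_left, pad_right), (0, 0)), "constant")
  # "jasper" gets a dummy axis for batch norm, "resnet" a trailing channel axis
  axis = 1 if model_format == "jasper" else -1
  return np.expand_dims(spectrogram, axis)

# an 80-frame, 128-feature mel spectrogram padded to a square 128x128 image
print(crop_or_pad_demo(np.ones((80, 128)), 128).shape)             # (128, 128, 1)
print(crop_or_pad_demo(np.ones((200, 128)), 128, "jasper").shape)  # (128, 1, 128)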