# Copyright (c) 2018 NVIDIA Corporation
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
import numpy as np
import librosa
import librosa.filters
import resampy as rs
def _augment_signal(signal, fs, augmentation):
    """Randomly perturb ``signal`` according to the ``augmentation`` dict.

    Each augmentation is optional and is skipped when its key(s) are missing.

    Args:
      signal (np.array): audio samples.
      fs (int): sampling frequency of ``signal`` in Hz.
      augmentation (dict): may contain
        'pitch_shift_steps': maximum absolute pitch shift; the applied shift is
          drawn uniformly from [-pitch_shift_steps, pitch_shift_steps).
        'time_stretch_ratio': when > 0, the signal is resampled by a factor
          drawn uniformly from [1 - ratio, 1 + ratio).
        'noise_level_min', 'noise_level_max': integer dB bounds passed to
          np.random.randint (low inclusive, high exclusive) for additive
          Gaussian noise.
    Returns:
      np.array: the augmented signal.
    """
    if 'pitch_shift_steps' in augmentation:
        max_steps = augmentation['pitch_shift_steps']
        pitch_shift_steps = 2.0 * max_steps * np.random.rand() - max_steps
        # Keyword arguments: newer librosa releases make sr/n_steps
        # keyword-only, and older releases accept the keywords as well.
        signal = librosa.effects.pitch_shift(
            signal, sr=fs, n_steps=pitch_shift_steps
        )

    time_stretch_ratio = augmentation.get('time_stretch_ratio', 0)
    if time_stretch_ratio > 0:
        stretch_amount = 1.0 + (2.0 * np.random.rand() - 1.0) * \
            time_stretch_ratio
        # The resampled signal is still treated as being sampled at ``fs`` by
        # the caller, which is what changes its apparent duration.
        signal = rs.resample(
            signal,
            fs,
            int(fs * stretch_amount),
            filter='kaiser_fast',
        )

    # NOTE(review): noise is applied independently of time stretching here;
    # confirm this matches the intended nesting of the original code.
    if 'noise_level_min' in augmentation and 'noise_level_max' in augmentation:
        noise_level_db = np.random.randint(
            low=augmentation['noise_level_min'],
            high=augmentation['noise_level_max']
        )
        # Convert the dB level to a linear amplitude factor. The in-place add
        # keeps the dtype of ``signal``.
        signal += np.random.randn(signal.shape[0]) * \
            10.0 ** (noise_level_db / 20.0)
    return signal


def get_speech_features_from_file(
    filename,
    num_features,
    features_type='magnitude',
    n_fft=1024,
    hop_length=None,
    mag_power=2,
    feature_normalize=False,
    mean=0.,
    std=1.,
    trim=False,
    data_min=1e-5,
    return_raw_audio=False,
    return_audio_duration=False,
    augmentation=None,
    mel_basis=None
):
    """ Helper function to retrieve spectrograms from wav files

    Args:
      filename (string): WAVE filename.
      num_features (int): number of speech features in frequency domain.
      features_type (string): 'magnitude' or 'mel'.
      n_fft (int): size of analysis window in samples.
      hop_length (int): stride of analysis window in samples. Defaults to
        n_fft / 4 when None.
      mag_power (int): power to raise magnitude spectrograms (prior to dot
        product with mel basis)
        1 for energy spectrograms
        2 for power spectrograms
      feature_normalize (bool): whether to normalize the data with mean and std
      mean (float): if normalize is enabled, the mean to normalize to
      std (float): if normalize is enabled, the deviation to normalize to
      trim (bool): Whether to trim silence via librosa or not
      data_min (float): min clip value prior to taking the log.
      return_raw_audio (bool): if True, return the (possibly trimmed and
        augmented) signal together with the features. Takes precedence over
        return_audio_duration.
      return_audio_duration (bool): if True, also return the signal duration
        in seconds.
      augmentation (dict): optional augmentation settings; supported keys are
        'pitch_shift_steps', 'time_stretch_ratio', 'noise_level_min' and
        'noise_level_max'. No augmentation is applied when None.
      mel_basis: optional pre-computed mel basis, forwarded to
        get_speech_features.

    Returns:
      The features returned by get_speech_features, or
      (signal, features) if return_raw_audio is set, or
      (features, duration_in_seconds) if return_audio_duration is set.
    """
    # load audio signal at its native sampling rate
    signal, fs = librosa.core.load(filename, sr=None)
    if hop_length is None:
        hop_length = int(n_fft / 4)
    if trim:
        signal, _ = librosa.effects.trim(
            signal,
            frame_length=int(n_fft / 2),
            hop_length=int(hop_length / 2)
        )

    if augmentation is not None:
        signal = _augment_signal(signal, fs, augmentation)

    speech_features = get_speech_features(
        signal, fs, num_features, features_type, n_fft,
        hop_length, mag_power, feature_normalize, mean, std, data_min,
        mel_basis
    )

    if return_raw_audio:
        return signal, speech_features
    if return_audio_duration:
        return speech_features, len(signal) * 1.0 / fs
    return speech_features
def get_speech_features(
    signal,
    fs,
    num_features,
    features_type='magnitude',
    n_fft=1024,
    hop_length=256,
    mag_power=2,
    feature_normalize=False,
    mean=0.,
    std=1.,
    data_min=1e-5,
    mel_basis=None
):
    """ Helper function to retrieve spectrograms from loaded wav

    Args:
      signal: signal loaded with librosa.
      fs (int): sampling frequency in Hz.
      num_features (int or dict): number of speech features in frequency
        domain. A dict must provide the keys "mel" and "magnitude".
      features_type (string): 'magnitude', 'both', or any string containing
        'mel'. If the string also contains 'slaney', the mel basis is built
        with htk=False and norm=1 instead of htk=True and norm=None.
      n_fft (int): size of analysis window in samples.
      hop_length (int): stride of analysis window in samples.
      mag_power (int): power to raise magnitude spectrograms (prior to dot
        product with mel basis)
        1 for energy spectrograms
        2 for power spectrograms
      feature_normalize (bool): whether to normalize the data with mean and
        std. For features_type 'both' only the mel features are normalized.
      mean (float): if normalize is enabled, the mean to normalize to
      std (float): if normalize is enabled, the deviation to normalize to
      data_min (float or dict): min clip value prior to taking the log. A dict
        must provide the keys "mel" and "magnitude".
      mel_basis: optional pre-computed mel basis; built with
        librosa.filters.mel when None and mel features are requested.

    Returns:
      np.array: np.array of audio features with shape=[num_time_steps,
      num_features]. For features_type 'both', a list
      [mel_features, magnitude_features].

    Raises:
      ValueError: if features_type selects neither magnitude nor mel features.
    """
    want_magnitude = features_type in ('magnitude', 'both')
    want_mel = 'mel' in features_type or features_type == 'both'
    if not (want_magnitude or want_mel):
        raise ValueError(
            "Unsupported features_type: {!r}; expected 'magnitude', 'both' or "
            "a type containing 'mel'".format(features_type)
        )

    if isinstance(data_min, dict):
        data_min_mel = data_min["mel"]
        data_min_mag = data_min["magnitude"]
    else:
        data_min_mel = data_min_mag = data_min

    if isinstance(num_features, dict):
        num_features_mel = num_features["mel"]
        num_features_mag = num_features["magnitude"]
    else:
        num_features_mel = num_features_mag = num_features

    # Forward hop_length so the documented stride is actually honoured.
    complex_spec = librosa.stft(y=signal, n_fft=n_fft, hop_length=hop_length)
    mag, _ = librosa.magphase(complex_spec, power=mag_power)

    if want_magnitude:
        features = np.log(np.clip(mag, a_min=data_min_mag, a_max=None)).T
        assert num_features_mag <= n_fft // 2 + 1, \
            "num_features for spectrogram should be <= (n_fft // 2 + 1)"
        # cut high frequency part
        features = features[:, :num_features_mag]

    if want_mel:
        if features_type == "both":
            # Keep the magnitude features; ``features`` is reused for mel.
            mag_features = features
        if mel_basis is None:
            htk = True
            norm = None
            if 'slaney' in features_type:
                htk = False
                norm = 1
            mel_basis = librosa.filters.mel(
                sr=fs,
                n_fft=n_fft,
                n_mels=num_features_mel,
                htk=htk,
                norm=norm
            )
        features = np.dot(mel_basis, mag)
        features = np.log(
            np.clip(features, a_min=data_min_mel, a_max=None)
        ).T

    if feature_normalize:
        features = normalize(features, mean, std)

    if features_type == "both":
        return [features, mag_features]
    return features
def get_mel(
    log_mag_spec,
    fs=22050,
    n_fft=1024,
    n_mels=80,
    power=2.,
    feature_normalize=False,
    mean=0,
    std=1,
    mel_basis=None,
    data_min=1e-5,
    htk=True,
    norm=None
):
    """
    Method to get mel spectrograms from magnitude spectrograms

    Args:
      log_mag_spec (np.array): log of the magnitude spec; its last dimension
        must match the number of columns of the mel basis.
      fs (int): sampling frequency in Hz
      n_fft (int): size of fft window in samples
      n_mels (int): number of mel features
      power (float): power of the mag spectrogram; the log spectrogram is
        multiplied by it before exponentiation
      feature_normalize (bool): whether to normalize the resulting log mel
        spec with mean and std
      mean (float): normalization param applied to the mel spec
      std (float): normalization param applied to the mel spec
      mel_basis (np.array): optional pre-computed mel basis to save
        computational time if passed. If not passed, it will call librosa to
        construct one
      data_min (float): min clip value prior to taking the log.
      htk (bool): whether to compute the mel spec with the htk or slaney
        algorithm; forwarded to librosa.filters.mel
      norm: forwarded to librosa.filters.mel. Original guidance: None for htk,
        and 1 for slaney. NOTE(review): newer librosa releases spell the
        slaney normalization norm='slaney' - confirm against the installed
        version.

    Returns:
      np.array: mel_spec with shape [time, n_mels]
    """
    if mel_basis is None:
        # Keyword arguments: newer librosa releases make sr/n_fft
        # keyword-only, and older releases accept the keywords as well.
        mel_basis = librosa.filters.mel(
            sr=fs,
            n_fft=n_fft,
            n_mels=n_mels,
            htk=htk,
            norm=norm
        )
    # exp(power * log(x)) == x ** power
    mag_spec = np.exp(log_mag_spec * power)
    mel_spec = np.dot(mag_spec, mel_basis.T)
    mel_spec = np.log(np.clip(mel_spec, a_min=data_min, a_max=None))
    if feature_normalize:
        mel_spec = normalize(mel_spec, mean, std)
    return mel_spec
def inverse_mel(
    log_mel_spec,
    fs=22050,
    n_fft=1024,
    n_mels=80,
    power=2.,
    feature_normalize=False,
    mean=0,
    std=1,
    mel_basis=None,
    htk=True,
    norm=None
):
    """
    Reconstructs magnitude spectrogram from a mel spectrogram by multiplying it
    with the transposed mel basis.

    Args:
      log_mel_spec (np.array): log of the mel spec
      fs (int): sampling frequency in Hz
      n_fft (int): size of fft window in samples
      n_mels (int): number of mel features
      power (float): power of the mag spectrogram that was used to generate
        the mel spec; the result is raised to 1 / power
      feature_normalize (bool): whether the mel spec was normalized; if so it
        is denormalized with mean and std before exponentiation
      mean (float): normalization param of mel spec
      std (float): normalization param of mel spec
      mel_basis (np.array): optional pre-computed mel basis to save
        computational time if passed. If not passed, it will call librosa to
        construct one
      htk (bool): whether to compute the mel spec with the htk or slaney
        algorithm; forwarded to librosa.filters.mel
      norm: forwarded to librosa.filters.mel. Original guidance: None for htk,
        and 1 for slaney. NOTE(review): newer librosa releases spell the
        slaney normalization norm='slaney' - confirm against the installed
        version.

    Returns:
      np.array: mag_spec with shape [time, n_fft/2 + 1]
    """
    if mel_basis is None:
        # Keyword arguments: newer librosa releases make sr/n_fft
        # keyword-only, and older releases accept the keywords as well.
        mel_basis = librosa.filters.mel(
            sr=fs,
            n_fft=n_fft,
            n_mels=n_mels,
            htk=htk,
            norm=norm
        )
    if feature_normalize:
        log_mel_spec = denormalize(log_mel_spec, mean, std)
    mel_spec = np.exp(log_mel_spec)
    # Project back with the mel basis itself (the transpose of the matrix used
    # in the forward direction); this is not an exact inverse.
    mag_spec = np.dot(mel_spec, mel_basis)
    return np.power(mag_spec, 1. / power)
def normalize(features, mean, std):
    """
    Standardizes features: subtracts the given mean, then divides by std
    """
    centered = features - mean
    return centered / std
def denormalize(features, mean, std):
    """
    Undoes normalization: multiplies features by std, then adds the mean
    """
    rescaled = features * std
    return rescaled + mean