Source code for sdp.processors.tts.metrics

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import librosa
import math
import numpy as np
from tqdm import tqdm

from sdp.logging import logger
from sdp.processors.base_processor import BaseProcessor
from sdp.utils.common import load_manifest, save_manifest

import torch
import torchaudio
import torchaudio.functional as F
from torchaudio.pipelines import SQUIM_OBJECTIVE


class TorchSquimObjectiveQualityMetricsProcessor(BaseProcessor):
    """This processor calculates Squim objective quality metrics for audio files.

    It uses a pre-trained Squim model to calculate audio quality metrics such as
    PESQ, STOI and SI-SDR for each audio segment in the manifest:

    PESQ (Perceptual Evaluation of Speech Quality)
        A measure of overall speech quality (originally designed to detect codec
        distortions, but highly correlated with many other kinds of distortion).

    STOI (Short-Time Objective Intelligibility)
        A measure of speech intelligibility; essentially, it measures the
        integrity of the speech envelope. A STOI value of 1.0 means that, on
        average, 100% of the evaluated speech is intelligible.

    SI-SDR (Scale-Invariant Signal-to-Distortion Ratio)
        A measure, in decibels, of how strong the speech signal is relative to
        all the distortion present in the audio. 0 dB means the energies of
        speech and distortion are equal; values between 15 and 20 dB are
        generally considered "clean enough" speech.

    Args:
        device (str, Optional): Device to run the model on. Defaults to "cuda".

    Returns:
        The same data as in the input manifest, but with quality metrics added
        to each segment's ``metrics`` field.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.tts.metrics.TorchSquimObjectiveQualityMetricsProcessor
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_squim.json
    """

    def __init__(self, device: str = "cuda", **kwargs):
        super().__init__(**kwargs)
        if not torch.cuda.is_available():
            device = "cpu"
            logger.warning("CUDA is not available, using CPU")
        self.device = device
        self.model = SQUIM_OBJECTIVE.get_model().to(self.device)

    def process(self):
        manifest = load_manifest(self.input_manifest_file)
        results = []
        for metadata in tqdm(manifest):
            info = torchaudio.info(metadata['resampled_audio_filepath'])
            sr = info.sample_rate
            try:
                audio, _ = librosa.load(path=metadata['resampled_audio_filepath'], sr=sr)
            except Exception as ex:
                logger.info(f"Failed to load {metadata['resampled_audio_filepath']}, exception={ex}")
                continue
            for segment in metadata["segments"]:
                # skip segments with empty text or without an assigned speaker
                if ("text" in segment and segment["text"].strip() == "") or (segment["speaker"] == "no-speaker"):
                    continue
                start = math.floor(segment["start"] * sr)
                end = math.floor(segment["end"] * sr)
                num_samples = end - start
                y = torch.from_numpy(audio[start:end])
                # add a batch dimension, otherwise the model throws an input size error
                y = torch.unsqueeze(y, dim=0)
                # the Squim model expects 16 kHz input
                if sr != 16000:
                    y = F.resample(y, sr, 16000)
                try:
                    with torch.no_grad():
                        stoi_hyp, pesq_hyp, si_sdr_hyp = self.model(y.to(self.device))
                    metrics = segment.get('metrics', {})
                    metrics['pesq_squim'] = round(pesq_hyp.item(), 3)
                    metrics['stoi_squim'] = round(stoi_hyp.item(), 3)
                    metrics['sisdr_squim'] = round(si_sdr_hyp.item(), 3)
                    segment['metrics'] = metrics
                except Exception:
                    torch.cuda.empty_cache()
                    logger.info(
                        'Failed to extract Squim metrics for {} with frame_offset={} and num_frames={}'.format(
                            metadata['resampled_audio_filepath'], start, num_samples
                        )
                    )
                    continue
            results.append(metadata)
        save_manifest(results, self.output_manifest_file)
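

# A minimal usage sketch, not part of the processor above: it shows how the
# torchaudio SQUIM_OBJECTIVE pipeline (already imported at the top of this
# module) scores a single clip. The path "speech.wav" is a hypothetical
# placeholder; the (stoi, pesq, si_sdr) output order matches the model call
# in process() above.
def _demo_squim_on_file(path: str = "speech.wav") -> dict:
    model = SQUIM_OBJECTIVE.get_model()
    waveform, sr = torchaudio.load(path)  # shape: (channels, time)
    waveform = waveform[:1]  # keep one channel; the model expects (batch, time)
    if sr != SQUIM_OBJECTIVE.sample_rate:  # the pipeline expects 16 kHz input
        waveform = F.resample(waveform, sr, SQUIM_OBJECTIVE.sample_rate)
    with torch.no_grad():
        stoi, pesq, si_sdr = model(waveform)
    return {
        "stoi_squim": round(stoi.item(), 3),
        "pesq_squim": round(pesq.item(), 3),
        "sisdr_squim": round(si_sdr.item(), 3),
    }
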
class BandwidthEstimationProcessor(BaseProcessor):
    """This processor estimates audio bandwidth by analyzing power spectra.

    It analyzes audio files to estimate their effective bandwidth by examining
    the power spectrum and finding the highest frequency whose energy content
    stays above a threshold relative to the spectral peak.

    Args:
        n_fft (int, Optional): Size of the FFT window. Defaults to 512.
        stride_seconds (float, Optional): Time between successive FFT windows in seconds. Defaults to 0.01.
        top_db (float, Optional): Maximum decibel value for power spectrum normalization. Defaults to 100.0.
        frequency_threshold (float, Optional): Threshold in dB below the peak used for bandwidth estimation. Defaults to -50.0.

    Returns:
        The same data as in the input manifest, but with bandwidth estimates
        added to each segment.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.tts.metrics.BandwidthEstimationProcessor
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_with_bandwidth.json
    """

    def __init__(
        self,
        n_fft: int = 512,
        stride_seconds: float = 0.01,
        top_db: float = 100.0,
        frequency_threshold: float = -50.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.n_fft = n_fft
        self.stride_seconds = stride_seconds
        self.top_db = top_db
        self.frequency_threshold = frequency_threshold

    def _estimate_bandwidth(self, audio, sample_rate):
        """Estimates the bandwidth of an audio signal.

        This method calculates the power spectrogram of the audio signal and
        determines the bandwidth as the highest frequency bin whose power is
        within ``frequency_threshold`` dB of the spectral peak.

        Args:
            audio (np.ndarray): The audio signal to estimate the bandwidth of
            sample_rate (int): The sample rate of the audio signal

        Returns:
            float: The estimated bandwidth of the audio signal in Hz
        """
        hop_length = int(sample_rate * self.stride_seconds)
        # calculate the power spectrogram; use a Blackman-Harris window to
        # significantly reduce spectral leakage (lower sidelobe level)
        spec = librosa.stft(y=audio, n_fft=self.n_fft, hop_length=hop_length, window="blackmanharris")
        power_spec = np.abs(spec) ** 2
        power_spec = np.mean(power_spec, axis=1)
        power_spec = librosa.power_to_db(power_spec, ref=self.n_fft, top_db=self.top_db)

        bandwidth = 0
        peak = np.max(power_spec)
        freq_width = sample_rate / self.n_fft
        # scan from the highest frequency bin downward to find the first bin
        # whose power is within the threshold of the peak
        for idx in range(len(power_spec) - 1, -1, -1):
            if power_spec[idx] - peak > self.frequency_threshold:
                bandwidth = idx * freq_width
                break
        return bandwidth

    def process(self):
        manifest = load_manifest(self.input_manifest_file)
        results = []
        for metadata in tqdm(manifest):
            audio_filepath = metadata['audio_filepath']
            try:
                audio, sample_rate = librosa.load(path=audio_filepath, sr=None)
            except Exception as ex:
                logger.info(f"Failed to load {audio_filepath}, exception={ex}")
                continue
            for segment in metadata['segments']:
                # skip segments with empty text or without an assigned speaker
                if ("text" in segment and segment["text"].strip() == "") or (segment["speaker"] == "no-speaker"):
                    continue
                start = segment['start']
                end = segment['end']
                audio_segment = audio[int(start * sample_rate):int(end * sample_rate)]
                bandwidth = self._estimate_bandwidth(audio=audio_segment, sample_rate=sample_rate)
                metrics = segment.get('metrics', {})
                metrics['bandwidth'] = int(bandwidth)
                segment['metrics'] = metrics
            results.append(metadata)
        save_manifest(results, self.output_manifest_file)
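

# A minimal sanity-check sketch, not part of the processor above: on a
# synthetic 2 kHz tone sampled at 16 kHz, _estimate_bandwidth should report a
# value near 2000 Hz (quantized to the FFT bin grid of sample_rate / n_fft =
# 31.25 Hz and widened slightly by the window's main lobe). This assumes
# BaseProcessor accepts None for the manifest paths at construction time;
# all parameter values here are illustrative.
def _demo_bandwidth_on_tone(sample_rate: int = 16000, tone_hz: float = 2000.0) -> float:
    t = np.arange(sample_rate, dtype=np.float32) / sample_rate  # 1 second of samples
    audio = np.sin(2 * np.pi * tone_hz * t).astype(np.float32)
    proc = BandwidthEstimationProcessor(
        output_manifest_file=None,  # assumed-permissible stand-in, no I/O happens here
        input_manifest_file=None,
    )
    return proc._estimate_bandwidth(audio=audio, sample_rate=sample_rate)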