# Source code for sdp.processors.datasets.ytc.create_initial_manifest

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import subprocess

from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import load_manifest


# [docs]
class CreateInitialManifestYTC(BaseParallelProcessor):
    """A processor class for creating initial manifest files for a TTS dataset.

    It takes a manifest file containing audio file paths and resamples them to a target
    sample rate, channel count and format, while creating a new manifest file that
    additionally records the resampled file paths.

    Each input manifest entry must provide the keys ``audio_filepath`` (path of the
    source audio) and ``audio_item_id`` (used as the output file name stem).

    Args:
        input_format (str): Format of the input audio files. Stored on the instance;
            the conversion tool is selected from the file extension of each entry.
        resampled_audio_dir (str): Directory where resampled audio files will be saved
        target_sample_rate (int): Desired sample rate for the output audio files
        target_format (str): Desired format (file extension, without the dot) for the
            output audio files
        target_nchannels (int): Desired number of channels for the output audio files

    Returns:
        The same data as in the input manifest, with the additional fields
        ``resampled_audio_filepath`` and (when it can be determined) ``duration``.
        Entries whose audio could not be converted are dropped.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_resampled.json
              input_format: wav
              resampled_audio_dir: ${workspace_dir}/audio_resampled
              target_sample_rate: 16000
              target_format: wav
              target_nchannels: 1
    """

    def __init__(
            self,
            input_format: str,
            resampled_audio_dir: str,
            target_sample_rate: int,
            target_format: str,
            target_nchannels: int,
            **kwargs
        ):
        super().__init__(**kwargs)
        self.input_format = input_format
        self.resampled_audio_dir = resampled_audio_dir
        self.target_sample_rate = target_sample_rate
        self.target_format = target_format
        self.target_nchannels = target_nchannels

    def prepare(self):
        """Creates the output directory for resampled audio files if it doesn't exist."""
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def read_manifest(self):
        """Reads metadata from the JSONL file given as the input manifest.

        Returns:
            list: A list of dataset entries parsed from the JSONL manifest file
        """
        return load_manifest(self.input_manifest_file, encoding="utf8")

    def _build_conversion_command(self, input_audio_path: str, output_audio_path: str) -> list:
        """Builds the sox/ffmpeg argument list that converts one audio file.

        The command is returned as a list (not a shell string) so that paths containing
        spaces, quotes or shell metacharacters are passed to the tool verbatim.
        """
        sample_rate = str(self.target_sample_rate)
        nchannels = str(self.target_nchannels)

        if input_audio_path.lower().endswith(".wav"):
            return [
                "sox", "--no-dither", "-V1", input_audio_path,
                "-r", sample_rate, "-c", nchannels, "-b", "16",
                output_audio_path,
            ]

        # NOTE(review): ffmpeg's "-ab" sets the audio *bitrate*, not the sample bit
        # depth; the value is kept unchanged from the original command -- confirm
        # whether a 16-bit sample format was intended here.
        return [
            "ffmpeg", "-i", input_audio_path,
            "-ar", sample_rate, "-ac", nchannels, "-ab", "16",
            output_audio_path, "-v", "error",
        ]

    def process_dataset_entry(self, metadata: DataEntry):
        """Processes a single dataset entry by resampling the audio file and updating metadata.

        Args:
            metadata (DataEntry): The metadata entry containing information about the audio
                file. It is indexed by the keys ``audio_filepath`` and ``audio_item_id``.

        Returns:
            list[DataEntry]: A list containing the processed DataEntry with updated metadata,
            or an empty list if the audio conversion failed (the entry is skipped).

        Raises:
            FileNotFoundError: If the ``sox`` / ``ffmpeg`` executable is not installed.

        Note:
            This method:
            1. Converts the audio file to the target format, sample rate and channel count,
               unless the output file already exists
            2. Updates the metadata with the resampled file path and duration
            3. Uses sox for ``.wav`` inputs and ffmpeg for everything else
        """
        # Imported lazily so the module can be imported without soundfile installed.
        import soundfile as sf

        input_audio_path = metadata['audio_filepath']
        output_audio_path = os.path.join(
            self.resampled_audio_dir, metadata['audio_item_id'] + '.' + self.target_format
        )

        # Convert audio file to target sample rate and format (skipped if already done)
        if not os.path.exists(output_audio_path):
            cmd = self._build_conversion_command(input_audio_path, output_audio_path)
            try:
                subprocess.run(
                    cmd,
                    check=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,  # Ensures captured output is decoded to str
                )
            except subprocess.CalledProcessError as e:
                print("Exception occurred while converting audio file: ", e, e.stderr)
                print(f'Error converting {input_audio_path} to {output_audio_path}. Hence skipping this entry.')
                # Remove a possibly truncated output so that a later run does not
                # mistake it for a finished conversion.
                if os.path.exists(output_audio_path):
                    try:
                        os.remove(output_audio_path)
                    except OSError:
                        pass
                # Returning no entries drops this item instead of killing the worker.
                return []

        metadata['audio_filepath'] = input_audio_path
        metadata['resampled_audio_filepath'] = output_audio_path
        try:
            metadata['duration'] = sf.info(output_audio_path).duration
        except Exception as e:
            # Best effort: the entry is still emitted, just without a duration field.
            print(f'Error getting duration of {output_audio_path}. Hence not adding duration to metadata.', e)

        return [DataEntry(data=metadata)]