# Source code for sdp.processors.datasets.ytc.create_initial_manifest

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import subprocess

from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import load_manifest

class CreateInitialManifestYTC(BaseParallelProcessor):
    """A processor class for creating initial manifest files for a TTS dataset.

    It takes a manifest file containing audio file paths and resamples them to a
    target sample rate and format, while creating a new manifest file with the
    updated paths.

    Args:
        input_format (str): Format of the input audio files
        resampled_audio_dir (str): Directory where resampled audio files will be saved
        target_sample_rate (int): Desired sample rate for the output audio files
        target_format (str): Desired format for the output audio files
        target_nchannels (int): Desired number of channels for the output audio files

    Returns:
        The same data as in the input manifest, but with resampled audio files
        and updated audio file paths. Entries whose audio could not be converted
        are dropped from the output.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.datasets.ytc.create_initial_manifest.CreateInitialManifestYTC
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_resampled.json
    """

    def __init__(
        self,
        input_format: str,
        resampled_audio_dir: str,
        target_sample_rate: int,
        target_format: str,
        target_nchannels: int,
        **kwargs
    ):
        super().__init__(**kwargs)
        # Stored for reference; the converter is chosen from the file extension,
        # not from this value.
        self.input_format = input_format
        self.resampled_audio_dir = resampled_audio_dir
        self.target_sample_rate = target_sample_rate
        self.target_format = target_format
        self.target_nchannels = target_nchannels

    def prepare(self):
        """Creates the output directory for resampled audio files if it doesn't exist."""
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def read_manifest(self):
        """Reads metadata from JSONL file in the input manifest.

        Returns:
            list: A list of dataset entries parsed from the JSONL manifest file
        """
        dataset_entries = load_manifest(self.input_manifest_file, encoding="utf8")
        return dataset_entries

    def _build_conversion_command(self, input_audio_path: str, output_audio_path: str) -> list:
        """Builds the argv list that converts one audio file to the target settings.

        ``sox`` is used for ``.wav`` inputs and ``ffmpeg`` for everything else.
        The command is returned as a list so it can be executed without a shell,
        which keeps paths containing quotes, spaces or ``$`` intact.
        """
        sample_rate = str(self.target_sample_rate)
        nchannels = str(self.target_nchannels)

        if input_audio_path.lower().endswith(".wav"):
            return [
                "sox", "--no-dither", "-V1", input_audio_path,
                "-r", sample_rate, "-c", nchannels, "-b", "16",
                output_audio_path,
            ]

        # NOTE(review): ``-ab 16`` sets an audio *bitrate*, not a 16-bit sample
        # width; kept unchanged from the original command -- confirm intent.
        return [
            "ffmpeg", "-i", input_audio_path,
            "-ar", sample_rate, "-ac", nchannels, "-ab", "16",
            output_audio_path, "-v", "error",
        ]

    def process_dataset_entry(self, metadata: DataEntry):
        """Processes a single dataset entry by resampling the audio file and updating metadata.

        Args:
            metadata (DataEntry): The metadata entry containing information about
                the audio file. ``audio_filepath`` and ``audio_item_id`` are read
                from it.

        Returns:
            list[DataEntry]: A list containing the processed DataEntry with
            updated metadata, or an empty list if the audio conversion failed
            (the entry is skipped).

        Note:
            This method:
            1. Resamples the audio file to the target format and sample rate if needed
            2. Updates the metadata with new file paths and duration
            3. Uses either sox or ffmpeg for audio conversion depending on input format
        """
        import soundfile as sf

        input_audio_path = metadata['audio_filepath']
        output_audio_path = os.path.join(
            self.resampled_audio_dir, metadata['audio_item_id'] + '.' + self.target_format
        )

        # Convert audio file to target sample rate and format. An existing
        # output file is treated as already converted and is reused.
        if not os.path.exists(output_audio_path):
            cmd = self._build_conversion_command(input_audio_path, output_audio_path)
            try:
                subprocess.run(
                    cmd,
                    check=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,  # Ensures output is in string format
                )
            except (subprocess.CalledProcessError, OSError) as e:
                # OSError covers a missing sox/ffmpeg binary, which no longer
                # surfaces as a non-zero shell exit status.
                print("Exception occurred while converting audio file: ", e, getattr(e, "stderr", None))
                print(f'Error converting {input_audio_path} to {output_audio_path}. Hence skipping this entry.')
                # Remove any partial output so a later run does not mistake it
                # for a finished conversion.
                try:
                    if os.path.exists(output_audio_path):
                        os.remove(output_audio_path)
                except OSError:
                    pass
                return []

        metadata['audio_filepath'] = input_audio_path
        metadata['resampled_audio_filepath'] = output_audio_path
        try:
            metadata['duration'] = sf.info(output_audio_path).duration
        except Exception:
            # Best effort: a missing duration must not drop the entry.
            print(f'Error getting duration of {output_audio_path}. Hence not adding duration to metadata.')

        return [DataEntry(data=metadata)]