Source code for sdp.processors.manage_files.convert_audio

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Optional
from sox import Transformer

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry

from sdp.utils.common import ffmpeg_convert


class FfmpegConvert(BaseParallelProcessor):
    """
    Processor for converting video or audio files to audio using FFmpeg and updating
    the dataset with the path to the converted audio.

    The output file path is built as follows:

    * If ``id_key`` is set, the path is
      ``<converted_audio_dir>/<value of id_key>.<output_format>``.
    * If ``id_key`` is not set, the path is
      ``<converted_audio_dir>/<input file name without extension>.<output_format>``.
      When ``base_dir`` is also set, the directory of the input file relative to
      ``base_dir`` is inserted between ``converted_audio_dir`` and the file name.

    .. note::
        ``id_key`` can be used to create subdirectories inside ``converted_audio_dir``
        (by using forward slashes ``/``). e.g. if the ``id_key`` value takes the form
        ``dir_name1/dir_name2/filename``, the output file path will be
        ``<converted_audio_dir>/dir_name1/dir_name2/filename.wav``.

    Args:
        converted_audio_dir (str): The directory to store the converted audio files.
        input_file_key (str): The field in the dataset representing the path to the
            input video or audio files.
        output_file_key (str): The field in the dataset that will receive the path to
            the converted audio file.
        id_key (str): (Optional) The field in the dataset representing the unique ID or
            identifier for each entry. When set, its value is used as the output file
            name (without extension). Defaults to None.
        output_format (str): (Optional) Format of the output audio files. Only ``wav``
            is accepted by :meth:`prepare`. Defaults to ``wav``.
        base_dir (str): (Optional) Only used when ``id_key`` is not set. The input
            file's directory relative to ``base_dir`` is recreated inside
            ``converted_audio_dir``. Defaults to None.
        target_samplerate (int): (Optional) The target sampling rate passed to
            ``ffmpeg_convert``. Defaults to 16000.
        target_nchannels (int): (Optional) The target number of channels passed to
            ``ffmpeg_convert``. Defaults to 1.
        **kwargs: Additional keyword arguments to be passed to the base class
            `BaseParallelProcessor`.
    """

    def __init__(
        self,
        converted_audio_dir: str,
        input_file_key: str,
        output_file_key: str,
        id_key: Optional[str] = None,
        output_format: str = "wav",
        base_dir: Optional[str] = None,
        target_samplerate: int = 16000,
        target_nchannels: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.converted_audio_dir = converted_audio_dir
        self.input_file_key = input_file_key
        self.output_file_key = output_file_key
        self.output_format = output_format
        self.id_key = id_key
        self.base_dir = base_dir
        self.target_samplerate = target_samplerate
        self.target_nchannels = target_nchannels

    def prepare(self):
        """Check the requested output format and create the output directory."""
        assert self.output_format == "wav", "Currently only wav format is supported"
        os.makedirs(self.converted_audio_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        """Convert the file referenced by one entry and record the output path.

        The conversion is skipped when the output file already exists, so
        re-running the processor does not redo finished work.

        Args:
            data_entry (dict): Dataset entry; must contain ``input_file_key`` and,
                when ``id_key`` is set, that key as well. It is updated in place.

        Returns:
            list: A single-element list holding a ``DataEntry`` that wraps the
            updated ``data_entry``.
        """
        input_file = data_entry[self.input_file_key]

        if self.id_key:
            key = data_entry[self.id_key]
            # Everything before the last "/" in the id is treated as nested
            # subdirectories that must exist before the file is written.
            os.makedirs(os.path.join(self.converted_audio_dir, *key.split("/")[:-1]), exist_ok=True)
        else:
            # Input file name without its directory and extension.
            key = os.path.basename(os.path.splitext(input_file)[0])
            if self.base_dir:
                # Mirror the input's directory layout (relative to base_dir)
                # under converted_audio_dir.
                new_dir = os.path.dirname(os.path.relpath(input_file, self.base_dir))
                os.makedirs(os.path.join(self.converted_audio_dir, new_dir), exist_ok=True)
                key = os.path.join(new_dir, key)

        audio_file = os.path.join(self.converted_audio_dir, key) + "." + self.output_format

        if not os.path.isfile(audio_file):
            ffmpeg_convert(input_file, audio_file, self.target_samplerate, self.target_nchannels)

        data_entry[self.output_file_key] = audio_file
        return [DataEntry(data=data_entry)]
class SoxConvert(BaseParallelProcessor):
    """Processor that uses Sox to convert audio files to a specified format.

    The converted file is written to
    ``<converted_audio_dir>/<input file name without extension>.<output_format>``
    and the entry's ``output_audio_file_key`` field is set to that path.

    Args:
        converted_audio_dir (str): Path to the directory where the converted audio
            files will be stored.
        input_audio_file_key (str): Key in the manifest file that contains the path
            to the input audio file. Defaults to ``"audio_filepath"``.
        output_audio_file_key (str): Key in the manifest file that will contain the
            path to the output audio file. Defaults to ``"audio_filepath"``.
        output_format (str): Format (file extension) of the output audio file.
            Defaults to ``"wav"``.
        rate (int): Sample rate of the output audio file. Defaults to 16000.
        channels (int): Number of channels of the output audio file. Defaults to 1.
        workspace_dir (str, Optional): Path to the workspace directory. When not
            None, it is joined in front of each input audio path before the file
            is read. Defaults to None.
        **kwargs: Additional keyword arguments passed to the base class
            `BaseParallelProcessor` (e.g. ``output_manifest_file``, the path to
            the output manifest file).
    """

    def __init__(
        self,
        converted_audio_dir: str,
        input_audio_file_key: str = "audio_filepath",
        output_audio_file_key: str = "audio_filepath",
        output_format: str = "wav",
        rate: int = 16000,
        channels: int = 1,
        workspace_dir: Optional[str] = None,
        **kwargs,
    ):
        # ``workspace_dir`` is a named parameter, so Python binds it here and it
        # can never end up in ``kwargs``; nothing has to be stripped before the
        # remaining keyword arguments are forwarded to the base class.
        super().__init__(**kwargs)
        self.input_audio_file_key = input_audio_file_key
        self.output_audio_file_key = output_audio_file_key
        self.converted_audio_dir = converted_audio_dir
        self.output_format = output_format
        self.workspace_dir = workspace_dir
        self.rate = rate
        self.channels = channels

    def prepare(self):
        """Log the configured workspace directory and create the output directory."""
        logger.info(f"SoxConvert workspace_dir: {self.workspace_dir}")
        os.makedirs(self.converted_audio_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        """Convert the audio file referenced by one entry and record the output path.

        The conversion is skipped when the output file already exists.

        Args:
            data_entry (dict): Manifest entry; must contain ``input_audio_file_key``.
                It is updated in place.

        Returns:
            list: A single-element list holding a ``DataEntry`` that wraps the
            updated ``data_entry``.
        """
        audio_path = data_entry[self.input_audio_file_key]

        # Resolve the manifest path against the workspace directory when one is set.
        if self.workspace_dir is not None:
            full_audio_path = os.path.join(self.workspace_dir, audio_path)
        else:
            full_audio_path = audio_path

        # Log path diagnostics only for the first entry seen by this instance.
        if not hasattr(self, '_debug_printed'):
            logger.info(f"First audio_path from manifest: {audio_path}")
            logger.info(f"First full_audio_path: {full_audio_path}")
            logger.info(f"Path exists: {os.path.exists(full_audio_path)}")
            self._debug_printed = True

        # Output name is the input file name without directory and extension.
        key = os.path.basename(os.path.splitext(audio_path)[0])
        converted_file = os.path.join(self.converted_audio_dir, key) + f".{self.output_format}"

        if not os.path.isfile(converted_file):
            transformer = Transformer()
            transformer.rate(self.rate)
            transformer.channels(self.channels)
            transformer.build(full_audio_path, converted_file)

        data_entry[self.output_audio_file_key] = converted_file
        return [DataEntry(data=data_entry)]