Source code for sdp.processors.tts.split

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sdp.processors.base_processor import BaseProcessor, DataEntry
import json
import os
import torchaudio
import math
from copy import deepcopy
from sdp.utils.common import load_manifest, save_manifest

class SplitLongAudio(BaseProcessor):
    """This processor splits long audio files into smaller segments.

    It processes audio files that exceed a specified maximum length by
    splitting them into smaller segments at natural pauses in the audio
    to maintain speech coherence.

    Args:
        suggested_max_len (float): Target maximum length for audio segments, in seconds. Defaults to 3600.
        min_pause_len (float): Minimum length of a pause to consider for splitting, in seconds. Defaults to 1.0.
        min_len (float): Minimum length for any split segment, in seconds. Defaults to 1.0.

    Returns:
        The same data as in the input manifest, but with long audio files
        split into multiple segments with updated paths and durations.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.tts.split.SplitLongAudio
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_split.json
              suggested_max_len: 3600
    """

    def __init__(self,
                 suggested_max_len: float = 3600,
                 min_pause_len: float = 1.0,
                 min_len: float = 1.0,
                 **kwargs):
        super().__init__(**kwargs)
        self.suggested_max_len = suggested_max_len
        self.min_pause_len = min_pause_len
        self.min_len = min_len

    def process(self):
        """Process the input manifest to split long audio files into smaller segments.

        This method:
            1. Reads the input manifest
            2. For each audio file longer than suggested_max_len:
                - Identifies suitable pause points for splitting
                - Creates new audio files for each segment
                - Updates metadata for each split segment
            3. Saves the results to the output manifest

        The output manifest includes:
            - Original entries for audio files shorter than suggested_max_len
            - Split entries with updated paths and durations
            - Meta-entries containing split information for later joining
        """
        manifest = load_manifest(self.input_manifest_file)
        results = []
        for metadata in manifest:
            if metadata['duration'] < self.suggested_max_len:
                metadata['split_filepaths'] = None
                results.append(metadata)
                continue

            # Collect split timestamps at segment boundaries: whenever the
            # running length exceeds suggested_max_len, cut at the end of the
            # previous segment so the split falls in a pause between segments.
            splits = []
            split_start = 0
            prev_end = 0
            for segment in metadata['segments']:
                start = segment['start']
                end = segment['end']
                # Turn and pause durations are computed here but not currently
                # used in the split decision
                turn_duration = end - start
                pause_duration = start - prev_end
                if end - split_start > self.suggested_max_len:
                    # Cut at the end of the previous segment
                    splits.append(prev_end)
                    split_start = prev_end
                # Update the previous end time
                prev_end = end
            metadata['split_timestamps'] = splits

            # Now that we have all split points, write the split wav files
            audio, sr = torchaudio.load(metadata['resampled_audio_filepath'])
            path, filename = os.path.split(metadata['resampled_audio_filepath'])
            split_start = 0
            split_filepaths = []
            actual_splits = []
            split_durations = []
            for k, split in enumerate(splits):
                split_filepath = os.path.join(path, filename[:-4] + '.{}_of_{}.wav'.format(k + 1, 1 + len(splits)))
                split_end = math.ceil(split * sr)
                if split_end - split_start > self.min_len * sr:
                    torchaudio.save(split_filepath, audio[:, split_start:split_end], sr)
                    split_filepaths.append(split_filepath)
                    actual_splits.append(split_start / sr)
                    split_durations.append((split_end - split_start) / sr)
                split_start = split_end

            # Write the last split, skipping it if it is too short or
            # longer than the allowed maximum
            split_filepath = os.path.join(path, filename[:-4] + '.{}_of_{}.wav'.format(1 + len(splits), 1 + len(splits)))
            last_frame = len(audio[0]) - 1
            if last_frame - split_start > self.min_len * sr and last_frame - split_start < (self.suggested_max_len + 1) * sr:
                torchaudio.save(split_filepath, audio[:, split_start:], sr)
                split_filepaths.append(split_filepath)
                split_durations.append((last_frame - split_start) / sr)
                actual_splits.append(split_start / sr)

            # Add one entry per split file: these entries have no
            # 'split_filepaths' field, and 'resampled_audio_filepath' is
            # replaced with the corresponding split
            for idx, split in enumerate(split_filepaths):
                split_metadata = deepcopy(metadata)
                split_metadata['resampled_audio_filepath'] = split
                split_metadata['duration'] = split_durations[idx]
                results.append(split_metadata)

            # We keep an entry with 'split_filepaths' in it as a meta-entry
            # to be used when joining the metadata back together
            assert len(split_filepaths) == len(actual_splits)
            metadata['split_filepaths'] = split_filepaths
            metadata['split_offsets'] = actual_splits
            results.append(metadata)

        save_manifest(results, self.output_manifest_file)
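SplitLongAudio communicates with JoinSplitAudioMetadata (defined below) through the ``split_filepaths`` and ``split_offsets`` fields. As a rough sketch of that handshake (paths and values below are illustrative, not taken from a real run), a long recording produces several per-split entries plus one meta-entry along these lines:

.. code-block:: python

    # Per-split entry: a deep copy of the original metadata whose audio path
    # points at the split file and whose 'duration' has been updated.
    split_entry = {
        "resampled_audio_filepath": "audio/talk.1_of_2.wav",  # illustrative path
        "duration": 3598.7,
    }

    # Meta-entry: keeps the split bookkeeping so JoinSplitAudioMetadata can
    # later shift alignments by 'split_offsets' and re-join the transcripts.
    meta_entry = {
        "resampled_audio_filepath": "audio/talk.wav",
        "duration": 7200.0,
        "split_filepaths": ["audio/talk.1_of_2.wav", "audio/talk.2_of_2.wav"],
        "split_offsets": [0.0, 3598.7],  # split start times in the original file, in seconds
    }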
class JoinSplitAudioMetadata(BaseProcessor):
    """A processor for joining the metadata of previously split audio files.

    This processor combines the metadata (transcripts and alignments) of audio
    files that were previously split by the SplitLongAudio processor. It
    adjusts timestamps and concatenates transcripts to recreate the original
    audio's metadata.

    Args:
        None

    Returns:
        The same data as in the input manifest, but with split audio files
        joined together.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def process(self):
        """Process the input manifest to join the metadata of split audio files.

        This method:
            1. Reads the input manifest
            2. Identifies meta-entries containing split information
            3. For each meta-entry:
                - Concatenates transcripts from all splits
                - Adjusts alignment timestamps based on split offsets
                - Creates a single combined metadata entry
            4. Saves the results to the output manifest

        The output manifest contains:
            - Original entries for unsplit audio files
            - Combined entries for previously split audio files
        """
        manifest = load_manifest(self.input_manifest_file)
        fp_w = open(self.output_manifest_file, 'w')
        meta_entries = []
        for metadata in manifest:
            if 'split_filepaths' in metadata:
                meta_entries.append(metadata)
        for meta_entry in meta_entries:
            transcripts = []
            alignments = []
            # Entries that were never split are written out unchanged
            if meta_entry['split_filepaths'] is None:
                del meta_entry['split_filepaths']
                fp_w.write(f"{json.dumps(meta_entry)}\n")
                continue
            # Find all parts of the original audio and shift their word
            # alignments by the offset of the corresponding split
            for idx, split in enumerate(meta_entry['split_filepaths']):
                entry = next(filter(lambda x: x['resampled_audio_filepath'] == split, manifest))
                transcripts.append(entry['text'])
                alignment = entry['alignment']
                for word in alignment:
                    word['start'] += meta_entry['split_offsets'][idx]
                    word['end'] += meta_entry['split_offsets'][idx]
                alignments += alignment
            # Concatenate the transcripts and alignments together
            meta_entry['text'] = ' '.join(transcripts)
            meta_entry['alignment'] = alignments
            # Remove the 'split_filepaths' field from the meta-entry to turn it
            # into a real entry
            del meta_entry['split_filepaths']
            fp_w.write(f"{json.dumps(meta_entry)}\n")
        fp_w.close()
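Taken together, the two processors are meant to bracket whatever transcription or alignment steps run on the split files. A minimal end-to-end sketch, assuming hypothetical manifest paths and an intermediate pipeline that fills in ``text`` and ``alignment`` for each split entry:

.. code-block:: python

    splitter = SplitLongAudio(
        input_manifest_file="manifest.json",          # hypothetical path
        output_manifest_file="manifest_split.json",   # hypothetical path
        suggested_max_len=3600,
    )
    splitter.process()

    # ... run ASR / forced alignment on the split audio so that every split
    # entry gains 'text' and 'alignment' fields ...

    joiner = JoinSplitAudioMetadata(
        input_manifest_file="manifest_aligned.json",  # hypothetical intermediate manifest
        output_manifest_file="manifest_joined.json",  # hypothetical path
    )
    joiner.process()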