Source code for sdp.processors.datasets.hifitts2.download_dataset

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import librosa
from pathlib import Path
import soundfile as sf
import time
import urllib.error
import urllib.request

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry


class DownloadHiFiTTS2(BaseParallelProcessor):
    """
    Downloads HiFiTTS-2 dataset to local machine. Unsegmented audiobook chapters are first downloaded at 48 kHz
    from LibriVox. Each chapter is then split into segmented utterance files based on precomputed offsets and
    durations.

    To reduce disk use, the chapter files can be optionally deleted after they are segmented.

    Metadata for chapters which fail to download due to network errors are stored in an output manifest file,
    which can be given as input to this processor to attempt the downloads again.

    Args:
        audio_dir (str): Root directory where utterance files will be saved.
        chapter_dir (str): Root directory where audiobook chapter files will be saved.
        sample_rate (int): Sample rate to use for utterance files.
        delete_chapter_files (bool): Whether to delete each chapter file after it is done being processed.
        exit_on_error (bool): Whether to terminate the entire processor script if a single chapter download fails.
        num_retries (int): Number of download attempts made per chapter before giving up on intermittent
            HTTP errors. At least one attempt is always made.

    Returns:
        Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'.

        If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to
        download, with error information stored under the 'error_reason' field and, when the server answered
        with an HTTP error status, the 'error_code' field.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.DownloadHiFiTTS2
              input_manifest_file: ${workspace_dir}/manifest_22khz.json
              output_manifest_file: ${workspace_dir}/errors_22khz.json
              audio_dir: ${workspace_dir}/audio_22khz
              chapter_dir: ${workspace_dir}/chapters
              max_workers: 8
    """

    def __init__(
        self,
        audio_dir: str,
        chapter_dir: str,
        sample_rate: int = 22050,
        delete_chapter_files: bool = True,
        exit_on_error: bool = False,
        num_retries: int = 5,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_dir = Path(audio_dir)
        self.chapter_dir = Path(chapter_dir)
        self.sample_rate = sample_rate
        self.delete_chapter_files = delete_chapter_files
        self.exit_on_error = exit_on_error
        self.num_retries = num_retries

    def prepare(self):
        """Create the output directory structure for every chapter listed in the input manifest.

        The relative directory is taken from the first utterance of each manifest row and is created
        under both ``audio_dir`` and ``chapter_dir``.
        """
        relative_dirs = set()
        with open(self.input_manifest_file, "rt", encoding="utf-8") as fin:
            for line in fin:
                row = json.loads(line)
                audio_filepath = Path(row["utterances"][0]["audio_filepath"])
                relative_dirs.add(audio_filepath.parent)

        for relative_dir in relative_dirs:
            (self.audio_dir / relative_dir).mkdir(exist_ok=True, parents=True)
            (self.chapter_dir / relative_dir).mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _make_error_entry(data_entry, error_reason, error_code=None):
        """Build the manifest entry recorded for a chapter that could not be processed.

        The entry keeps every field ``process_dataset_entry`` reads ('url', 'chapter_filepath',
        'utterances', 'duration') so that the error manifest can be used as an input manifest.
        """
        error_data = {
            "url": data_entry["url"],
            "chapter_filepath": data_entry["chapter_filepath"],
        }
        # 'duration' is required by the duration check on a later retry of this entry.
        if "duration" in data_entry:
            error_data["duration"] = data_entry["duration"]
        if error_code is not None:
            error_data["error_code"] = error_code
        error_data["error_reason"] = error_reason
        error_data["utterances"] = data_entry["utterances"]
        return DataEntry(data=error_data)

    def process_dataset_entry(self, data_entry):
        """Download one chapter and split it into utterance files.

        Args:
            data_entry (dict): Manifest row with the keys 'url', 'chapter_filepath', 'duration' and
                'utterances'; each utterance has 'audio_filepath', 'offset' and 'duration'.

        Returns:
            An empty list when the chapter was processed successfully, otherwise a single-element list
            with a ``DataEntry`` describing the failure.

        Raises:
            RuntimeError: If ``exit_on_error`` is set and the download fails on the last attempt or the
                downloaded chapter duration differs from the expected one.
        """
        url = data_entry["url"]
        chapter_filepath = data_entry["chapter_filepath"]
        utterances = data_entry["utterances"]
        chapter_path = self.chapter_dir / chapter_filepath

        # Always try at least once, otherwise a non-positive num_retries would skip the download and
        # fail later when loading a file that was never fetched.
        num_attempts = max(1, self.num_retries)
        for attempt in range(1, num_attempts + 1):
            try:
                urllib.request.urlretrieve(url=url, filename=chapter_path)
                break
            except Exception as ex:
                error_msg = f"Encountered exception when downloading {url}: {ex}"
                logger.warning(error_msg)

                if attempt < num_attempts:
                    logger.info(f"Retry {attempt} for url {url}")
                    time.sleep(10)
                    continue

                if self.exit_on_error:
                    raise RuntimeError(error_msg) from ex

                error_code = ex.code if isinstance(ex, urllib.error.HTTPError) else None
                if isinstance(ex, urllib.error.URLError):
                    # URLError.reason may be another exception instance, which cannot be written to
                    # a JSON manifest; store its string form instead.
                    error_reason = str(ex.reason)
                else:
                    error_reason = repr(ex)

                return [self._make_error_entry(data_entry, error_reason, error_code)]

        chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate)
        chapter_duration = librosa.get_duration(y=chapter_audio, sr=sr)
        original_duration = data_entry["duration"]
        duration_diff = abs(chapter_duration - original_duration)
        # The utterance offsets are only valid for the exact recording they were computed on.
        if duration_diff > 0.1:
            error_msg = (
                f"Duration mismatch for {url}: original duration={original_duration}; "
                f"downloaded duration={round(chapter_duration, 2)}"
            )
            logger.warning(error_msg)

            if self.exit_on_error:
                raise RuntimeError(error_msg)

            return [self._make_error_entry(data_entry, error_msg)]

        for utt in utterances:
            audio_path = self.audio_dir / utt["audio_filepath"]
            offset = utt["offset"]
            dur = utt["duration"]
            start_sample = librosa.time_to_samples(offset, sr=sr)
            end_sample = librosa.time_to_samples(offset + dur, sr=sr)
            audio = chapter_audio[start_sample:end_sample]
            sf.write(file=audio_path, data=audio, samplerate=int(sr))

        if self.delete_chapter_files:
            chapter_path.unlink()

        return []