# Source code for sdp.processors.datasets.hifitts2.download_dataset
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import librosa
from pathlib import Path
import soundfile as sf
import time
import urllib.error
import urllib.request
from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
# [docs] -- link marker that appears to be left over from the rendered HTML documentation page
class DownloadHiFiTTS2(BaseParallelProcessor):
    """
    Downloads HiFiTTS-2 dataset to local machine. Unsegmented audiobook chapters are first downloaded at
    48 kHz from LibriVox. Each chapter is then split into segmented utterance files based on precomputed
    offsets and durations.

    To reduce disk use, the chapter files can be optionally deleted after they are segmented.

    Metadata for chapters which fail to download due to network errors are stored in an output manifest file,
    which can be given as input to this processor to attempt the downloads again.

    Args:
        audio_dir (str): Root directory where utterance files will be saved.
        chapter_dir (str): Root directory where audiobook chapter files will be saved.
        sample_rate (int): Sample rate to use for utterance files.
        delete_chapter_files (bool): Whether to delete each chapter file after it is done being processed.
        exit_on_error (bool): Whether to terminate the entire processor script if a single chapter download fails.
        num_retries (int): Maximum number of download attempts per chapter when errors are encountered.
            Values below 1 are treated as 1 so that every chapter is attempted at least once.

    Returns:
        Utterance files are stored under 'audio_dir' and chapter files are downloaded under 'chapter_dir'.

        If exit_on_error is False, then an output manifest will be saved with manifest entries that fail to download,
        with error information stored under the 'error_reason' field and, for HTTP errors, the 'error_code' field.
        Failed entries keep the 'url', 'chapter_filepath', 'utterances' and 'duration' fields so that they can be
        processed again.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.DownloadHiFiTTS2
              input_manifest_file: ${workspace_dir}/manifest_22khz.json
              output_manifest_file: ${workspace_dir}/errors_22khz.json
              audio_dir: ${workspace_dir}/audio_22khz
              chapter_dir: ${workspace_dir}/chapters
              max_workers: 8
    """

    def __init__(
        self,
        audio_dir: str,
        chapter_dir: str,
        sample_rate: int = 22050,
        delete_chapter_files: bool = True,
        exit_on_error: bool = False,
        num_retries: int = 5,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_dir = Path(audio_dir)
        self.chapter_dir = Path(chapter_dir)
        self.sample_rate = sample_rate
        self.delete_chapter_files = delete_chapter_files
        self.exit_on_error = exit_on_error
        self.num_retries = num_retries

    def prepare(self):
        """Create the output directory structure for every entry in the input manifest.

        Directories are derived from the parent of each utterance's 'audio_filepath' (created under both
        'audio_dir' and 'chapter_dir') and from the parent of 'chapter_filepath' (created under 'chapter_dir').
        """
        utterance_dirs = set()
        chapter_dirs = set()
        with open(self.input_manifest_file, "rt", encoding="utf-8") as fin:
            for line in fin:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at the end of the manifest).
                    continue
                row = json.loads(line)
                # Look at every utterance rather than only the first one, so that an empty utterance list
                # does not raise and utterances stored in different directories are all covered.
                for utt in row["utterances"]:
                    utterance_dirs.add(Path(utt["audio_filepath"]).parent)
                # The chapter is downloaded to chapter_dir / chapter_filepath, so its parent must exist too.
                if "chapter_filepath" in row:
                    chapter_dirs.add(Path(row["chapter_filepath"]).parent)

        for relative_dir in utterance_dirs:
            (self.audio_dir / relative_dir).mkdir(exist_ok=True, parents=True)
        for relative_dir in utterance_dirs | chapter_dirs:
            (self.chapter_dir / relative_dir).mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _build_error_entry(data_entry, error_reason, error_code=None):
        """Build the output manifest entry describing a chapter that failed to be processed."""
        error_data = {
            "url": data_entry["url"],
            "chapter_filepath": data_entry["chapter_filepath"],
            "error_reason": error_reason,
            "utterances": data_entry["utterances"],
        }
        # Carry the expected duration over so the error manifest can be used as input for another attempt;
        # process_dataset_entry reads this field after a successful download.
        if "duration" in data_entry:
            error_data["duration"] = data_entry["duration"]
        if error_code is not None:
            error_data["error_code"] = error_code
        return DataEntry(data=error_data)

    def _download_chapter(self, url, chapter_path):
        """Download 'url' to 'chapter_path'; return None on success or the last exception on failure."""
        # Always make at least one attempt, even if num_retries was configured as 0 or negative.
        num_attempts = max(1, self.num_retries)
        last_error = None
        for attempt in range(1, num_attempts + 1):
            try:
                urllib.request.urlretrieve(url=url, filename=chapter_path)
                return None
            except Exception as ex:
                # Deliberately broad: any failure is logged and retried or reported, not left to crash the worker.
                last_error = ex
                logger.warning(f"Encountered exception when downloading {url}: {ex}")
                if attempt < num_attempts:
                    logger.info(f"Retry {attempt} for url {url}")
                    time.sleep(10)
        return last_error

    def process_dataset_entry(self, data_entry):
        """Download one chapter, validate its duration and write its utterance files.

        Args:
            data_entry (dict): Manifest entry with the 'url', 'chapter_filepath', 'utterances' and
                'duration' fields. Each utterance provides 'audio_filepath', 'offset' and 'duration'.

        Returns:
            An empty list on success. If the download fails or the downloaded duration differs from the
            expected one by more than 0.1, a single-element list with a DataEntry describing the failure.

        Raises:
            RuntimeError: If the chapter cannot be processed and 'exit_on_error' is True.
        """
        url = data_entry["url"]
        utterances = data_entry["utterances"]
        chapter_path = self.chapter_dir / data_entry["chapter_filepath"]

        download_error = self._download_chapter(url, chapter_path)
        if download_error is not None:
            if self.exit_on_error:
                error_msg = f"Encountered exception when downloading {url}: {download_error}"
                raise RuntimeError(error_msg) from download_error

            if isinstance(download_error, urllib.error.URLError):
                # URLError.reason may be another exception instance rather than a string; convert it so the
                # entry contains only plain values (presumably the manifest is serialized as JSON).
                error_reason = str(download_error.reason)
            else:
                error_reason = repr(download_error)

            error_code = None
            if isinstance(download_error, urllib.error.HTTPError):
                error_code = download_error.code
            return [self._build_error_entry(data_entry, error_reason, error_code)]

        chapter_audio, sr = librosa.load(path=chapter_path, sr=self.sample_rate)
        chapter_duration = librosa.get_duration(y=chapter_audio, sr=sr)
        original_duration = data_entry["duration"]

        # The utterance offsets are only valid if the downloaded audio matches the expected chapter duration.
        duration_diff = abs(chapter_duration - original_duration)
        if duration_diff > 0.1:
            error_msg = (
                f"Duration mismatch for {url}: original duration={original_duration}; "
                f"downloaded duration={round(chapter_duration, 2)}"
            )
            logger.warning(error_msg)
            if self.exit_on_error:
                raise RuntimeError(error_msg)
            # NOTE(review): the mismatching chapter file is left on disk even when delete_chapter_files
            # is True -- confirm whether that is intended.
            return [self._build_error_entry(data_entry, error_msg)]

        for utt in utterances:
            audio_path = self.audio_dir / utt["audio_filepath"]
            offset = utt["offset"]
            start_sample = librosa.time_to_samples(offset, sr=sr)
            end_sample = librosa.time_to_samples(offset + utt["duration"], sr=sr)
            sf.write(file=audio_path, data=chapter_audio[start_sample:end_sample], samplerate=int(sr))

        if self.delete_chapter_files:
            chapter_path.unlink()

        return []