# Source code for sdp.processors.datasets.librispeech.create_initial_manifest
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import fnmatch
import glob
import json
import os
import typing
from sdp.processors.base_processor import BaseProcessor
from sdp.utils.common import download_file, extract_archive
def get_librispeech_url_list(split: str) -> str:
    """Return the download URL of the archive for a single dataset split.

    Args:
        split: Name of the split to look up, e.g. ``"dev-clean"``.

    Returns:
        URL of the ``.tar.gz`` archive registered for ``split``.

    Raises:
        ValueError: If ``split`` is not one of the known split names. The
            message lists every valid option.
    """
    archive_by_split = {
        "dev-clean": "https://openslr.org/resources/12/dev-clean.tar.gz",
        "dev-other": "https://openslr.org/resources/12/dev-other.tar.gz",
        "test-clean": "https://openslr.org/resources/12/test-clean.tar.gz",
        "test-other": "https://openslr.org/resources/12/test-other.tar.gz",
        "train-clean-100": "https://openslr.org/resources/12/train-clean-100.tar.gz",
        "train-clean-360": "https://openslr.org/resources/12/train-clean-360.tar.gz",
        "train-other-500": "https://openslr.org/resources/12/train-other-500.tar.gz",
        "dev-clean-2": "https://www.openslr.org/resources/31/dev-clean-2.tar.gz",
        "train-clean-5": "https://www.openslr.org/resources/31/train-clean-5.tar.gz",
    }

    # Happy path first: a known split maps straight to its archive URL.
    if split in archive_by_split:
        return archive_by_split[split]

    # Unknown split: report all accepted names, in declaration order.
    valid_splits = ", ".join(archive_by_split)
    raise ValueError(f"Invalid dataset split '{split}'. Valid options are: {valid_splits}")
# [docs]
class CreateInitialManifestLibrispeech(BaseProcessor):
    """Processor to create initial manifest for the Librispeech dataset.

    Dataset link: https://openslr.org/12
    Dataset link: https://openslr.org/31

    Will download the archive for the requested split, extract it, and create
    a manifest file with the "audio_filepath" and "text" fields.

    Args:
        split (str): Which dataset split should be processed (exactly one).
            Options are:

            - ``"dev-clean"``
            - ``"dev-other"``
            - ``"test-clean"``
            - ``"test-other"``
            - ``"train-clean-100"``
            - ``"train-clean-360"``
            - ``"train-other-500"``
            - ``"dev-clean-2"``
            - ``"train-clean-5"``
        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "text": <transcription>,
            }
    """

    def __init__(
        self,
        split: str,
        raw_data_dir: str,
        **kwargs,
    ):
        # Remaining keyword arguments are forwarded untouched to the base processor.
        super().__init__(**kwargs)
        self.split = split
        self.raw_data_dir = raw_data_dir

    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
        """Parse a transcript file into a list of manifest entries.

        Every non-blank line is expected to have the form
        ``<utterance id> <text>``. We assume that the flac files are located
        in the same directory as the transcript file and are named
        ``<utterance id>.flac``.

        Args:
            file_path: Path to a ``*.trans.txt`` transcript file.

        Returns:
            One dict per utterance with the absolute ``"audio_filepath"`` and
            the whitespace-stripped ``"text"``.

        Raises:
            ValueError: If a non-blank line contains no space separating the
                utterance id from its text.
        """
        entries = []
        root = os.path.dirname(file_path)
        print(f"Processing transcript file: {file_path}")
        with open(file_path, encoding="utf-8") as fin:
            for line_number, line in enumerate(fin, start=1):
                # Blank (or whitespace-only) lines carry no utterance; skip
                # them instead of crashing or emitting a bogus ".flac" entry.
                if not line.strip():
                    continue
                # Split on the first space only: the id never contains spaces,
                # the text usually does.
                utterance_id, separator, text = line.partition(" ")
                if not separator:
                    raise ValueError(
                        f"Malformed transcript line {line_number} in {file_path}: "
                        f"expected '<utterance id> <text>', got {line!r}"
                    )
                flac_file = os.path.join(root, utterance_id + ".flac")
                entries.append(
                    {
                        "audio_filepath": os.path.abspath(flac_file),
                        "text": text.strip(),
                    }
                )
        return entries

    def process_data(self, data_folder: str, manifest_file: str) -> None:
        """Collect all transcripts of the split and write the manifest.

        Looks for ``*.trans.txt`` files anywhere under
        ``<data_folder>/LibriSpeech/<split>`` and writes one JSON object per
        utterance (one per line) to ``manifest_file``.

        Args:
            data_folder: Folder that contains the extracted ``LibriSpeech`` directory.
            manifest_file: Path of the manifest file to (over)write.

        Raises:
            FileNotFoundError: If the directory of the split does not exist.
        """
        split_folder = os.path.join(data_folder, "LibriSpeech", self.split)
        # isdir (not exists): a regular file at this path would otherwise
        # slip through and silently yield an empty manifest.
        if not os.path.isdir(split_folder):
            raise FileNotFoundError(f"Directory for split '{self.split}' not found at {split_folder}")

        transcript_files = []
        for root, dirnames, filenames in os.walk(split_folder):
            # os.walk order is filesystem-dependent; sorting the directory
            # list in place (and the file names) makes the manifest order
            # reproducible across machines.
            dirnames.sort()
            for filename in sorted(fnmatch.filter(filenames, "*.trans.txt")):
                transcript_files.append(os.path.join(root, filename))

        # Parse everything before opening the manifest so that a malformed
        # transcript does not leave a partially written file behind.
        entries = []
        for transcript_file in transcript_files:
            entries.extend(self.process_transcript(transcript_file))

        with open(manifest_file, "w", encoding="utf-8") as fout:
            for entry in entries:
                fout.write(json.dumps(entry) + "\n")

    def download_extract_files(self, dst_folder: str) -> None:
        """Download the archive of the split into ``dst_folder`` and extract it there.

        Args:
            dst_folder: Target folder; created if it does not exist.

        Raises:
            ValueError: If ``self.split`` is not a known split name.
        """
        os.makedirs(dst_folder, exist_ok=True)
        download_file(get_librispeech_url_list(self.split), str(dst_folder))
        # The downloaded archive is expected to be named after the split,
        # matching the last component of every URL in the split table.
        data_file = os.path.join(dst_folder, f"{self.split}.tar.gz")
        extract_archive(str(data_file), str(dst_folder), force_extract=True)

    def process(self):
        """Download and extract the split, then write the initial manifest."""
        self.download_extract_files(self.raw_data_dir)
        # NOTE(review): output_manifest_file is not set in this class;
        # presumably provided by BaseProcessor — confirm.
        self.process_data(self.raw_data_dir, self.output_manifest_file)