# Source code for sdp.processors.datasets.librispeech.create_initial_manifest

# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fnmatch
import glob
import json
import os
import typing

from sdp.processors.base_processor import BaseProcessor
from sdp.utils.common import download_file, extract_archive


def get_librispeech_url_list(split: str) -> str:
    """Return the download URL of the archive for one dataset split.

    Despite the ``_list`` suffix in the name, a single URL string is returned.

    Args:
        split: Name of the split, e.g. ``"dev-clean"`` or ``"train-clean-5"``.

    Returns:
        The URL of the ``.tar.gz`` archive registered for ``split``.

    Raises:
        ValueError: If ``split`` is not a registered split name. The message
            lists every accepted name in registration order.
    """
    archive_urls = {
        "dev-clean": "https://openslr.org/resources/12/dev-clean.tar.gz",
        "dev-other": "https://openslr.org/resources/12/dev-other.tar.gz",
        "test-clean": "https://openslr.org/resources/12/test-clean.tar.gz",
        "test-other": "https://openslr.org/resources/12/test-other.tar.gz",
        "train-clean-100": "https://openslr.org/resources/12/train-clean-100.tar.gz",
        "train-clean-360": "https://openslr.org/resources/12/train-clean-360.tar.gz",
        "train-other-500": "https://openslr.org/resources/12/train-other-500.tar.gz",
        "dev-clean-2": "https://www.openslr.org/resources/31/dev-clean-2.tar.gz",
        "train-clean-5": "https://www.openslr.org/resources/31/train-clean-5.tar.gz",
    }

    # Known split: hand back its archive URL straight away.
    if split in archive_urls:
        return archive_urls[split]

    # Unknown split: tell the caller exactly which names are accepted.
    known_names = ", ".join(archive_urls)
    raise ValueError(f"Invalid dataset split '{split}'. Valid options are: {known_names}")



# [docs]
class CreateInitialManifestLibrispeech(BaseProcessor):
    """Processor to create initial manifest for the Librispeech dataset.

    Dataset link: https://openslr.org/12
    Dataset link: https://openslr.org/31

    Will download the archive of the requested split, extract it, and create
    a manifest file with the "audio_filepath" and "text" fields.

    Args:
        split (str): Which split should be processed. Exactly one split name
            is accepted per processor instance (it is looked up in the URL
            table of ``get_librispeech_url_list``).
            Options are:

            - ``"dev-clean"``
            - ``"dev-other"``
            - ``"test-clean"``
            - ``"test-other"``
            - ``"train-clean-100"``
            - ``"train-clean-360"``
            - ``"train-other-500"``
            - ``"dev-clean-2"``
            - ``"train-clean-5"``

        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.
        **kwargs: Forwarded unchanged to ``BaseProcessor.__init__``.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "text": <transcription>,
            }
    """

    def __init__(
        self,
        split: str,
        raw_data_dir: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.split = split
        self.raw_data_dir = raw_data_dir

    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
        """Parse one transcript file into a list of manifest entries.

        Every non-blank line is read as ``<utterance id> <transcription>``,
        split on the first space. The flac files are assumed to be located in
        the same directory as the transcript file and to be named
        ``<utterance id>.flac``.

        Args:
            file_path: Path to a ``*.trans.txt`` transcript file (UTF-8).

        Returns:
            One dict per utterance with the keys ``"audio_filepath"``
            (absolute path of the flac file; its existence is not checked)
            and ``"text"`` (transcription with surrounding whitespace
            removed). A line that carries only an id yields an empty
            ``"text"``; blank lines are skipped.
        """
        entries = []
        audio_dir = os.path.dirname(file_path)

        print(f"Processing transcript file: {file_path}")
        with open(file_path, encoding="utf-8") as fin:
            for raw_line in fin:
                line = raw_line.rstrip("\r\n")
                # A blank (e.g. trailing) line carries no utterance; skipping
                # it avoids failing on input that is otherwise well-formed.
                if not line.strip():
                    continue

                # partition() never raises: when there is no space the whole
                # line is the id and the transcription is empty.
                utterance_id, _, transcription = line.partition(" ")

                flac_file = os.path.join(audio_dir, utterance_id + ".flac")
                entries.append(
                    {
                        "audio_filepath": os.path.abspath(flac_file),
                        "text": transcription.strip(),
                    }
                )
        return entries

    def process_data(self, data_folder: str, manifest_file: str) -> None:
        """Collect every transcript of the split and write the manifest.

        Args:
            data_folder: Folder that contains the extracted ``LibriSpeech``
                directory.
            manifest_file: Path of the manifest to write; one JSON object per
                line.

        Raises:
            FileNotFoundError: If ``<data_folder>/LibriSpeech/<split>`` is not
                an existing directory.
        """
        split_folder = os.path.join(data_folder, "LibriSpeech", self.split)
        if not os.path.isdir(split_folder):
            raise FileNotFoundError(f"Directory for split '{self.split}' not found at {split_folder}")

        transcript_files = []
        for root, dirnames, filenames in os.walk(split_folder):
            # os.walk order depends on the filesystem; sorting the directory
            # list in place (and the file names) makes the manifest order
            # reproducible across machines and runs.
            dirnames.sort()
            transcript_files.extend(
                os.path.join(root, filename) for filename in sorted(fnmatch.filter(filenames, "*.trans.txt"))
            )

        # Parse everything before touching the manifest so that a failure
        # while reading transcripts does not leave a truncated manifest.
        entries = []
        for transcript_file in transcript_files:
            entries.extend(self.process_transcript(transcript_file))

        with open(manifest_file, "w", encoding="utf-8") as fout:
            for entry in entries:
                fout.write(json.dumps(entry) + "\n")

    def download_extract_files(self, dst_folder: str) -> None:
        """Download the archive of ``self.split`` and extract it in place.

        Args:
            dst_folder: Folder that receives both the downloaded archive and
                its extracted content; created if missing.

        Raises:
            ValueError: If ``self.split`` is not a known split name.
        """
        # Resolve the URL first so an invalid split fails before any
        # directory is created or network access is attempted.
        archive_url = get_librispeech_url_list(self.split)

        os.makedirs(dst_folder, exist_ok=True)

        download_file(archive_url, str(dst_folder))
        # NOTE(review): assumes download_file stores the archive in dst_folder
        # under "<split>.tar.gz" (the URL's base name) - confirm against
        # sdp.utils.common.download_file.
        archive_path = os.path.join(str(dst_folder), f"{self.split}.tar.gz")
        extract_archive(archive_path, str(dst_folder), force_extract=True)

    def process(self):
        """Run the full pipeline: download, extract, then write the manifest.

        The manifest is written to ``self.output_manifest_file``, which is not
        set in this class (presumably provided by ``BaseProcessor``).
        """
        self.download_extract_files(self.raw_data_dir)
        self.process_data(self.raw_data_dir, self.output_manifest_file)