# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from pathlib import Path
from typing import Optional

import librosa
from sox import Transformer

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import download_file, extract_archive

MLS_URL_NO_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}.tar.gz"
MLS_URL_OPUS = "https://dl.fbaipublicfiles.com/mls/mls_{language}_opus.tar.gz"

class CreateInitialManifestMLS(BaseParallelProcessor):
"""Processor to create initial manifest for the Multilingual LibriSpeech (MLS) dataset.
Dataset link: https://www.openslr.org/94/
Downloads and unzips raw MLS data for the specified language,
and creates an initial manifest using the transcripts provided in the raw data.
Args:
raw_data_dir (str): the directory where the downloaded data will be/is saved.
This is also where the extracted and processed data will be.
        language (str): the language of the data you wish to download.
            This will be used to format the URL from which we attempt to download the data.
            E.g., "english", "italian", "spanish", etc.
data_split (str): "train", "dev" or "test".
resampled_audio_dir (str or None): if specified, the directory where the resampled
wav files will be stored. If not specified, the audio will not be resampled and
the parameters ``target_samplerate`` and ``target_nchannels`` will be ignored.
target_samplerate (int): sample rate (Hz) to use for resampling. This parameter will
be ignored if ``resampled_audio_dir`` is ``None``.
Defaults to 16000.
target_nchannels (int): number of channels to create during resampling process. This
parameter will be ignored if ``resampled_audio_dir`` is ``None``.
Defaults to 1.
        use_opus_archive (bool): if ``True``, will use the version of the archive file which
            contains audio files saved in the OPUS format instead of FLAC. The OPUS files take up
            less disk space than the FLAC files, at the cost of lower audio quality.
            If ``True``, the parameter ``resampled_audio_dir`` must be ``None``, as resampling OPUS
            audio files is currently not supported.
            Defaults to False.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription>,
            }
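
    Example:
        A minimal usage sketch (the paths below are placeholders, and the
        ``output_manifest_file`` argument and ``process()`` call are assumed to be
        provided by the base processor class)::

            # NOTE: illustrative values only; adjust the paths for your setup
            processor = CreateInitialManifestMLS(
                raw_data_dir="/data/mls_raw",
                language="italian",
                data_split="train",
                resampled_audio_dir="/data/mls_resampled_wav",
                output_manifest_file="/data/manifest_initial.json",
            )
            processor.process()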
"""
def __init__(
self,
raw_data_dir: str,
language: str,
data_split: str,
resampled_audio_dir: Optional[str],
target_samplerate: int = 16000,
target_nchannels: int = 1,
use_opus_archive: bool = False,
**kwargs,
):
super().__init__(**kwargs)
self.raw_data_dir = Path(raw_data_dir)
self.language = language
self.data_split = data_split
self.resampled_audio_dir = Path(resampled_audio_dir) if resampled_audio_dir else None
self.target_samplerate = target_samplerate
self.target_nchannels = target_nchannels
self.use_opus_archive = use_opus_archive
# validate params
if self.use_opus_archive and self.resampled_audio_dir:
raise ValueError(
f"`use_opus_archive` is True and `resampled_audio_dir` is not None, but we currently do not"
" support resampling OPUS-format audio, please either set `use_opus_archive` to False or"
" resampled_audio_dir to None."
)
if not resampled_audio_dir:
logger.info(
"`resampled_audio_dir` is None => will not attempt to resample audio. Please note if you have"
" specified `target_samplerate` or `target_nchannels`, they will be ignored."
)
# will be initialized in self.prepare method
self.audio_path_prefix = None
self.transcription_file = None
def prepare(self):
"""Downloading and extracting data (unless already done)."""
os.makedirs(self.raw_data_dir, exist_ok=True)
if self.use_opus_archive:
url = MLS_URL_OPUS.format(language=self.language)
if not (self.raw_data_dir / f"mls_{self.language}_opus.tar.gz").exists():
download_file(url, str(self.raw_data_dir))
else:
url = MLS_URL_NO_OPUS.format(language=self.language)
if not (self.raw_data_dir / f"mls_{self.language}.tar.gz").exists():
download_file(url, str(self.raw_data_dir))
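        # the archive is extracted into raw_data_dir (extract_archive returns the extracted folder),
        # from which we locate the audio directory and transcript file for the requested split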
data_folder = extract_archive(str(self.raw_data_dir / os.path.basename(url)), str(self.raw_data_dir))
self.audio_path_prefix = str(Path(data_folder) / self.data_split / "audio")
self.transcription_file = str(Path(data_folder) / self.data_split / "transcripts.txt")
def read_manifest(self):
"""Reading the initial data line-by-line."""
if self.transcription_file is None:
raise RuntimeError("self.process has to be called before processing the data.")
with open(self.transcription_file, "rt", encoding="utf8") as fin:
dataset_entries = fin.readlines()
return dataset_entries
def process_dataset_entry(self, data_entry: str):
"""Processing the data entries.
Converts all audio into wav format and outputs filepath, duration and
transcription text.
"""
if len(data_entry.split("\t")) != 2:
raise RuntimeError(f"have more than one tab in line {data_entry}")
utt_id, text = data_entry.split("\t")
transcript_text = text.strip()
# specify src_audio_path
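        # MLS utterance ids have the form "<speaker>_<book>_<index>", and the audio files live in
        # <speaker>/<book>/ subdirectories, so the first two id fields give the relative path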
if self.use_opus_archive:
src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".opus")
else:
src_audio_path = os.path.join(self.audio_path_prefix, *utt_id.split("_")[:2], utt_id + ".flac")
# specify tgt_audio_path
if self.resampled_audio_dir:
tgt_audio_path = os.path.join(self.resampled_audio_dir, *utt_id.split("_")[:2], utt_id + ".wav")
            os.makedirs(os.path.dirname(tgt_audio_path), exist_ok=True)
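            # convert to wav at the target sample rate and channel count, unless the file already exists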
if not os.path.exists(tgt_audio_path):
tfm = Transformer()
tfm.rate(samplerate=self.target_samplerate)
tfm.channels(n_channels=self.target_nchannels)
tfm.build(input_filepath=src_audio_path, output_filepath=tgt_audio_path)
else:
tgt_audio_path = src_audio_path
data = {
"audio_filepath": tgt_audio_path,
"duration": float(librosa.get_duration(path=tgt_audio_path)),
"text": transcript_text,
}
return [DataEntry(data=data)]