Source code for sdp.processors.datasets.mcv.create_initial_manifest

# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# To convert mp3 files to wav using sox, you must have installed sox with mp3 support
# For example sudo apt-get install libsox-fmt-mp3
import csv
import glob
import os
from pathlib import Path
from typing import Tuple

import sox
from sox import Transformer
from tqdm.contrib.concurrent import process_map

from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import extract_archive



[docs]
class CreateInitialManifestMCV(BaseParallelProcessor):
    """Processor to create initial manifest for the Mozilla Common Voice (MCV) dataset.

    Dataset link: https://commonvoice.mozilla.org/

    Extracts raw MCV data for the specified language and creates an initial manifest
    using the transcripts provided in the raw data.

    Args:
        raw_data_dir (str): the path to the directory containing the raw data archive file.
            Needs to be manually downloaded from https://commonvoice.mozilla.org/.
        extract_archive_dir (str): directory where the extracted data will be saved.
        resampled_audio_dir (str): directory where the resampled audio will be saved.
        data_split (str): "train", "dev" or "test".
        language_id (str): the ID of the language of the data. E.g., "en", "es", "it", etc.
        already_extracted (bool): if True, we will not try to extract the raw data.
            Defaults to False.
        target_samplerate (int): sample rate (Hz) to use for resampling.
            Defaults to 16000.
        target_nchannels (int): number of channels to create during resampling process.
            Defaults to 1.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription (with capitalization and punctuation)>,
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        extract_archive_dir: str,
        resampled_audio_dir: str,
        data_split: str,
        language_id: str,
        already_extracted: bool = False,
        target_samplerate: int = 16000,
        target_nchannels: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.extract_archive_dir = extract_archive_dir
        self.resampled_audio_dir = resampled_audio_dir
        self.data_split = data_split
        self.language_id = language_id
        self.already_extracted = already_extracted
        self.target_samplerate = target_samplerate
        self.target_nchannels = target_nchannels

    def prepare(self):
        """Extracting data (unless already done)."""
        os.makedirs(self.raw_data_dir, exist_ok=True)

        if not self.already_extracted:
            tar_gz_files = glob.glob(str(self.raw_data_dir) + f"/*{self.language_id}.tar.gz")
            if not tar_gz_files:
                raise RuntimeError(
                    f"Did not find any file matching {self.raw_data_dir}/*.tar.gz. "
                    "For MCV dataset we cannot automatically download the data, so "
                    "make sure to get the data from https://commonvoice.mozilla.org/ "
                    "and put it in the 'raw_data_dir' folder."
                )
            elif len(tar_gz_files) > 1:
                raise RuntimeError(
                    f"Expecting exactly one *{self.language_id}.tar.gz file in directory {self.raw_data_dir}"
                )

            data_folder = extract_archive(tar_gz_files[0], self.extract_archive_dir)
            self.transcription_file = Path(data_folder)
        else:
            self.transcription_file = Path(self.extract_archive_dir) / self.language_id
        self.audio_path_prefix = str(self.transcription_file / "clips")
        self.transcription_file = str(self.transcription_file / (self.data_split + ".tsv"))
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def read_manifest(self):
        if self.transcription_file is None:
            raise RuntimeError("self.process has to be called before processing the data.")

        with open(self.transcription_file, "rt", encoding="utf8") as csvfile:
            reader = csv.DictReader(csvfile, delimiter="\t")
            next(reader, None)  # skip the headers
            dataset_entries = [(row["path"], row["sentence"]) for row in reader]
        return dataset_entries

    def process_dataset_entry(self, data_entry: Tuple[str, str]):
        file_path, text = data_entry
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        transcript_text = text.strip()

        audio_path = os.path.join(self.audio_path_prefix, file_path)
        output_wav_path = os.path.join(self.resampled_audio_dir, file_name + ".wav")

        if not os.path.exists(output_wav_path):
            tfm = Transformer()
            tfm.rate(samplerate=self.target_samplerate)
            tfm.channels(n_channels=self.target_nchannels)
            tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)

        data = {
            "audio_filepath": output_wav_path,
            "duration": float(sox.file_info.duration(output_wav_path)),
            "text": transcript_text,
        }

        return [DataEntry(data=data)]