# Source code for sdp.processors.datasets.mcv.create_initial_manifest
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# To convert mp3 files to wav using sox, you must have sox installed with mp3 support.
# For example: sudo apt-get install libsox-fmt-mp3
import csv
import glob
import os
from pathlib import Path
from typing import Tuple
import sox
from sox import Transformer
from tqdm.contrib.concurrent import process_map
from sdp.logging import logger
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import extract_archive
# [docs]
class CreateInitialManifestMCV(BaseParallelProcessor):
    """Processor to create initial manifest for the Mozilla Common Voice (MCV) dataset.

    Dataset link: https://commonvoice.mozilla.org/

    Extracts raw MCV data for the specified language and creates an initial manifest
    using the transcripts provided in the raw data.

    Args:
        raw_data_dir (str): the path to the directory containing the raw data archive file.
            Needs to be manually downloaded from https://commonvoice.mozilla.org/.
        extract_archive_dir (str): directory where the extracted data will be saved.
        resampled_audio_dir (str): directory where the resampled audio will be saved.
        data_split (str): "train", "dev" or "test".
        language_id (str): the ID of the language of the data. E.g., "en", "es", "it", etc.
        already_extracted (bool): if True, we will not try to extract the raw data.
            Defaults to False.
        target_samplerate (int): sample rate (Hz) to use for resampling.
            Defaults to 16000.
        target_nchannels (int): number of channels to create during resampling process.
            Defaults to 1.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription (with capitalization and punctuation)>,
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        extract_archive_dir: str,
        resampled_audio_dir: str,
        data_split: str,
        language_id: str,
        already_extracted: bool = False,
        target_samplerate: int = 16000,
        target_nchannels: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.extract_archive_dir = extract_archive_dir
        self.resampled_audio_dir = resampled_audio_dir
        self.data_split = data_split
        self.language_id = language_id
        self.already_extracted = already_extracted
        self.target_samplerate = target_samplerate
        self.target_nchannels = target_nchannels
        # Both attributes are filled in by prepare(). They are initialised here so
        # that read_manifest() can detect a missing prepare() call and raise its
        # RuntimeError instead of failing with an AttributeError.
        self.transcription_file = None
        self.audio_path_prefix = None

    def prepare(self):
        """Extract the raw archive (unless already done) and resolve the data paths.

        Sets ``self.audio_path_prefix`` to the ``clips`` folder and
        ``self.transcription_file`` to the ``<data_split>.tsv`` file of the
        extracted data, and creates the output directory for resampled audio.

        Raises:
            RuntimeError: if extraction is requested and ``raw_data_dir`` does not
                contain exactly one ``*<language_id>.tar.gz`` archive.
        """
        os.makedirs(self.raw_data_dir, exist_ok=True)

        if not self.already_extracted:
            tar_gz_files = glob.glob(str(self.raw_data_dir) + f"/*{self.language_id}.tar.gz")
            if not tar_gz_files:
                # Report the pattern that was actually searched for, including the language id.
                raise RuntimeError(
                    f"Did not find any file matching {self.raw_data_dir}/*{self.language_id}.tar.gz. "
                    "For MCV dataset we cannot automatically download the data, so "
                    "make sure to get the data from https://commonvoice.mozilla.org/ "
                    "and put it in the 'raw_data_dir' folder."
                )
            if len(tar_gz_files) > 1:
                raise RuntimeError(
                    f"Expecting exactly one *{self.language_id}.tar.gz file in directory {self.raw_data_dir}"
                )

            data_folder = extract_archive(tar_gz_files[0], self.extract_archive_dir)
            data_root = Path(data_folder)
        else:
            data_root = Path(self.extract_archive_dir) / self.language_id

        self.audio_path_prefix = str(data_root / "clips")
        self.transcription_file = str(data_root / (self.data_split + ".tsv"))
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def read_manifest(self):
        """Read the ``(path, sentence)`` pairs from the transcription TSV file.

        Returns:
            list of ``(path, sentence)`` tuples, one per data row of the TSV file.

        Raises:
            RuntimeError: if ``self.transcription_file`` has not been set yet.
        """
        if self.transcription_file is None:
            raise RuntimeError("self.process has to be called before processing the data.")

        # newline="" lets the csv module do its own line-ending handling.
        with open(self.transcription_file, "rt", encoding="utf8", newline="") as csvfile:
            # DictReader consumes the header row itself to obtain the field names,
            # so every row it yields is data; nothing must be skipped manually.
            reader = csv.DictReader(csvfile, delimiter="\t")
            dataset_entries = [(row["path"], row["sentence"]) for row in reader]
        return dataset_entries

    def process_dataset_entry(self, data_entry: Tuple[str, str]):
        """Resample one clip to WAV (if not done yet) and build its manifest entry.

        Args:
            data_entry: ``(path, sentence)`` tuple as produced by ``read_manifest``;
                the path is joined onto ``self.audio_path_prefix``.

        Returns:
            single-element list with a ``DataEntry`` holding ``audio_filepath``,
            ``duration`` and ``text``.
        """
        file_path, text = data_entry
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        transcript_text = text.strip()

        audio_path = os.path.join(self.audio_path_prefix, file_path)
        output_wav_path = os.path.join(self.resampled_audio_dir, file_name + ".wav")

        # An existing output file is reused, so repeated runs skip the conversion.
        if not os.path.exists(output_wav_path):
            tfm = Transformer()
            tfm.rate(samplerate=self.target_samplerate)
            tfm.channels(n_channels=self.target_nchannels)
            tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)

        data = {
            "audio_filepath": output_wav_path,
            "duration": float(sox.file_info.duration(output_wav_path)),
            "text": transcript_text,
        }
        return [DataEntry(data=data)]