Source code for sdp.processors.datasets.mtedx.create_initial_manifest

import os
from pathlib import Path
from typing import List
import librosa
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import download_file, extract_archive

# URL template for the mTEDx archive hosted on OpenSLR (resource 100).
# The ``{language_id}`` placeholder is filled in with ``str.format`` before downloading.
MTEDX_URL = "https://www.openslr.org/resources/100/mtedx_{language_id}.tgz"

class CreateInitialManifestMTEDX(BaseParallelProcessor):
    """Processor to create initial manifest for the Multilingual TEDx (mTEDx) dataset.

    Dataset link: https://www.openslr.org/100/

    Downloads the dataset for the specified language and creates an initial
    manifest from the provided audio and vtt files.

    Args:
        raw_data_dir (str): the directory where the downloaded data will be/is
            saved. This is also where the extracted and processed data will be.
        language_id (str): the ID of the language of the data.
            E.g., "en", "es", "it", etc.
        data_split (str): "train", "dev" or "test".
        already_extracted (bool): if True, we will not try to download or
            extract the raw data. Defaults to False.
        **kwargs: forwarded unchanged to ``BaseParallelProcessor.__init__``.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "vtt_filepath": <path to the corresponding vtt file>,
                "duration": <duration of the audio in seconds>
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        language_id: str,
        data_split: str,
        already_extracted: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.language_id = language_id
        self.data_split = data_split
        self.already_extracted = already_extracted

    def prepare(self):
        """Download and extract the data (unless already done).

        Also sets ``self.audio_path_prefix`` and ``self.vtt_path_prefix`` to the
        ``wav`` and ``vtt`` sub-directories of the selected data split; both are
        read later by :meth:`read_manifest`.
        """
        os.makedirs(self.raw_data_dir, exist_ok=True)

        # The archive is only needed as input to extraction, so when the data
        # is already extracted there is no reason to fetch it again.
        if not self.already_extracted:
            url = MTEDX_URL.format(language_id=self.language_id)
            archive_path = self.raw_data_dir / os.path.basename(url)
            if not archive_path.exists():
                download_file(url, str(self.raw_data_dir))
            extract_archive(str(archive_path), str(self.raw_data_dir))

        # Expected layout: <raw_data_dir>/<lang>-<lang>/data/<split>/{wav,vtt}
        data_folder = (
            self.raw_data_dir
            / f"{self.language_id}-{self.language_id}"
            / "data"
            / self.data_split
        )
        self.audio_path_prefix = data_folder / "wav"
        self.vtt_path_prefix = data_folder / "vtt"

    def read_manifest(self) -> List[tuple]:
        """Create the entries of the initial manifest from audio and vtt files.

        Returns:
            A list of ``(audio_filepath, vtt_filepath)`` string pairs, one per
            entry found in the audio directory. The vtt path is built as
            ``<name up to the first dot>.<language_id>.vtt``; its existence is
            not checked here.
        """
        audio_filepaths = []
        # os.listdir returns entries in arbitrary order; sort them so that the
        # generated manifest is reproducible across runs and machines.
        for audio_file in sorted(os.listdir(self.audio_path_prefix)):
            stem = audio_file.split('.')[0]
            vtt_filepath = os.path.join(
                self.vtt_path_prefix, stem + "." + self.language_id + ".vtt"
            )
            audio_filepath = os.path.join(self.audio_path_prefix, audio_file)
            audio_filepaths.append((audio_filepath, vtt_filepath))
        return audio_filepaths

    def process_dataset_entry(self, data_entry) -> List[DataEntry]:
        """Build the manifest record for a single audio/vtt pair.

        Args:
            data_entry: an ``(audio_filepath, vtt_filepath)`` pair as produced
                by :meth:`read_manifest`.

        Returns:
            A single-element list with a ``DataEntry`` holding the two paths
            and the audio duration in seconds (as reported by librosa).
        """
        audio_filepath, vtt_filepath = data_entry
        data = {
            'audio_filepath': audio_filepath,
            'vtt_filepath': vtt_filepath,
            'duration': float(librosa.get_duration(path=audio_filepath)),
        }
        return [DataEntry(data=data)]