# Source code for sdp.processors.datasets.mtedx.create_initial_manifest

import os
from pathlib import Path
from typing import List
import librosa
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.utils.common import download_file, extract_archive

# URL template of the per-language MTedX archive; the ``{language_id}``
# placeholder is filled in with ``str.format`` before downloading.
MTEDX_URL = "https://www.openslr.org/resources/100/mtedx_{language_id}.tgz"


# [docs]  (link marker left over from the rendered documentation page)
class CreateInitialManifestMTEDX(BaseParallelProcessor):
    """Processor to create initial manifest for the Multilingual TEDx (MTedX) dataset.

    Dataset link: https://www.openslr.org/100/

    Downloads dataset for the specified language and creates initial manifest with the provided
    audio and vtt files.

    Args:
        raw_data_dir (str): the directory where the downloaded data will be/is saved.
            This is also where the extracted and processed data will be.
        language_id (str): the ID of the language of the data. E.g., "en", "es", "it", etc.
            It selects the archive ``mtedx_<language_id>.tgz`` and the extracted
            ``<language_id>-<language_id>`` folder.
        data_split (str): name of the split sub-directory under ``data/`` to read
            (e.g. "train" or "test").
        already_extracted (bool): if True, the data is assumed to be already present in
            ``raw_data_dir`` in extracted form, so neither the download nor the extraction
            is attempted. Defaults to False.
        **kwargs: forwarded unchanged to the base processor constructor.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "vtt_filepath": <path to the corresponding vtt file>,
                "duration": <duration of the audio in seconds>
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        language_id: str,
        data_split: str,
        already_extracted: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.language_id = language_id
        self.data_split = data_split
        self.already_extracted = already_extracted

    def prepare(self):
        """Download and extract the data (unless already done) and set the data paths.

        After this call ``self.audio_path_prefix`` and ``self.vtt_path_prefix`` point to the
        ``wav`` and ``vtt`` folders of the selected split.
        """
        os.makedirs(self.raw_data_dir, exist_ok=True)

        if not self.already_extracted:
            url = MTEDX_URL.format(language_id=self.language_id)
            archive_path = self.raw_data_dir / os.path.basename(url)
            # The archive is only needed for extraction, so it is fetched only when an
            # extraction is actually going to happen and it is not on disk yet.
            if not archive_path.exists():
                download_file(url, str(self.raw_data_dir))
            extract_archive(str(archive_path), str(self.raw_data_dir))

        data_folder = self.raw_data_dir / f"{self.language_id}-{self.language_id}" / "data" / self.data_split
        self.audio_path_prefix = data_folder / "wav"
        self.vtt_path_prefix = data_folder / "vtt"

    def read_manifest(self) -> List[tuple]:
        """Create the entries of the initial manifest from the audio and vtt files.

        Returns:
            A list of ``(audio_filepath, vtt_filepath)`` string pairs, one per entry found in
            the audio folder, ordered by file name.
        """
        entries = []
        # ``os.listdir`` returns names in arbitrary order; sorting keeps the manifest
        # reproducible between runs and machines.
        for audio_file in sorted(os.listdir(self.audio_path_prefix)):
            # Strip only the extension so that names containing extra dots stay intact.
            talk_id = Path(audio_file).stem
            vtt_filepath = os.path.join(self.vtt_path_prefix, f"{talk_id}.{self.language_id}.vtt")
            audio_filepath = os.path.join(self.audio_path_prefix, audio_file)
            entries.append((audio_filepath, vtt_filepath))
        return entries

    def process_dataset_entry(self, data_entry) -> List[DataEntry]:
        """Build the manifest entry for one ``(audio_filepath, vtt_filepath)`` pair.

        Args:
            data_entry: a pair as produced by :meth:`read_manifest`.

        Returns:
            A single-element list with a ``DataEntry`` holding the two paths and the audio
            duration reported by ``librosa.get_duration``.
        """
        audio_filepath, vtt_filepath = data_entry

        data = {
            'audio_filepath': audio_filepath,
            'vtt_filepath': vtt_filepath,
            'duration': float(librosa.get_duration(path=audio_filepath)),
        }
        return [DataEntry(data=data)]