Source code for sdp.processors.huggingface.create_initial_manifest

import os
import glob

import soundfile as sf

from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.logging import logger
from typing import Optional

class CreateInitialManifestHuggingFace(BaseParallelProcessor):
    """Processor to create initial manifest for HuggingFace dataset.

    Loads a HuggingFace dataset (from disk or by downloading it), writes every
    sample's audio as a WAV file and creates an initial manifest.

    Args:
        dataset_name (str): the name of the dataset. E.g., "tarteel-ai/everyayah"
        raw_data_dir (str): the path to the directory containing the raw dataset
            files (a ``*.hf`` entry saved with ``save_to_disk``). Only used when
            ``already_downloaded`` is True. Defaults to None.
        resampled_audio_dir (str): directory where the resampled audio will be saved.
        data_split (str): "train", "validation" or "test".
        already_downloaded (bool): if True, the dataset is loaded from
            ``raw_data_dir`` instead of HuggingFace. If no local copy can be
            found there, the dataset is downloaded. Defaults to False.
        target_samplerate (int): sample rate (Hz) to use for resampling.
            Defaults to 16000.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription (with capitalization and punctuation)>,
            }
    """

    def __init__(
        self,
        dataset_name: str,
        resampled_audio_dir: str,
        data_split: str,
        raw_data_dir: Optional[str] = None,
        already_downloaded: bool = False,
        target_samplerate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.data_split = data_split
        self.target_samplerate = target_samplerate
        self.resampled_audio_dir = resampled_audio_dir
        self.dataset_name = dataset_name
        self.raw_data_dir = raw_data_dir
        self.already_downloaded = already_downloaded

    def prepare(self) -> None:
        """Create the output audio directory if it does not exist yet."""
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def _load_local_dataset(self):
        """Return the dataset stored under ``raw_data_dir``, or None if there is none."""
        # Imported lazily, like in read_manifest, so the module can be imported
        # without the `datasets` package installed.
        import datasets

        # raw_data_dir is optional, so guard against None before touching the disk.
        if self.raw_data_dir and os.path.exists(self.raw_data_dir):
            # glob already returns paths prefixed with raw_data_dir, so the match
            # is used as-is. Sorting makes the choice deterministic when several
            # *.hf entries exist.
            pattern = os.path.join(glob.escape(self.raw_data_dir), "*.hf")
            hf_files = sorted(glob.glob(pattern))
            if hf_files:
                return datasets.load_from_disk(hf_files[0])

        logger.info("Dataset not found locally. Initiating download from Hugging Face.")
        return None

    def _cast_audio_to_target_rate(self) -> None:
        """Make decoded audio come out at ``target_samplerate``.

        Without this the samples would be written with ``target_samplerate`` in
        the WAV header while still holding data at the dataset's native rate.
        """
        import datasets

        features = getattr(self.dataset, "features", None)
        audio_feature = features.get("audio") if features else None
        # Only touch columns that really are Audio features and need a new rate;
        # anything else is left exactly as loaded.
        if isinstance(audio_feature, datasets.Audio) and audio_feature.sampling_rate != self.target_samplerate:
            self.dataset = self.dataset.cast_column(
                "audio", datasets.Audio(sampling_rate=self.target_samplerate)
            )

    def read_manifest(self):
        """Load the dataset into ``self.dataset`` and return the range of sample indices."""
        import datasets

        # checking if dataset should be loaded from disk
        self.dataset = self._load_local_dataset() if self.already_downloaded else None

        # Either a download was requested, or the local copy is missing and we
        # fall back to downloading (as the log message above announces).
        if self.dataset is None:
            logger.info(f"Initiating download of dataset '{self.dataset_name}' from Hugging Face.")
            self.dataset = datasets.load_dataset(self.dataset_name, split=self.data_split)
            logger.info(f"Finished download of dataset '{self.dataset_name}' from Hugging Face.")

        self._cast_audio_to_target_rate()
        return range(0, len(self.dataset))

    def process_dataset_entry(self, data_id):
        """Write sample ``data_id`` as ``<data_id>.wav`` and return its manifest entry."""
        sample_data = self.dataset[data_id]
        sample_audio = sample_data["audio"]["array"]
        audio_filepath = os.path.join(self.resampled_audio_dir, f"{data_id}.wav")
        sf.write(
            audio_filepath,
            sample_audio,
            self.target_samplerate,
        )
        duration = len(sample_audio) / self.target_samplerate
        text = sample_data["text"]

        return [
            DataEntry(
                data={
                    # NOTE(review): the manifest path is hard-coded to "audios/"
                    # and only matches the written file when resampled_audio_dir
                    # is a directory named "audios" - confirm this is intended.
                    "audio_filepath": os.path.join("audios", f"{data_id}.wav"),
                    "duration": duration,
                    "text": text,
                }
            )
        ]