# Source code for sdp.processors.huggingface.create_initial_manifest
import os
import glob
import soundfile as sf
from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
from sdp.logging import logger
from typing import Optional
# [docs]
class CreateInitialManifestHuggingFace(BaseParallelProcessor):
    """Processor to create initial manifest for HuggingFace dataset.

    Loads a HuggingFace dataset (from a local copy or from the Hub), writes
    every sample's audio to ``resampled_audio_dir`` as ``<index>.wav`` at
    ``target_samplerate`` and creates an initial manifest.

    Args:
        dataset_name (str): the name of the dataset. E.g., "tarteel-ai/everyayah"
        raw_data_dir (str): the path to the directory containing the raw dataset
            files. Only consulted when ``already_downloaded`` is True; the first
            ``*.hf`` entry (in sorted order) found there is loaded.
            Defaults to None.
        resampled_audio_dir (str): directory where the resampled audio will be saved.
        data_split (str): "train", "validation" or "test".
        already_downloaded (bool): if True, we first try to load the dataset
            from ``raw_data_dir`` and only fall back to downloading it from
            HuggingFace when no local copy is found.
            Defaults to False.
        target_samplerate (int): sample rate (Hz) to use for resampling.
            Defaults to 16000.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription (with capitalization and punctuation)>,
            }
    """

    def __init__(
        self,
        dataset_name: str,
        resampled_audio_dir: str,
        data_split: str,
        raw_data_dir: Optional[str] = None,
        already_downloaded: bool = False,
        target_samplerate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.data_split = data_split
        self.target_samplerate = target_samplerate
        self.resampled_audio_dir = resampled_audio_dir
        self.dataset_name = dataset_name
        self.raw_data_dir = raw_data_dir
        self.already_downloaded = already_downloaded

    def prepare(self):
        """Create the output directory for the audio files if it is missing."""
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def _load_local_dataset(self):
        """Return the dataset stored under ``raw_data_dir`` or None if there is none."""
        import datasets

        raw_dir = self.raw_data_dir
        # raw_data_dir is optional, so guard against None before touching the disk.
        if not raw_dir or not os.path.exists(raw_dir):
            return None

        # Sorted so that the chosen entry does not depend on filesystem order.
        hf_files = sorted(glob.glob(os.path.join(raw_dir, "*.hf")))
        if not hf_files:
            return None

        # glob already returns paths prefixed with raw_dir; joining raw_dir
        # again would duplicate the prefix for relative directories.
        return datasets.load_from_disk(hf_files[0])

    def read_manifest(self):
        """Load the dataset and return the range of sample indices to process.

        The loaded dataset is stored in ``self.dataset`` with its "audio"
        column cast to ``target_samplerate`` so that decoded samples are
        resampled before they are written out.
        """
        import datasets

        dataset = None
        # checking if dataset should be loaded from disk
        if self.already_downloaded:
            dataset = self._load_local_dataset()
            if dataset is None:
                logger.info("Dataset not found locally. Initiating download from Hugging Face.")

        if dataset is None:
            logger.info(f"Initiating download of dataset '{self.dataset_name}' from Hugging Face.")
            dataset = datasets.load_dataset(self.dataset_name, split=self.data_split)
            logger.info(f"Finished download of dataset '{self.dataset_name}' from Hugging Face.")

        # Without this cast the raw samples would be written with a
        # target_samplerate header while still holding data at the dataset's
        # native rate, producing wrong-speed audio and wrong durations.
        self.dataset = dataset.cast_column(
            "audio", datasets.Audio(sampling_rate=self.target_samplerate)
        )
        return range(len(self.dataset))

    def process_dataset_entry(self, data_id):
        """Write one sample's audio to disk and return its manifest entry.

        Args:
            data_id: index of the sample in ``self.dataset``.

        Returns:
            A single-element list holding the ``DataEntry`` for this sample.
        """
        sample_data = self.dataset[data_id]
        sample_audio = sample_data["audio"]["array"]
        wav_name = f"{data_id}.wav"
        audio_filepath = os.path.join(self.resampled_audio_dir, wav_name)
        sf.write(
            audio_filepath,
            sample_audio,
            self.target_samplerate,
        )
        # The audio column is cast to target_samplerate in read_manifest, so
        # the array length divided by that rate is the duration in seconds.
        duration = len(sample_audio) / self.target_samplerate
        text = sample_data["text"]

        return [
            DataEntry(
                data={
                    # NOTE(review): the manifest stores a path under a fixed
                    # "audios" folder rather than resampled_audio_dir; this
                    # only matches the written file when resampled_audio_dir
                    # ends in "audios" and the manifest sits next to it -
                    # confirm against the pipeline configs before changing.
                    "audio_filepath": os.path.join("audios", wav_name),
                    "duration": duration,
                    "text": text,
                }
            )
        ]