# Source code for sdp.processors.datasets.uzbekvoice.create_initial_manifest
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import json
import os
import typing
import gdown
from sdp.processors.base_processor import BaseProcessor
from sdp.utils.common import extract_archive
from sdp.logging import logger
# [docs]
class CreateInitialManifestUzbekvoice(BaseProcessor):
    """
    Processor to create initial manifest for the Uzbekvoice dataset.

    Dataset link: https://uzbekvoice.ai/en-US

    Will download all files, extract them, and create a manifest file with the
    "audio_filepath", "text" and "duration" fields.

    Args:
        raw_data_dir (str): Path to the folder where the data archive should be
            downloaded and extracted.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "text": <transcription>,
                "duration": <value of "clip_duration" from the dataset metadata>,
            }
    """

    # Google Drive folder that holds the dataset archives.
    URL = "https://drive.google.com/drive/folders/18N5i7GD0LmUnNQok6BP3EC8PYov7pZDW"

    def __init__(
        self,
        raw_data_dir: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = raw_data_dir

    def download_extract_files(self, dst_folder: str) -> None:
        """Download the dataset archives into ``dst_folder`` and extract them.

        The download is skipped when both ``clips.zip`` and
        ``uzbekvoice-dataset.zip`` are already present. A download failure is
        reported but not re-raised, so archives that were placed in the folder
        manually still get extracted.

        Args:
            dst_folder (str): Folder used both as download target and as
                extraction destination. Created if it does not exist.
        """
        os.makedirs(dst_folder, exist_ok=True)

        # For big files Google Drive doesn't allow trying to download them more
        # than once, so in case of a gdown error they have to be downloaded manually.
        # Check whether clips.zip and uzbekvoice-dataset.zip are already in dst_folder.
        clips_archive = os.path.join(dst_folder, 'clips.zip')
        dataset_archive = os.path.join(dst_folder, 'uzbekvoice-dataset.zip')
        if os.path.exists(clips_archive) and os.path.exists(dataset_archive):
            print("Files already exist in the folder. Skipping download.")
        else:
            print(f"Downloading files from {self.URL}...")
            try:
                gdown.download_folder(self.URL, output=dst_folder)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on with
                # whatever archives are available in dst_folder.
                print("Error occured while downloading files from google drive. Please download them manually.")
                print("URL: ", self.URL)
                print("Error: ", e)

        for file in glob.glob(os.path.join(dst_folder, '*.zip')):
            extract_archive(file, str(dst_folder), force_extract=True)
            print(f"Extracted {file}")

    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
        """
        Parse transcript JSON file and put it inside manifest.

        Args:
            file_path (str): Path to the JSON file with the dataset metadata.
                Each item must provide the "client_id", "original_sentence_id",
                "original_sentence" and "clip_duration" keys.

        Returns:
            list[dict[str, typing.Any]]: One manifest entry per item, with the
            "audio_filepath", "text" and "duration" fields.
        """
        entries = []
        root = os.path.join(self.raw_data_dir, 'clips')
        total_duration = 0

        # Parse the JSON file and collect audio file path, transcript and length.
        with open(file_path, encoding="utf-8") as fin:
            data = json.load(fin)

        for entry in data:
            audio_file = os.path.join(root, entry["client_id"], entry["original_sentence_id"] + '.mp3')
            utter_length = entry["clip_duration"]
            # Accumulate the duration for the summary log below. Non-numeric
            # values are skipped so that statistics never break manifest creation.
            # NOTE(review): the hours figure assumes "clip_duration" is in
            # seconds - confirm against the dataset.
            if isinstance(utter_length, (int, float)):
                total_duration += utter_length
            entries.append(
                {
                    "audio_filepath": os.path.abspath(audio_file),
                    "text": entry["original_sentence"],
                    "duration": utter_length,
                }
            )

        logger.info("Total number of entries after processing: %d", len(entries))
        logger.info("Total audio duration (hours) after processing: %.2f", total_duration / 3600)
        return entries

    def process_data(self, data_folder: str, manifest_file: str) -> None:
        """Build the manifest entries and write them as JSON lines.

        Args:
            data_folder (str): Folder containing the extracted
                ``uzbekvoice-dataset/voice_dataset.json`` file.
            manifest_file (str): Path of the manifest file to write
                (one JSON object per line, UTF-8, non-ASCII kept as is).
        """
        entries = self.process_transcript(os.path.join(data_folder, "uzbekvoice-dataset", "voice_dataset.json"))
        with open(manifest_file, "w", encoding="utf-8") as fout:
            for m in entries:
                fout.write(json.dumps(m, ensure_ascii=False) + "\n")

    def process(self):
        """Download and extract the data, then write the initial manifest."""
        self.download_extract_files(self.raw_data_dir)
        self.process_data(self.raw_data_dir, self.output_manifest_file)