# Source code for sdp.processors.datasets.fleurs.create_initial_manifest

# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fnmatch
import glob
import json
import os
import shutil
import typing
from urllib.parse import parse_qs, urlparse

from sdp.processors.base_processor import BaseProcessor, DataEntry
from sdp.utils.common import download_file, extract_archive


def get_fleurs_url_list(lang: str, split: str) -> list[str]:
    """
    Build the download URLs for one language/split pair of the FLEURS dataset.

    Args:
        lang (str): Language folder name used in the dataset URL, e.g. ``"hy_am"``.
        split (str): Split name used as the file stem, e.g. ``"dev"``.

    Returns:
        list[str]: Two URLs, in this order: the transcript ``<split>.tsv`` file
        and the ``audio/<split>.tar.gz`` archive.
    """
    # examples
    # "https://huggingface.co/datasets/google/fleurs/resolve/main/data/hy_am/audio/dev.tar.gz",
    # "https://huggingface.co/datasets/google/fleurs/resolve/main/data/hy_am/dev.tsv"

    base_url = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"

    # URLs always use "/" as the separator; os.path.join would insert the
    # platform separator (a backslash on Windows) and produce an invalid URL.
    base_lang_url = f"{base_url}/{lang}"

    return [
        f"{base_lang_url}/{split}.tsv",
        f"{base_lang_url}/audio/{split}.tar.gz",
    ]



# [docs]
class CreateInitialManifestFleurs(BaseProcessor):
    """
    Processor to create initial manifest for the FLEURS dataset.

    Dataset link: https://huggingface.co/datasets/google/fleurs

    Will download all files, extract them, and create a manifest file with the
    "audio_filepath" and "text" fields.

    Args:
        lang (str): Language to be processed, identified by a combination of ISO 639-1 and ISO 3166-1 alpha-2 codes.
            Examples are:

            - ``"hy_am"`` for Armenian
            - ``"ko_kr"`` for Korean

        split (str): Which dataset split to process.
            Options are:

            - ``"test"``
            - ``"train"``
            - ``"dev"``

        raw_data_dir (str): Path to the folder where the data archive should be downloaded and extracted.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "text": <transcription>,
            }
    """

    def __init__(
        self,
        lang: str,
        split: str,
        raw_data_dir: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.lang = lang
        self.split = split
        self.raw_data_dir = raw_data_dir

    def process_transcript(self, file_path: str) -> list[dict[str, typing.Any]]:
        """
        Parse a transcript TSV file into manifest entries.

        Each line is split on tabs; the second column is used as the audio file
        name and the third column as the transcription text. Audio files are
        resolved relative to the directory that contains the TSV file.
        Lines with fewer than three columns are skipped.

        Args:
            file_path (str): Path to the TSV transcript file.

        Returns:
            list[dict[str, typing.Any]]: One dict per parsed line with the keys
            ``"audio_filepath"`` (absolute path) and ``"text"``.
        """

        entries = []
        root = os.path.dirname(file_path)

        with open(file_path, encoding="utf-8") as fin:
            for line in fin:
                # Split the line into its columns using the tab delimiter
                parts = line.strip().split('\t')
                # Columns 1 and 2 are read below, so at least 3 columns are
                # required; shorter lines would otherwise raise IndexError.
                if len(parts) < 3:
                    continue

                file_name, transcript_text = parts[1], parts[2]
                wav_file = os.path.join(root, file_name)

                entry = {"audio_filepath": os.path.abspath(wav_file), "text": transcript_text}
                entries.append(entry)

        return entries

    def process_data(self, data_folder: str, manifest_file: str) -> None:
        """
        Read ``<data_folder>/<split>/<split>.tsv`` and write the manifest.

        Args:
            data_folder (str): Folder that contains the ``<split>`` sub-folder.
            manifest_file (str): Output path; one JSON object is written per line.
        """
        # Pass path components separately instead of hard-coding "/" so the
        # platform separator is used consistently.
        tsv_path = os.path.join(data_folder, self.split, f"{self.split}.tsv")
        entries = self.process_transcript(tsv_path)

        with open(manifest_file, "w", encoding="utf-8") as fout:
            for m in entries:
                fout.write(json.dumps(m, ensure_ascii=False) + "\n")

    def download_extract_files(self, dst_folder: str) -> None:
        """
        Download the TSV and audio archive, extract the archive, and move the TSV
        into the ``<dst_folder>/<split>`` folder.

        Args:
            dst_folder (str): Folder to download into; created if missing.
        """

        os.makedirs(dst_folder, exist_ok=True)

        # downloading all files
        for file_url in get_fleurs_url_list(self.lang, self.split):
            download_file(file_url, str(dst_folder))

        extract_archive(f'{dst_folder}/{self.split}.tar.gz', str(dst_folder), force_extract=True)

        # Organizing files into their respective folders
        target_folder = os.path.join(dst_folder, self.split)
        # Do not rely on the archive layout to have created the split folder;
        # shutil.move fails if the destination directory does not exist.
        os.makedirs(target_folder, exist_ok=True)

        file_name = f"{self.split}.tsv"

        file_path = os.path.join(dst_folder, file_name)
        dest_file_path = os.path.join(target_folder, file_name)

        if not os.path.exists(dest_file_path):
            shutil.move(file_path, dest_file_path)
            print(f'Moved {file_path} to {dest_file_path}')
        else:
            # A copy from a previous run is already in place; drop the fresh download.
            os.remove(file_path)
            print(f'File {file_name} already exists in {target_folder}, deleted from source.')

    def process(self):
        """Download and extract the data into ``raw_data_dir``, then write the output manifest."""
        self.download_extract_files(self.raw_data_dir)
        self.process_data(self.raw_data_dir, self.output_manifest_file)