Source code for sdp.processors.datasets.slr83.create_initial_manifest

# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import sox
from tqdm import tqdm

from sdp.logging import logger
from sdp.processors.base_processor import (
    BaseParallelProcessor,
    BaseProcessor,
    DataEntry,
)
from sdp.utils.common import download_file, extract_archive

DATASET_URL = "https://www.openslr.org/resources/83/{dialect}.zip"

AVAILABLE_DIALECTS = [
    'irish_english_male',
    'midlands_english_female',
    'midlands_english_male',
    'northern_english_female',
    'northern_english_male',
    'scottish_english_female',
    'scottish_english_male',
    'southern_english_female',
    'southern_english_male',
    'welsh_english_female',
    'welsh_english_male',
]

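# Reference (number of utterances, total duration in seconds) for every
# (dialect, split) pair; ``CustomDataSplitSLR83`` below checks its generated
# splits against these values so the reference split is reproduced exactly.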
EXPECTED_SPLIT_STATS = {
    ('irish_english_male', 'test'): (102, 604.757),
    ('irish_english_male', 'train'): (293, 1656.917),
    ('irish_english_male', 'dev'): (53, 302.763),
    ('midlands_english_female', 'test'): (90, 608.341),
    ('midlands_english_female', 'train'): (94, 636.843),
    ('midlands_english_female', 'dev'): (45, 306.261),
    ('midlands_english_male', 'test'): (106, 604.672),
    ('midlands_english_male', 'train'): (270, 1568.683),
    ('midlands_english_male', 'dev'): (52, 301.227),
    ('northern_english_female', 'test'): (267, 1803.435),
    ('northern_english_female', 'train'): (330, 2146.816),
    ('northern_english_female', 'dev'): (145, 906.496),
    ('northern_english_male', 'test'): (587, 3607.467),
    ('northern_english_male', 'train'): (1126, 7003.136),
    ('northern_english_male', 'dev'): (298, 1807.957),
    ('scottish_english_female', 'test'): (284, 1801.301),
    ('scottish_english_female', 'train'): (426, 2681.344),
    ('scottish_english_female', 'dev'): (142, 906.24),
    ('scottish_english_male', 'test'): (612, 3603.883),
    ('scottish_english_male', 'train'): (663, 3994.027),
    ('scottish_english_male', 'dev'): (306, 1800.96),
    ('southern_english_female', 'test'): (572, 3600.128),
    ('southern_english_female', 'train'): (3124, 19213.312),
    ('southern_english_female', 'dev'): (293, 1804.8),
    ('southern_english_male', 'test'): (582, 3600.555),
    ('southern_english_male', 'train'): (3295, 20210.773),
    ('southern_english_male', 'dev'): (296, 1807.445),
    ('welsh_english_female', 'test'): (239, 1805.739),
    ('welsh_english_female', 'train'): (774, 5621.675),
    ('welsh_english_female', 'dev'): (125, 905.387),
    ('welsh_english_male', 'test'): (557, 3605.931),
    ('welsh_english_male', 'train'): (726, 4660.651),
    ('welsh_english_male', 'dev'): (286, 1805.909),
}


class CreateInitialManifestSLR83(BaseParallelProcessor):
    """Processor to create initial manifest for the SLR83 dataset.

    This is a dataset introduced in `Open-source Multi-speaker Corpora of
    the English Accents in the British Isles <https://aclanthology.org/2020.lrec-1.804/>`_.

    Args:
        raw_data_dir (str): where to put raw downloaded data.
        dialect (str): should be one of the

            * ``irish_english_male``
            * ``midlands_english_female``
            * ``midlands_english_male``
            * ``northern_english_female``
            * ``northern_english_male``
            * ``scottish_english_female``
            * ``scottish_english_male``
            * ``southern_english_female``
            * ``southern_english_male``
            * ``welsh_english_female``
            * ``welsh_english_male``

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription>,
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        dialect: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.dialect = dialect
        if dialect not in AVAILABLE_DIALECTS:
            raise ValueError(f"dialect has to be one of {AVAILABLE_DIALECTS}")
        # set in prepare(); initialized here so read_manifest can detect a missing prepare() call
        self.transcription_file = None

    def prepare(self):
        """Downloading and extracting data (unless already done)."""
        os.makedirs(self.raw_data_dir, exist_ok=True)

        url = DATASET_URL.format(dialect=self.dialect)

        if not (self.raw_data_dir / f"{self.dialect}.zip").exists():
            download_file(url, str(self.raw_data_dir))

        extract_archive(str(self.raw_data_dir / os.path.basename(url)), str(self.raw_data_dir))

        self.transcription_file = str(self.raw_data_dir / "line_index.csv")

    def read_manifest(self):
        if self.transcription_file is None:
            raise RuntimeError("self.process has to be called before processing the data.")

        with open(self.transcription_file, "rt", encoding="utf8") as fin:
            dataset_entries = fin.readlines()

        return dataset_entries

    def process_dataset_entry(self, data_entry: str):
        split_entry = data_entry.split(", ")
        if len(split_entry) != 3:
            raise RuntimeError(f"Input data is badly formatted! Bad line: {data_entry}")

        _, utt_id, transcript_text = split_entry
        audio_path = str(self.raw_data_dir / (utt_id + ".wav"))
        data = {
            "audio_filepath": audio_path,
            "duration": float(sox.file_info.duration(audio_path)),
            "text": transcript_text.strip(),
        }

        return [DataEntry(data=data)]
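

# NOTE: usage sketch, not part of the original module. In SDP this processor is
# typically driven by a YAML pipeline config, but it can also be run directly
# from Python, assuming the base class accepts an ``output_manifest_file``
# keyword (the paths below are hypothetical):
#
#     processor = CreateInitialManifestSLR83(
#         raw_data_dir="/data/slr83",
#         dialect="irish_english_male",
#         output_manifest_file="/data/slr83/manifest_initial.json",
#     )
#     processor.process()  # downloads, extracts, and writes the initial manifest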


class CustomDataSplitSLR83(BaseProcessor):
    """Splits SLR83 data into train, dev or test subset.

    The original paper does not provide train/dev/test splits, so we include
    a custom processing that can be used as a standardized split to compare
    results. For more details on this data split see `Damage Control During
    Domain Adaptation for Transducer Based Automatic Speech Recognition
    <https://arxiv.org/abs/2210.03255>`_.

    .. note::
        All data dropping has to be done before the split. We will check the
        total number of files to be what is expected in the reference split.
        But if you add any custom pre-processing that changes duration or
        number of files, your splits will likely be different.

    Args:
        dialect (str): same as in the :class:`sdp.processors.CreateInitialManifestSLR83`.
        data_split (str): "train", "dev" or "test".

    Returns:
        All the same fields as in the input manifest, but only a subset of
        the data is retained.
    """

    def __init__(self, dialect, data_split, **kwargs):
        super().__init__(**kwargs)
        self.dialect = dialect
        self.data_split = data_split

    def process(self):
        with open(self.input_manifest_file, "rt", encoding="utf8") as fin:
            manifest_data = [json.loads(line) for line in fin.readlines()]

        # sorting and fixing random seed for reproducibility
        manifest_data = sorted(manifest_data, key=lambda x: x['audio_filepath'])
        sample_idxs = list(range(len(manifest_data)))
        rng = np.random.RandomState(0)
        rng.shuffle(sample_idxs)

        duration = sum([x['duration'] for x in manifest_data])
        validation_duration, test_duration = 1800, 3600  # 30 minutes, 1 hour
        if duration <= 3600:  # 1 hour
            validation_duration, test_duration = 300, 600  # 5 minutes, 10 minutes
        elif duration > 3600 and duration <= 9000:  # 2.5 hours
            validation_duration, test_duration = 900, 1800  # 15 minutes, 30 minutes

        split_data = {}
        split_data['dev'] = self._accumulate_samples(manifest_data, sample_idxs, validation_duration)
        split_data['test'] = self._accumulate_samples(manifest_data, sample_idxs, test_duration)
        split_data['train'] = (
            [manifest_data[x] for x in sample_idxs],
            sum([manifest_data[x]['duration'] for x in sample_idxs]),
        )

        for split in ['train', 'dev', 'test']:
            actual_stats = (len(split_data[split][0]), round(split_data[split][1], 3))
            if EXPECTED_SPLIT_STATS[(self.dialect, split)] != actual_stats:
                raise RuntimeError(
                    f"Generated split stats (num files, duration) = {actual_stats}. "
                    f"But expected to see {EXPECTED_SPLIT_STATS[(self.dialect, split)]}. "
                    f"Did you add some custom pre-processing that changes number of files or duration?"
                )

        number_of_entries = 0
        total_duration = 0
        os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True)
        with open(self.output_manifest_file, "wt", encoding="utf8") as fout:
            for data_entry in tqdm(split_data[self.data_split][0]):
                json.dump(data_entry, fout, ensure_ascii=False)
                number_of_entries += 1
                total_duration += data_entry["duration"]
                fout.write("\n")

        logger.info("Total number of entries after processing: %d", number_of_entries)
        logger.info("Total audio duration (hours) after processing: %.2f", total_duration / 3600)

    def _accumulate_samples(
        self, manifest_data: List[dict], sample_idxs: List[int], duration_threshold: int
    ) -> Tuple[List[dict], float]:
        """Create a subset of the manifest data having duration less than duration_threshold.

        Args:
            manifest_data: data for the manifest file
            sample_idxs: list of available indices to pick a sample from the manifest data
            duration_threshold: maximum duration of the samples to be included in the subset

        Returns:
            tuple: The accumulated subset of the manifest data and total accumulated duration
        """
        accumulated_data = []
        accumulated_duration = 0
        while accumulated_duration <= duration_threshold:
            sample_idx = sample_idxs.pop(0)
            accumulated_data.append(manifest_data[sample_idx])
            accumulated_duration += manifest_data[sample_idx]['duration']

        return accumulated_data, accumulated_duration
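

# NOTE: usage sketch, not part of the original module. A minimal second step that
# carves the reference "train" subset out of the initial manifest produced above,
# assuming ``input_manifest_file`` / ``output_manifest_file`` keywords are handled
# by the base class (paths are hypothetical):
#
#     splitter = CustomDataSplitSLR83(
#         dialect="irish_english_male",
#         data_split="train",
#         input_manifest_file="/data/slr83/manifest_initial.json",
#         output_manifest_file="/data/slr83/manifest_train.json",
#     )
#     splitter.process()  # verifies the result against EXPECTED_SPLIT_STATS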