# Source code for sdp.processors.datasets.lhotse

# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

from sdp.processors.base_processor import BaseProcessor



# [docs]  (link marker left over from the rendered documentation page; not part of the module)
class LhotseImport(BaseProcessor):
    """Processor to create an initial manifest imported from a Lhotse CutSet.
    The ``input_manifest_file`` is expected to point to a Lhotse CutSet manifest,
    which usually has ``cuts`` in its name and a ``.jsonl`` or ``.jsonl.gz`` extension.

    Lhotse is a library for speech data processing and loading; see:

    * https://github.com/lhotse-speech/lhotse
    * https://lhotse.readthedocs.io

    It can be installed using ``pip install lhotse``.

    .. caution:: Currently we only support the importing of cut sets that represent
        single-channel, single-audio-file-per-utterance datasets.

    Returns:
        This processor generates an initial manifest file with one JSON object
        per line. Every entry contains the following fields::

            {
                "audio_filepath": <source of the cut's only audio source>,
                "duration": <duration of the cut>,
                "lhotse_cut_id": <id of the originating Lhotse cut>,
            }

        The fields ``text``, ``speaker``, ``gender`` and ``language`` are copied
        from the cut's single supervision whenever they are not ``None``.
        If the supervision has a ``custom`` dictionary, its items are merged
        into the entry as top-level keys (on a name clash the custom value
        replaces the one written before it).
    """

    def process(self):
        """Read the input CutSet and write one manifest line per cut.

        Every cut is validated with :meth:`check_entry` before it is exported,
        so the first unsupported cut aborts the import.

        Raises:
            AssertionError: if a cut does not satisfy :meth:`check_entry`.
        """
        # Imported here rather than at module level, so lhotse is only needed
        # when the processor actually runs.
        from lhotse import CutSet

        cuts = CutSet.from_file(self.input_manifest_file)
        with open(self.output_manifest_file, "w") as f:
            for cut in cuts:
                self.check_entry(cut)
                # check_entry guarantees exactly one supervision and one source.
                supervision = cut.supervisions[0]
                data = {
                    "audio_filepath": cut.recording.sources[0].source,
                    "duration": cut.duration,
                    "lhotse_cut_id": cut.id,
                }
                for meta in ("text", "speaker", "gender", "language"):
                    if (item := getattr(supervision, meta)) is not None:
                        data[meta] = item
                if (custom := supervision.custom) is not None:
                    data.update(custom)
                print(json.dumps(data), file=f)

    def check_entry(self, cut) -> None:
        """Verify that ``cut`` has the only shape this processor can export.

        The checks run in order and stop at the first failure; later checks
        rely on earlier ones (e.g. the recording is only inspected once it is
        known to exist).

        Args:
            cut: an item of the imported Lhotse CutSet.

        Raises:
            AssertionError: if ``cut`` is not a ``MonoCut``, has no recording,
                has a recording with more than one channel or more than one
                audio source, has an audio source whose type is not ``"file"``,
                or does not have exactly one supervision.
        """
        from lhotse import MonoCut

        # Explicit raises instead of ``assert`` statements: asserts are removed
        # when Python runs with -O, which would let unsupported cuts through.
        # AssertionError is kept so existing callers observe the same exception.
        if not isinstance(cut, MonoCut):
            raise AssertionError(
                f"Currently, only MonoCut import is supported. Received: {cut}"
            )
        if not cut.has_recording:
            raise AssertionError(
                f"Currently, we only support cuts with recordings. Received: {cut}"
            )
        if not cut.recording.num_channels == 1:
            raise AssertionError(
                f"Currently, we only supports recordings with a single channel. Received: {cut}"
            )
        if not len(cut.recording.sources) == 1:
            raise AssertionError(
                f"Currently, we only support recordings with a single AudioSource. Received: {cut}"
            )
        if not cut.recording.sources[0].type == "file":
            raise AssertionError(
                f"Currently, we only suppport AudioSources of type='file'. Received: {cut}"
            )
        if not len(cut.supervisions) == 1:
            raise AssertionError(
                f"Currently, we only support cuts with a single supervision. Received: {cut}"
            )