# Source code for sdp.processors.datasets.lhotse

# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

from sdp.processors.base_processor import BaseProcessor


class LhotseImport(BaseProcessor):
    """Processor to create an initial manifest imported from a Lhotse CutSet.

    The ``input_manifest_file`` is expected to point to a Lhotse CutSet manifest,
    which usually has ``cuts`` in its name and a ``.jsonl`` or ``.jsonl.gz``
    extension.

    Lhotse is a library for speech data processing and loading; see:

    * https://github.com/lhotse-speech/lhotse
    * https://lhotse.readthedocs.io

    It can be installed using ``pip install lhotse``.

    .. caution:: Currently we only support the importing of cut sets that
        represent single-channel, single-audio-file-per-utterance datasets.

    Returns:
        This processor generates an initial manifest file with one JSON object
        per line. Every entry has the following fields::

            {
                "audio_filepath": <source of the cut's only audio source>,
                "duration": <duration of the cut in seconds>,
                "lhotse_cut_id": <id of the imported cut>,
            }

        In addition, ``text``, ``speaker``, ``gender`` and ``language`` are
        copied from the cut's supervision whenever they are not ``None``, and
        all keys of the supervision's ``custom`` dictionary (if present) are
        merged into the entry.
    """

    def process(self):
        """Convert every cut of the input CutSet into one output manifest line.

        Reads ``self.input_manifest_file`` with ``CutSet.from_file`` and writes
        one JSON object per cut to ``self.output_manifest_file``.

        Raises:
            AssertionError: if a cut is rejected by :meth:`check_entry`.
        """
        # Imported lazily so that lhotse is only required when this processor runs.
        from lhotse import CutSet

        cuts = CutSet.from_file(self.input_manifest_file)
        # ``json.dumps`` escapes non-ASCII characters by default; the encoding is
        # pinned anyway so the written file never depends on the locale.
        with open(self.output_manifest_file, "w", encoding="utf-8") as f:
            for cut in cuts:
                self.check_entry(cut)
                # ``check_entry`` guarantees exactly one supervision per cut.
                supervision = cut.supervisions[0]
                data = {
                    "audio_filepath": cut.recording.sources[0].source,
                    "duration": cut.duration,
                    "lhotse_cut_id": cut.id,
                }
                for meta in ("text", "speaker", "gender", "language"):
                    if (item := getattr(supervision, meta)) is not None:
                        data[meta] = item
                # Custom keys are merged last, so they overwrite any
                # same-named field set above.
                if (custom := supervision.custom) is not None:
                    data.update(custom)
                f.write(json.dumps(data) + "\n")

    def check_entry(self, cut) -> None:
        """Verify that ``cut`` has the only layout this processor can import.

        The checks run in order, so later ones may rely on earlier ones
        (e.g. ``cut.recording`` is only inspected after ``cut.has_recording``).

        Args:
            cut: an item yielded by iterating over the imported CutSet.

        Raises:
            AssertionError: if the cut is not a ``MonoCut``, has no recording,
                has a recording with more than one channel or more than one
                audio source, has an audio source whose type is not
                ``"file"``, or does not have exactly one supervision.
        """
        from lhotse import MonoCut

        # Explicit raises instead of ``assert`` statements: asserts are removed
        # under ``python -O``, which would let unsupported cuts through silently.
        # ``AssertionError`` is kept so existing callers see the same exception.
        if not isinstance(cut, MonoCut):
            raise AssertionError(
                f"Currently, only MonoCut import is supported. Received: {cut}"
            )
        if not cut.has_recording:
            raise AssertionError(
                f"Currently, we only support cuts with recordings. Received: {cut}"
            )
        if cut.recording.num_channels != 1:
            raise AssertionError(
                f"Currently, we only supports recordings with a single channel. Received: {cut}"
            )
        if len(cut.recording.sources) != 1:
            raise AssertionError(
                f"Currently, we only support recordings with a single AudioSource. Received: {cut}"
            )
        if cut.recording.sources[0].type != "file":
            raise AssertionError(
                f"Currently, we only suppport AudioSources of type='file'. Received: {cut}"
            )
        if len(cut.supervisions) != 1:
            raise AssertionError(
                f"Currently, we only support cuts with a single supervision. Received: {cut}"
            )