Source code for sdp.processors.inference.asr.transformers.speech_recognition

# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

from tqdm import tqdm

from sdp.logging import logger
from sdp.processors.base_processor import BaseProcessor
from sdp.utils.common import load_manifest
from typing import Optional

class ASRTransformers(BaseProcessor):
    """This processor transcribes audio files using HuggingFace ASR Transformer models.

    It processes audio files from the manifest and adds transcriptions using the
    specified pre-trained model from HuggingFace.

    Args:
        pretrained_model (str): Name of pretrained model on HuggingFace.
        output_text_key (str): Key to save transcription result in the manifest.
        input_audio_key (str): Key to read audio file paths from the manifest. Default: "audio_filepath".
        input_duration_key (str): Key for audio duration in the manifest. Default: "duration".
        device (str): Inference device (e.g., "cuda", "cpu"). Default: None.
        batch_size (int): Inference batch size. Default: 1.
        chunk_length_s (int): Length of audio chunks in seconds. Default: 0.
        torch_dtype (str): Tensor data type for model inference. Default: "float32".
        generate_task (str): Task type for generation. Default: "transcribe".
        generate_language (str): Language for generation. Default: "english".
        max_new_tokens (int, Optional): Maximum number of new tokens to generate. Default: None.

    Returns:
        A manifest with transcribed text added to each entry under the specified output_text_key.
    """

    def __init__(
        self,
        pretrained_model: str,
        output_text_key: str,
        input_audio_key: str = "audio_filepath",
        input_duration_key: str = "duration",
        device: Optional[str] = None,
        batch_size: int = 1,
        chunk_length_s: int = 0,
        torch_dtype: str = "float32",
        generate_task: str = "transcribe",
        generate_language: str = "english",
        max_new_tokens: Optional[int] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        try:
            import torch
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
        except ImportError:
            raise ImportError("Need to install transformers: pip install accelerate transformers")

        logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.")
        self.pretrained_model = pretrained_model
        self.input_audio_key = input_audio_key
        self.output_text_key = output_text_key
        self.input_duration_key = input_duration_key
        self.device = device
        self.batch_size = batch_size
        self.chunk_length_s = chunk_length_s
        self.generate_task = generate_task
        self.generate_language = generate_language
        self.max_new_tokens = max_new_tokens

        if torch_dtype == "float32":
            self.torch_dtype = torch.float32
        elif torch_dtype == "float16":
            self.torch_dtype = torch.float16
        else:
            raise NotImplementedError(torch_dtype + " is not implemented!")

        if self.device is None:
            if torch.cuda.is_available():
                self.device = "cuda:0"
            else:
                self.device = "cpu"

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        self.model.to(self.device)

        # Check if using a Whisper/Seamless or an NVIDIA model based on the model name
        self.is_whisper_or_seamless = any(x in self.pretrained_model.lower() for x in ['whisper', 'seamless'])

        # Only set the language in the generation config for Whisper/Seamless models
        if self.is_whisper_or_seamless and self.generate_language:
            self.model.generation_config.language = self.generate_language

        processor = AutoProcessor.from_pretrained(self.pretrained_model)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=self.max_new_tokens,
            chunk_length_s=self.chunk_length_s,
            batch_size=self.batch_size,
            return_timestamps=self.is_whisper_or_seamless,  # only request timestamps for Whisper/Seamless models
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

    def process(self):
        json_list = load_manifest(Path(self.input_manifest_file))
        # Sort by duration (longest first) so that entries in a batch have similar lengths
        json_list_sorted = sorted(json_list, key=lambda d: d[self.input_duration_key], reverse=True)

        Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True)

        with Path(self.output_manifest_file).open("w") as f:
            # Step through the manifest in batches; stepping by batch_size keeps the
            # final partial batch when the manifest size is not a multiple of batch_size.
            for start_index in tqdm(range(0, len(json_list_sorted), self.batch_size)):
                batch = json_list_sorted[start_index : start_index + self.batch_size]
                audio_files = [item[self.input_audio_key] for item in batch]

                # Only pass generate_kwargs for Whisper/Seamless models
                if self.is_whisper_or_seamless and self.generate_language and self.generate_task:
                    results = self.pipe(
                        audio_files, generate_kwargs={"language": self.generate_language, "task": self.generate_task}
                    )
                else:
                    results = self.pipe(audio_files)

                for i, item in enumerate(batch):
                    item[self.output_text_key] = results[i]["text"]
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")