Nemotron Omni Audio Runner#
-
struct NemotronOmniAudioConfig#
Configuration for Nemotron-Omni Parakeet audio encoder.
Public Members
-
int32_t melBins = {0}#
Number of mel-frequency bins.
-
int32_t audioFeatureDim = {0}#
Audio feature dimension (LLM hidden size)
-
int32_t subsamplingFactor = {0}#
Parakeet subsampling factor.
-
int32_t samplingRate = {0}#
Audio sampling rate.
-
int32_t soundContextTokenId = {0}#
<so_embedding> token ID
-
int32_t vocabSize = {0}#
Vocabulary size (audio token ID offset)
-
int32_t melBins = {0}#
-
bool trt_edgellm::rt::resizeEmbeddingForRows(int64_t rows)#
Runner for Nemotron-Omni Parakeet audio encoder.
The encoder is exported and built at fixed batch=1: the Conformer’s depthwise convolution is a local operator that ignores attention masks, so cross-clip batching with padding silently corrupts short-clip outputs at their right edge.
preprocess() walks every audio clip in the request batch and invokes the encoder once per clip, writing the encoded rows directly into mAudioEmbedding at sequential offsets. infer() is a no-op. class NemotronOmniAudioRunner : public MultimodalRunner { public: Constructor for NemotronOmniAudioRunner
~NemotronOmniAudioRunner() noexcept = default;
Preprocess multimodal input including audio and text
Run inference on the audio encoder
Validate and load configuration from JSON file
Allocate buffers for inference
Get audio embeddings from encoder output
private: Load pre-computed mel-spectrogram from file
Encode all audio clips in the request batch into mAudioEmbedding. Iterates request.requests (and each request’s audioBuffers) in placeholder order, runs the bs=1 encoder once per clip, and packs the valid rows of every clip into a contiguous [totalRows, hidden] buffer. audioTokenLengths is filled in the same order so textPreprocess can replace each <so_embedding> placeholder with the right number of audio tokens. bool encodeAllClips(
rt::LLMGenerationRequest const& request, std::vector<int64_t>& audioTokenLengths, cudaStream_t stream);
Encode a single mel-spectrogram clip into mAudioEmbedding at the specified row offset. The number of encoded rows is reported through encodedRowsOut. bool encodeSingleClip(rt::Tensor const& mel, int64_t destRowOffset, int64_t& encodedRowsOut, cudaStream_t stream);
Resize mAudioEmbedding to [rows, hiddenDim], reallocating if the existing buffer cannot hold that many rows.
- Parameters:
engineDir – [in] Directory containing the audio encoder engine
stream – [in] CUDA stream for execution
request – [in] LLM generation request containing audio and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache (unused by this model)
stream – [in] CUDA stream for execution
stream – [in] CUDA stream for execution
engineDir – [in] Path to engine directory
stream – [in] CUDA stream for execution
filePath – [in] Path to .npy or .raw file
format – [in] File format: “npy” or “raw”
melSpectrogram – [out] Output tensor [1, seq_len, mel_bins]
stream – [in] CUDA stream for execution
- Throws:
std::runtime_error – if engine loading fails or configuration is invalid. NemotronOmniAudioRunner(std::string const& engineDir, cudaStream_t stream);
- Returns:
True if preprocessing succeeded, false otherwise. bool preprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream, bool imageOnly = false) override;
- Returns:
True if inference succeeded, false otherwise. bool infer(cudaStream_t stream) override;
- Returns:
True if configuration is valid and loaded successfully, false otherwise. bool validateAndFillConfig(std::string const& engineDir) override;
- Returns:
True if allocation succeeded, false otherwise. bool allocateBuffer(cudaStream_t stream) override;
- Returns:
Reference to audio embedding tensor. rt::Tensor& getOutputEmbedding() override;
- Returns:
True on success, false otherwise. bool loadMelSpectrogramFromFile(
std::string const& filePath, std::string const& format, rt::Tensor& melSpectrogram, cudaStream_t stream);
- void trt_edgellm::rt::textPreprocess(
- rt::LLMGenerationRequest const &request,
- std::vector<std::vector<int32_t>> &batchInputIds,
- std::vector<int64_t> const &audioTokenLengths,
- tokenizer::Tokenizer const *tokenizer
)
Tokenize text and insert audio placeholder tokens.
- Parameters:
request – [in] LLM generation request
batchInputIds – [out] Batched input IDs after tokenization and audio token insertion
audioTokenLengths – [in] Token lengths for each audio clip
tokenizer – [in] Tokenizer for text encoding