Nemotron Omni Audio Runner#

struct NemotronOmniAudioConfig#

Configuration for Nemotron-Omni Parakeet audio encoder.

Public Members

int32_t melBins = {0}#: Number of mel-frequency bins.

int32_t audioFeatureDim = {0}#: Audio feature dimension (LLM hidden size)

int32_t subsamplingFactor = {0}#: Parakeet subsampling factor.

int32_t samplingRate = {0}#: Audio sampling rate.

int32_t soundContextTokenId = {0}#: <so_embedding> token ID

int32_t vocabSize = {0}#: Vocabulary size (audio token ID offset)

bool trt_edgellm::rt::resizeEmbeddingForRows(int64_t rows)#

Runner for Nemotron-Omni Parakeet audio encoder.

The encoder is exported and built at fixed batch=1: the Conformer’s depthwise convolution is a local operator that ignores attention masks, so cross-clip batching with padding silently corrupts short-clip outputs at their right edge. preprocess() walks every audio clip in the request batch and invokes the encoder once per clip, writing the encoded rows directly into mAudioEmbedding at sequential offsets. infer() is a no-op.

class NemotronOmniAudioRunner : public MultimodalRunner { public:

Constructor for NemotronOmniAudioRunner.

~NemotronOmniAudioRunner() noexcept = default;

Preprocess multimodal input including audio and text.

Run inference on the audio encoder.

Validate and load configuration from JSON file.

Allocate buffers for inference.

Get audio embeddings from encoder output.

private:

Load pre-computed mel-spectrogram from file.

Encode all audio clips in the request batch into mAudioEmbedding. Iterates request.requests (and each request’s audioBuffers) in placeholder order, runs the bs=1 encoder once per clip, and packs the valid rows of every clip into a contiguous [totalRows, hidden] buffer. audioTokenLengths is filled in the same order so textPreprocess can replace each <so_embedding> placeholder with the right number of audio tokens.

bool encodeAllClips(rt::LLMGenerationRequest const& request, std::vector<int64_t>& audioTokenLengths, cudaStream_t stream);

Encode a single mel-spectrogram clip into mAudioEmbedding at the specified row offset. The number of encoded rows is reported through the encodedRowsOut output parameter.

bool encodeSingleClip(rt::Tensor const& mel, int64_t destRowOffset, int64_t& encodedRowsOut, cudaStream_t stream);

Resize mAudioEmbedding to [rows, hiddenDim], reallocating if the existing buffer cannot hold that many rows.

Parameters:

engineDir – [in] Directory containing the audio encoder engine
stream – [in] CUDA stream for execution
request – [in] LLM generation request containing audio and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache (unused by this model)
stream – [in] CUDA stream for execution
stream – [in] CUDA stream for execution
engineDir – [in] Path to engine directory
stream – [in] CUDA stream for execution
filePath – [in] Path to .npy or .raw file
format – [in] File format: “npy” or “raw”
melSpectrogram – [out] Output tensor [1, seq_len, mel_bins]
stream – [in] CUDA stream for execution

Throws:

std::runtime_error – if engine loading fails or configuration is invalid.

NemotronOmniAudioRunner(std::string const& engineDir, cudaStream_t stream);

Returns:

True if preprocessing succeeded, false otherwise.

bool preprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream, bool imageOnly = false) override;

Returns:

True if inference succeeded, false otherwise.

bool infer(cudaStream_t stream) override;

Returns:

True if configuration is valid and loaded successfully, false otherwise.

bool validateAndFillConfig(std::string const& engineDir) override;

Returns:

True if allocation succeeded, false otherwise.

bool allocateBuffer(cudaStream_t stream) override;

Returns:

Reference to audio embedding tensor.

rt::Tensor& getOutputEmbedding() override;

Returns:

True on success, false otherwise.

bool loadMelSpectrogramFromFile(std::string const& filePath, std::string const& format, rt::Tensor& melSpectrogram, cudaStream_t stream);

void trt_edgellm::rt::textPreprocess( rt::LLMGenerationRequest const &request, std::vector<std::vector<int32_t>> &batchInputIds, std::vector<int64_t> const &audioTokenLengths, tokenizer::Tokenizer const *tokenizer )#

Tokenize text and insert audio placeholder tokens.

Parameters:

request – [in] LLM generation request
batchInputIds – [out] Batched input IDs after tokenization and audio token insertion
audioTokenLengths – [in] Token lengths for each audio clip
tokenizer – [in] Tokenizer for text encoding