Nemotron Omni Audio Runner

struct NemotronOmniAudioConfig

Configuration for Nemotron-Omni Parakeet audio encoder.

Public Members

int32_t melBins = {0}: Number of mel-frequency bins.

int32_t audioFeatureDim = {0}: Audio feature dimension (LLM hidden size).

int32_t subsamplingFactor = {0}: Parakeet subsampling factor.

int32_t soundContextTokenId = {0}: <so_embedding> token ID.

bool trt_edgellm::rt::produceClipMel(ClipPlan &plan, rt::Tensor &melSpec, cudaStream_t stream)

Runner for Nemotron-Omni Parakeet audio encoder.

The encoder is exported and built at fixed batch=1: the Conformer’s depthwise convolution is a local operator that ignores attention masks, so cross-clip batching with padding silently corrupts short-clip outputs at their right edge. preprocess() walks every audio clip in the request batch and invokes the encoder once per clip, writing the encoded rows directly into mAudioEmbedding at sequential offsets. infer() is a no-op.

class NemotronOmniAudioRunner : public MultimodalRunner

{
public:

Constructor for NemotronOmniAudioRunner.

~NemotronOmniAudioRunner() noexcept = default;

Preprocess multimodal input including audio and text.

Run inference on the audio encoder.

Validate and load configuration from JSON file.

Allocate buffers for inference.

Get audio embeddings from encoder output.

private:

Encode all audio clips in the request batch into mAudioEmbedding.

Iterates request.requests (and each request’s audioBuffers) in placeholder order, runs the bs=1 encoder once per clip, and packs the valid rows of every clip into a contiguous [totalRows, hidden] buffer. audioTokenLengths is filled in the same order so textPreprocess can replace each <so_embedding> placeholder with the right number of audio tokens.

bool encodeAllClips(rt::LLMGenerationRequest const& request, std::vector<int64_t>& audioTokenLengths, cudaStream_t stream);

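Encode a single mel-spectrogram clip into mAudioEmbedding at the specified row offset. The number of encoded rows is reported through the encodedRowsOut output parameter; the bool return value presumably indicates success (the declaration returns bool, not a row count).

bool encodeSingleClip(rt::Tensor const& mel, int64_t destRowOffset, int64_t& encodedRowsOut, cudaStream_t stream);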

Resize mAudioEmbedding to [rows, hiddenDim], reallocating if the existing buffer cannot hold that many rows.

bool resizeEmbeddingForRows(int64_t rows);

Tokenize text and insert audio placeholder tokens.

Build the online GPU fbank resources (mel filter / window / FFT twiddle + scalar params) from the bound CPU MelExtractor’s config, once that config has been validated as parakeet-spec. Best-effort: returns false (→ CPU fallback) when the extractor isn’t parakeet-spec, the CuTe DSL GEMM is unavailable for this device/build, or a resource upload fails.

bool initFbankResources(cudaStream_t stream);

Whether the online GPU fbank can run for this clip: resources ready, engine mel width agrees, and the sample rate matches what the kernels assume. Checked in pass 1 of encodeAllClips before committing a clip to the GPU path.

bool gpuFbankViable(rt::audio::AudioPCM const& pcm) const;

Pass 2 GPU fbank for one clip: upload PCM and run fbankParakeet into a fresh [1, numFrames, mel_bins] FP16 melSpec. Returns false on upload / kernel failure so the caller can fall back to the CPU extractor (which yields the same frame count, keeping pass-1 sizing valid).

bool runGpuFbankClip(rt::audio::AudioPCM const& pcm, int64_t const numFrames, rt::Tensor& melSpec, cudaStream_t stream);

Per-clip mel plan: built in pass 1 of encodeAllClips (frame count + chosen path), consumed in pass 2 by produceClipMel.

struct ClipPlan
{
    rt::audio::AudioPCM const* pcm{nullptr}; //!< non-null → run GPU fbank in pass 2
    rt::Tensor hostMel{};                    //!< valid → CPU mel already extracted (upload in pass 2)
    int64_t numFrames{0};                    //!< T (shapes the GPU melSpec view)
};

Pass 2: produce one clip’s [1, T, mel_bins] FP16 GPU mel from its plan — GPU fbank when planned (falling back to a CPU re-extract on runtime failure), otherwise upload the pass-1 CPU mel. The fallback yields the same floor(N/hop) frame count, keeping pass-1 sizing valid.

Parameters:

engineDir – [in] Directory containing the audio encoder engine
stream – [in] CUDA stream for execution
request – [in] LLM generation request containing audio and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
mropeCosSinOut – [inout] MRoPE cos/sin output tensor (unused by this model)
stream – [in] CUDA stream for execution
stream – [in] CUDA stream for execution
engineDir – [in] Path to engine directory
stream – [in] CUDA stream for execution
request – [in] LLM generation request
batchInputIds – [out] Batched input IDs after tokenization and audio token insertion
audioTokenLengths – [in] Token lengths for each audio clip
tokenizer – [in] Tokenizer for text encoding

void textPreprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchInputIds, std::vector<int64_t> const& audioTokenLengths, tokenizer::Tokenizer const* tokenizer);

Throws:

std::runtime_error – if engine loading fails or configuration is invalid

NemotronOmniAudioRunner(std::string const& engineDir, cudaStream_t stream);

Returns:

True if preprocessing succeeded, false otherwise

bool preprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchedInputIds, tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::OptionalOutputTensor mropeCosSinOut, cudaStream_t stream, bool imageOnly = false) override;

Returns:

True if inference succeeded, false otherwise

bool infer(cudaStream_t stream) override;

Returns:

True if configuration is valid and loaded successfully, false otherwise

bool validateAndFillConfig(std::string const& engineDir) override;

Returns:

True if allocation succeeded, false otherwise

bool allocateBuffer(cudaStream_t stream) override;

Returns:

Reference to audio embedding tensor

rt::Tensor& getOutputEmbedding() override;