Nemotron Omni ViT Runner#

class NemotronOmniViTRunner : public trt_edgellm::rt::MultimodalRunner#

Runner for Nemotron-Omni RADIO vision encoder.

Handles preprocessing and inference for the RADIO ViT encoder used in Nemotron-Omni multimodal models. Images are tiled to an aspect-ratio-matched grid within the per-image tile budget, with a thumbnail appended when the grid produces more than one tile.
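
The grid selection described above can be sketched as follows. This is a hedged illustration, not the runner's internal code: the function names (`selectTileGrid`, `totalTiles`) and the exact selection criterion (minimizing aspect-ratio mismatch under the tile budget) are assumptions; only the thumbnail rule ("appended when the grid produces more than one tile") is stated in the documentation.

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <utility>

// Hypothetical sketch of aspect-ratio-matched tiling: pick the (rows, cols)
// grid whose aspect ratio is closest to the input image's, subject to the
// per-image tile budget. The real logic is internal to NemotronOmniViTRunner.
std::pair<int64_t, int64_t> selectTileGrid(
    int64_t imageH, int64_t imageW, int64_t maxTiles)
{
    double const targetRatio = static_cast<double>(imageW) / static_cast<double>(imageH);
    std::pair<int64_t, int64_t> best{1, 1};
    double bestDiff = std::abs(targetRatio - 1.0);
    for (int64_t rows = 1; rows <= maxTiles; ++rows)
    {
        for (int64_t cols = 1; rows * cols <= maxTiles; ++cols)
        {
            double const diff
                = std::abs(targetRatio - static_cast<double>(cols) / static_cast<double>(rows));
            if (diff < bestDiff)
            {
                bestDiff = diff;
                best = {rows, cols};
            }
        }
    }
    return best;
}

// Per the description above, a thumbnail tile is appended whenever the grid
// yields more than one tile.
int64_t totalTiles(std::pair<int64_t, int64_t> grid)
{
    int64_t const n = grid.first * grid.second;
    return n > 1 ? n + 1 : n;
}
```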

Public Functions

NemotronOmniViTRunner(
std::string const &engineDir,
cudaStream_t stream
)#

Constructor for NemotronOmniViTRunner.

Parameters:
  • engineDir[in] Directory containing the TensorRT engine files

  • stream[in] CUDA stream for execution

Throws:
  • std::runtime_error – if engineDir does not contain valid engine files

  • std::runtime_error – if buffer allocation fails

  • std::runtime_error – if a CUDA error occurs

~NemotronOmniViTRunner() noexcept = default#

virtual bool preprocess(
rt::LLMGenerationRequest const &request,
std::vector<std::vector<int32_t>> &batchedInputIds,
tokenizer::Tokenizer const *tokenizer,
rt::Tensor &ropeRotaryCosSinDevice,
cudaStream_t stream,
bool imageOnly = false
) noexcept override#

Preprocess multimodal input including images and text.

Parameters:
  • request[in] LLM generation request containing images and text

  • batchedInputIds[inout] Batched input token IDs after preprocessing

  • tokenizer[in] Tokenizer for text processing

  • ropeRotaryCosSinDevice[inout] RoPE rotary position encoding cache (unused by this model)

  • stream[in] CUDA stream for execution

  • imageOnly[in] If true, preprocess only the image inputs (default: false)

Returns:

True if preprocessing succeeded, false otherwise

virtual bool infer(cudaStream_t stream) noexcept override#

Run inference on the vision encoder.

Parameters:

stream[in] CUDA stream for execution

Returns:

True if inference succeeded, false otherwise

virtual bool validateAndFillConfig(
std::string const &engineDir
) override#

Validate and load the configuration from a JSON file.

Parameters:

engineDir[in] Path to engine directory

Returns:

True if configuration is valid and loaded successfully, false otherwise

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate buffers for inference.

Parameters:

stream[in] CUDA stream for execution

Throws:

std::runtime_error – if a CUDA error occurs

Returns:

True if allocation succeeded, false otherwise

struct NemotronOmniViTConfig#

Configuration for Nemotron-Omni RADIO vision encoder.

Public Members

int64_t maxNumBlocks = {0}#

Maximum number of image tiles (across the batch)

int64_t minNumBlocks = {0}#

Minimum number of image tiles (across the batch)

int64_t numChannels = {3}#

Number of image channels (RGB)

int64_t outHiddenSize = {0}#

Output hidden dimension (LLM hidden size)

int64_t blockImageSizeH = {0}#

Image height per tile (preferred_resolution)

int64_t blockImageSizeW = {0}#

Image width per tile (preferred_resolution)

int64_t tokensPerBlock = {256}#

Tokens per tile after pixel shuffle.

int32_t vocabSize = {0}#

Vocabulary size (image token ID offset)

int32_t imgContextTokenId = {18}#

Image context token ID.

int32_t imgStartTokenId = {0}#

Image start token ID.

int32_t imgEndTokenId = {0}#

Image end token ID.

std::array<float, 3> imageMean = {{0.5F, 0.5F, 0.5F}}#

Image normalization mean.

std::array<float, 3> imageStd = {{0.5F, 0.5F, 0.5F}}#

Image normalization std.

int64_t minImageTokensPerImage = {0}#

Minimum image tokens generated by each image. Used for resizing.

int64_t maxImageTokensPerImage = {0}#

Maximum image tokens generated by each image. Used for resizing.
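
The interaction of these fields can be sketched as follows. This is a hedged reading of the struct, not library code: the documentation states that each tile yields `tokensPerBlock` tokens after pixel shuffle and that the token-per-image bounds drive resizing; the helper names and the assumption that the thumbnail tile contributes the same `tokensPerBlock` tokens are hypothetical.

```cpp
#include <cassert>
#include <cstdint>

// Largest grid tile count whose token total fits the per-image budget
// (hypothetical helper; the real resizing logic lives inside the runner).
int64_t maxTilesForBudget(int64_t maxImageTokensPerImage, int64_t tokensPerBlock)
{
    return maxImageTokensPerImage / tokensPerBlock;
}

// Tokens produced by one image: tokensPerBlock per tile, with a thumbnail
// tile appended when the grid has more than one tile (assumed to contribute
// the same number of tokens).
int64_t imageTokens(int64_t gridTiles, int64_t tokensPerBlock)
{
    int64_t const tiles = gridTiles > 1 ? gridTiles + 1 : gridTiles;
    return tiles * tokensPerBlock;
}
```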