Nemotron Omni ViT Runner#
-
class NemotronOmniViTRunner : public trt_edgellm::rt::MultimodalRunner#
Runner for Nemotron-Omni RADIO vision encoder.
Handles preprocessing and inference for the RADIO ViT encoder used in Nemotron-Omni multimodal models. Images are tiled to an aspect-ratio-matched grid within the per-image tile budget, with a thumbnail appended when the grid produces more than one tile.
Public Functions
- NemotronOmniViTRunner(
- std::string const &engineDir,
- cudaStream_t stream
- )#
Constructor for NemotronOmniViTRunner.
- Parameters:
engineDir – [in] Directory containing the TensorRT engine files
stream – [in] CUDA stream for execution
- Throws:
std::runtime_error – if engineDir does not contain valid engine files
std::runtime_error – if buffer allocation fails
std::runtime_error – if a CUDA error occurs
-
~NemotronOmniViTRunner() noexcept = default#
- virtual bool preprocess(
- rt::LLMGenerationRequest const &request,
- std::vector<std::vector<int32_t>> &batchedInputIds,
- tokenizer::Tokenizer const *tokenizer,
- rt::Tensor &ropeRotaryCosSinDevice,
- cudaStream_t stream,
- bool imageOnly = false
- )#
Preprocess multimodal input including images and text.
- Parameters:
request – [in] LLM generation request containing images and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache (unused by this model)
stream – [in] CUDA stream for execution
imageOnly – [in] Whether to restrict preprocessing to image inputs only (defaults to false)
- Returns:
True if preprocessing succeeded, false otherwise
-
virtual bool infer(cudaStream_t stream) noexcept override#
Run inference on the vision encoder.
- Parameters:
stream – [in] CUDA stream for execution
- Returns:
True if inference succeeded, false otherwise
- virtual bool validateAndFillConfig(
- std::string const &engineDir
- )#
Validate and load configuration from JSON file.
- Parameters:
engineDir – [in] Path to engine directory
- Returns:
True if configuration is valid and loaded successfully, false otherwise
-
virtual bool allocateBuffer(cudaStream_t stream) override#
Allocate buffers for inference.
- Parameters:
stream – [in] CUDA stream for execution
- Throws:
std::runtime_error – if a CUDA error occurs
- Returns:
True if allocation succeeded, false otherwise
-
struct NemotronOmniViTConfig#
Configuration for Nemotron-Omni RADIO vision encoder.
Public Members
-
int64_t maxNumBlocks = {0}#
Maximum number of image tiles (across the batch)
-
int64_t minNumBlocks = {0}#
Minimum number of image tiles (across the batch)
-
int64_t numChannels = {3}#
Number of image channels (RGB)
-
int64_t outHiddenSize = {0}#
Output hidden dimension (LLM hidden size)
-
int64_t blockImageSizeH = {0}#
Image height per tile (preferred_resolution)
-
int64_t blockImageSizeW = {0}#
Image width per tile (preferred_resolution)
-
int64_t tokensPerBlock = {256}#
Tokens per tile after pixel shuffle.
-
int32_t vocabSize = {0}#
Vocabulary size (image token ID offset)
-
int32_t imgContextTokenId = {18}#
Image context token ID.
-
int32_t imgStartTokenId = {0}#
Image start token ID.
-
int32_t imgEndTokenId = {0}#
Image end token ID.
-
std::array<float, 3> imageMean = {{0.5F, 0.5F, 0.5F}}#
Image normalization mean.
-
std::array<float, 3> imageStd = {{0.5F, 0.5F, 0.5F}}#
Image normalization std.
-
int64_t minImageTokensPerImage = {0}#
Minimum image tokens generated by each image. Used for resizing.
-
int64_t maxImageTokensPerImage = {0}#
Maximum image tokens generated by each image. Used for resizing.
-
int64_t maxNumBlocks = {0}#