Intern ViT Runner#

class InternViTRunner : public trt_edgellm::rt::MultimodalRunner #

Runner for InternViT vision encoder.

This class handles preprocessing and inference for the InternViT vision encoder, which is part of the InternVL multimodal model.

Public Functions

InternViTRunner(std::string const &engineDir, cudaStream_t stream)#

Constructor for InternViTRunner.

Parameters:

engineDir – [in] Directory containing the TensorRT engine files
stream – [in] CUDA stream for execution

~InternViTRunner() = default#

virtual bool preprocess( rt::LLMGenerationRequest const &request, std::vector<std::vector<int32_t>> &batchedInputIds, tokenizer::Tokenizer *tokenizer, rt::Tensor &ropeRotaryCosSinDevice, cudaStream_t stream ) override#

Preprocess multimodal input including images and text.

Parameters:

request – [in] LLM generation request containing images and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache
stream – [in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool infer(cudaStream_t stream) override#

Run inference on the vision encoder.

Parameters:: stream – [in] CUDA stream for execution
Returns:: True if inference succeeded, false otherwise

virtual bool validateAndFillConfig( std::string const &engineDir ) override#

Validate and load configuration from JSON file.

Parameters:: engineDir – [in] Path to engine directory
Returns:: True if configuration is valid and loaded successfully, false otherwise

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate buffers for inference.

Parameters:: stream – [in] CUDA stream
Returns:: True if allocation succeeded, false otherwise

struct InternViTConfig#

Configuration for InternViT vision encoder.

Public Members

int64_t maxNumBlocks = {0}#: Maximum number of image blocks.

int64_t minNumBlocks = {0}#: Minimum number of image blocks.

int64_t numChannels = {0}#: Number of image channels (typically 3 for RGB)

int64_t outHiddenSize = {0}#: Output hidden dimension size.

int64_t patchSizeH = {0}#: Patch height in pixels.

int64_t patchSizeW = {0}#: Patch width in pixels.

int64_t blockImageSizeH = {0}#: Block image height.

int64_t blockImageSizeW = {0}#: Block image width.

int32_t vocabSize = {0}#: Vocabulary size.

int32_t imageTokenId = {0}#: Token ID for image placeholder.

int32_t imgStartTokenId = {151665}#

Token ID for the `<img>` (image start) tag.

int32_t imgEndTokenId = {151666}#: Token ID for the `</img>` (image end) tag.

std::array<float, 3> imageMean = {{0.485F, 0.456F, 0.406F}}#: Image normalization mean values (RGB)

std::array<float, 3> imageStd = {{0.229F, 0.224F, 0.225F}}#: Image normalization standard deviation values (RGB)

int64_t minImageTokensPerImage = {0}#: Minimum number of image tokens generated per image. Used for resizing.

int64_t maxImageTokensPerImage = {0}#: Maximum number of image tokens generated per image. Used for resizing.