Intern ViT Runner#

class InternViTRunner : public trt_edgellm::rt::MultimodalRunner#

Runner for the InternViT vision encoder.

This class handles preprocessing and inference for the InternViT vision encoder, which is part of the InternVL multimodal model.

Public Functions

InternViTRunner(std::string const &engineDir, cudaStream_t stream)#

Constructor for InternViTRunner.

Parameters:
  • engineDir[in] Directory containing the TensorRT engine files

  • stream[in] CUDA stream for execution

~InternViTRunner() = default#
virtual bool preprocess(
rt::LLMGenerationRequest const &request,
std::vector<std::vector<int32_t>> &batchedInputIds,
tokenizer::Tokenizer *tokenizer,
rt::Tensor &ropeRotaryCosSinDevice,
cudaStream_t stream
) override#

Preprocess multimodal input including images and text.

Parameters:
  • request[in] LLM generation request containing images and text

  • batchedInputIds[inout] Batched input token IDs after preprocessing

  • tokenizer[in] Tokenizer for text processing

  • ropeRotaryCosSinDevice[inout] RoPE rotary position encoding cache

  • stream[in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool infer(cudaStream_t stream) override#

Run inference on the vision encoder.

Parameters:

stream[in] CUDA stream for execution

Returns:

True if inference succeeded, false otherwise

virtual bool validateAndFillConfig(
std::string const &engineDir
) override#

Validate and load the configuration from a JSON file.

Parameters:

engineDir[in] Path to engine directory

Returns:

True if configuration is valid and loaded successfully, false otherwise

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate buffers for inference.

Parameters:

stream[in] CUDA stream for execution

Returns:

True if allocation succeeded, false otherwise

struct InternViTConfig#

Configuration for InternViT vision encoder.

Public Members

int64_t maxNumBlocks = {0}#

Maximum number of image blocks.

int64_t minNumBlocks = {0}#

Minimum number of image blocks.

int64_t numChannels = {0}#

Number of image channels (typically 3 for RGB)

int64_t outHiddenSize = {0}#

Output hidden dimension size.

int64_t patchSizeH = {0}#

Patch height in pixels.

int64_t patchSizeW = {0}#

Patch width in pixels.

int64_t blockImageSizeH = {0}#

Block image height.

int64_t blockImageSizeW = {0}#

Block image width.

int32_t vocabSize = {0}#

Vocabulary size.

int32_t imageTokenId = {0}#

Token ID for image placeholder.

int32_t imgStartTokenId = {151665}#

Token ID for the <img> tag.

int32_t imgEndTokenId = {151666}#

Token ID for the </img> tag.

std::array<float, 3> imageMean = {{0.485F, 0.456F, 0.406F}}#

Image normalization mean values (RGB)

std::array<float, 3> imageStd = {{0.229F, 0.224F, 0.225F}}#

Image normalization standard deviation values (RGB)

int64_t minImageTokensPerImage = {0}#

Minimum number of image tokens generated for each image. Used for resizing.

int64_t maxImageTokensPerImage = {0}#

Maximum number of image tokens generated for each image. Used for resizing.