Qwen ViT Runner#

class QwenViTRunner : public trt_edgellm::rt::MultimodalRunner#

Runner for the Qwen-VL vision encoder.

This class handles the preprocessing and inference of the Qwen-VL vision encoder.

Public Functions

QwenViTRunner(
std::string const &engineDir,
int32_t llmMaxBatchSize,
int32_t llmMaxSequenceLength,
cudaStream_t stream
)#

Constructor for QwenViTRunner.

Parameters:
  • engineDir[in] Directory containing the TensorRT engine files

  • llmMaxBatchSize[in] Maximum batch size from LLM engine

  • llmMaxSequenceLength[in] Maximum sequence length from LLM engine

  • stream[in] CUDA stream for execution

~QwenViTRunner() = default#
virtual bool preprocess(
rt::LLMGenerationRequest const &request,
std::vector<std::vector<int32_t>> &batchedInputIds,
tokenizer::Tokenizer *tokenizer,
rt::Tensor &ropeRotaryCosSinDevice,
cudaStream_t stream
) override#

Preprocess multimodal input including images and text.

Parameters:
  • request[in] LLM generation request containing images and text

  • batchedInputIds[inout] Batched input token IDs after preprocessing

  • tokenizer[in] Tokenizer for text processing

  • ropeRotaryCosSinDevice[inout] RoPE rotary position encoding cache

  • stream[in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool preprocessSystemPrompt(
std::string const &systemPrompt,
tokenizer::Tokenizer *tokenizer,
rt::Tensor &ropeRotaryCosSinDevice,
cudaStream_t stream
) override#

Encode the system prompt and generate its ND-RoPE parameters so that the system prompt's KV cache can be saved.

Parameters:
  • systemPrompt[in] System prompt string

  • tokenizer[in] Tokenizer for text processing

  • ropeRotaryCosSinDevice[inout] RoPE rotary position encoding cache

  • stream[in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool infer(cudaStream_t stream) override#

Run inference on the vision encoder.

Parameters:

stream[in] CUDA stream for execution

Returns:

True if inference succeeded, false otherwise

virtual bool validateAndFillConfig(
std::string const &engineDir
) override#

Validate and load the configuration from a JSON file.

Parameters:

engineDir[in] Path to engine directory

Returns:

True if configuration is valid and loaded successfully, false otherwise

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate buffers for inference.

Parameters:

stream[in] CUDA stream for execution

Returns:

True if allocation succeeded, false otherwise

virtual rt::OptionalInputTensors getExtraVisualFeatures() override#

Get extra visual features.

Returns:

Optional input tensors vector (e.g. deepstack features for Qwen3-VL)

struct QwenViTConfig#

Configuration for Qwen-VL vision encoder.

Public Members

int64_t maxHW = {0}#

Maximum height * width.

int64_t minHW = {0}#

Minimum height * width.

int64_t inputDim = {0}#

Input dimension.

int64_t vitPosEmbDim = {0}#

Vision transformer position embedding dimension.

int64_t outHiddenSize = {0}#

Output hidden dimension size.

int32_t vocabSize = {0}#

Vocabulary size.

int32_t visionStartTokenId = {0}#

Token ID for vision start.

int32_t imageTokenId = {0}#

Token ID for image placeholder.

int32_t videoTokenId = {0}#

Token ID for video placeholder.

float mropeTheta = {0}#

Multi-dimensional RoPE theta parameter.

int64_t patchSize = {0}#

Patch size in pixels.

int64_t temporalPatchSize = {0}#

Temporal patch size for video.

int64_t mergeSize = {0}#

Merge size for patches.

int64_t windowSize = {0}#

Window attention size used by Qwen2.5-VL.

int64_t numGridPerSide = {0}#

Number of grid points per side for the fast position embedding used by Qwen3-VL.

int64_t numDeepstackFeatures = {0}#

Number of deepstack features for Qwen3-VL.

int64_t minImageTokensPerImage = {0}#

Minimum number of image tokens generated per image. Used for resizing.

int64_t maxImageTokensPerImage = {0}#

Maximum number of image tokens generated per image. Used for resizing.

int64_t maxNumImages = {0}#

Maximum number of images per request. Used for pre-allocation.

std::vector<float> imageMean = {}#

Image normalization mean values (RGB)

std::vector<float> imageStd = {}#

Image normalization standard deviation values (RGB)