Qwen ViT Runner#

class QwenViTRunner : public trt_edgellm::rt::MultimodalRunner #

Runner for Qwen-VL vision encoder.

This class handles the preprocessing and inference of the Qwen-VL vision encoder.

Public Functions

QwenViTRunner( std::string const &engineDir, int32_t llmMaxBatchSize, int32_t llmMaxSequenceLength, cudaStream_t stream )#

Constructor for QwenViTRunner.

Parameters:

engineDir – [in] Directory containing the TensorRT engine files
llmMaxBatchSize – [in] Maximum batch size from LLM engine
llmMaxSequenceLength – [in] Maximum sequence length from LLM engine
stream – [in] CUDA stream for execution

~QwenViTRunner() = default#

virtual bool preprocess( rt::LLMGenerationRequest const &request, std::vector<std::vector<int32_t>> &batchedInputIds, tokenizer::Tokenizer *tokenizer, rt::Tensor &ropeRotaryCosSinDevice, cudaStream_t stream ) override#

Preprocess multimodal input including images and text.

Parameters:

request – [in] LLM generation request containing images and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache
stream – [in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool preprocessSystemPrompt( std::string const &systemPrompt, tokenizer::Tokenizer *tokenizer, rt::Tensor &ropeRotaryCosSinDevice, cudaStream_t stream ) override#

Encode the system prompt and generate its ND-RoPE parameters for KV-cache saving.

Parameters:

systemPrompt – [in] System prompt string
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache
stream – [in] CUDA stream for execution

Returns:

True if preprocessing succeeded, false otherwise

virtual bool infer(cudaStream_t stream) override#

Run inference on the vision encoder.

Parameters: stream – [in] CUDA stream for execution
Returns: True if inference succeeded, false otherwise

virtual bool validateAndFillConfig( std::string const &engineDir ) override#

Validate and load configuration from JSON file.

Parameters: engineDir – [in] Path to engine directory
Returns: True if configuration is valid and loaded successfully, false otherwise

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate buffers for inference.

Returns: True if allocation succeeded, false otherwise

virtual rt::OptionalInputTensors getExtraVisualFeatures() override#

Get extra visual features.

Returns: Vector of optional input tensors (e.g., deepstack features for Qwen3-VL)

struct QwenViTConfig#

Configuration for Qwen-VL vision encoder.

Public Members

int64_t maxHW = {0}#: Maximum height * width.

int64_t minHW = {0}#: Minimum height * width.

int64_t inputDim = {0}#: Input dimension.

int64_t vitPosEmbDim = {0}#: Vision transformer position embedding dimension.

int64_t outHiddenSize = {0}#: Output hidden dimension size.

int32_t vocabSize = {0}#: Vocabulary size.

int32_t visionStartTokenId = {0}#: Token ID for vision start.

int32_t imageTokenId = {0}#: Token ID for image placeholder.

int32_t videoTokenId = {0}#: Token ID for video placeholder.

float mropeTheta = {0}#: Multi-dimensional RoPE theta parameter.

int64_t patchSize = {0}#: Patch size in pixels.

int64_t temporalPatchSize = {0}#: Temporal patch size for video.

int64_t mergeSize = {0}#: Merge size for patches.

int64_t windowSize = {0}#: Window attention size used by Qwen2.5-VL.

int64_t numGridPerSide = {0}#: Number of grid points per side for the fast position embedding used by Qwen3-VL.

int64_t numDeepstackFeatures = {0}#: Number of deepstack features for Qwen3-VL.

int64_t minImageTokensPerImage = {0}#: Minimum number of image tokens generated per image. Used for resizing.

int64_t maxImageTokensPerImage = {0}#: Maximum number of image tokens generated per image. Used for resizing.

int64_t maxNumImages = {0}#: Maximum number of images per request. Used for pre-allocation.

std::vector<float> imageMean = {}#: Image normalization mean values (RGB)

std::vector<float> imageStd = {}#: Image normalization standard deviation values (RGB)