Qwen ViT Runner
-
class QwenViTRunner : public trt_edgellm::rt::MultimodalRunner#
Runner for Qwen-VL vision encoder.
This class handles the preprocessing and inference of the Qwen-VL vision encoder.
Public Functions
- QwenViTRunner(
- std::string const &engineDir,
- int32_t llmMaxBatchSize,
- int32_t llmMaxSequenceLength,
- cudaStream_t stream
- )
Constructor for QwenViTRunner.
- Parameters:
engineDir – [in] Directory containing the TensorRT engine files
llmMaxBatchSize – [in] Maximum batch size from LLM engine
llmMaxSequenceLength – [in] Maximum sequence length from LLM engine
stream – [in] CUDA stream for execution
-
~QwenViTRunner() = default#
- virtual bool preprocess(
- rt::LLMGenerationRequest const &request,
- std::vector<std::vector<int32_t>> &batchedInputIds,
- tokenizer::Tokenizer *tokenizer,
- rt::Tensor &ropeRotaryCosSinDevice,
- cudaStream_t stream
- )
Preprocess multimodal input including images and text.
- Parameters:
request – [in] LLM generation request containing images and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache
stream – [in] CUDA stream for execution
- Returns:
True if preprocessing succeeded, false otherwise
- virtual bool preprocessSystemPrompt(
- std::string const &systemPrompt,
- tokenizer::Tokenizer *tokenizer,
- rt::Tensor &ropeRotaryCosSinDevice,
- cudaStream_t stream
- )
Encode the system prompt and generate the corresponding ND-RoPE parameters, for use when saving the system-prompt KV cache.
- Parameters:
systemPrompt – [in] System prompt string
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache
stream – [in] CUDA stream for execution
- Returns:
True if preprocessing succeeded, false otherwise
-
virtual bool infer(cudaStream_t stream) override#
Run inference on the vision encoder.
- Parameters:
stream – [in] CUDA stream for execution
- Returns:
True if inference succeeded, false otherwise
- virtual bool validateAndFillConfig(
- std::string const &engineDir
- )
Validate and load configuration from JSON file.
- Parameters:
engineDir – [in] Path to engine directory
- Returns:
True if configuration is valid and loaded successfully, false otherwise
-
virtual bool allocateBuffer(cudaStream_t stream) override#
Allocate buffers for inference.
- Parameters:
stream – [in] CUDA stream used for buffer allocation
- Returns:
True if allocation succeeded, false otherwise
-
struct QwenViTConfig#
Configuration for Qwen-VL vision encoder.
Public Members
-
int64_t maxHW = {0}#
Maximum height * width.
-
int64_t minHW = {0}#
Minimum height * width.
-
int64_t inputDim = {0}#
Input dimension.
-
int64_t vitPosEmbDim = {0}#
Vision transformer position embedding dimension.
-
int64_t outHiddenSize = {0}#
Output hidden dimension size.
-
int32_t vocabSize = {0}#
Vocabulary size.
-
int32_t visionStartTokenId = {0}#
Token ID for vision start.
-
int32_t imageTokenId = {0}#
Token ID for image placeholder.
-
int32_t videoTokenId = {0}#
Token ID for video placeholder.
-
float mropeTheta = {0}#
Multi-dimensional RoPE theta parameter.
-
int64_t patchSize = {0}#
Patch size in pixels.
-
int64_t temporalPatchSize = {0}#
Temporal patch size for video.
-
int64_t mergeSize = {0}#
Merge size for patches.
-
int64_t windowSize = {0}#
Window attention size used by Qwen2.5-VL.
-
int64_t numGridPerSide = {0}#
Number of grid points per side for the fast position embedding used by Qwen3-VL.
-
int64_t numDeepstackFeatures = {0}#
Number of deepstack features for Qwen3-VL.
-
int64_t minImageTokensPerImage = {0}#
Minimum number of image tokens generated per image. Used for resizing.
-
int64_t maxImageTokensPerImage = {0}#
Maximum number of image tokens generated per image. Used for resizing.
-
int64_t maxNumImages = {0}#
Maximum number of images per request. Used for pre-allocation.
-
std::vector<float> imageMean = {}#
Image normalization mean values (RGB).
-
std::vector<float> imageStd = {}#
Image normalization standard deviation values (RGB).
-
int64_t maxHW = {0}#