Gemma4 ViT Runner#

class Gemma4ViTRunner : public trt_edgellm::rt::MultimodalRunner#

Runner for Gemma4 vision encoder.

Public Functions

Gemma4ViTRunner(std::string const &engineDir, cudaStream_t stream)#

Constructor for Gemma4ViTRunner.

Parameters:
  • engineDir – Path to engine directory

  • stream – CUDA stream

~Gemma4ViTRunner() noexcept = default#

Defaulted destructor.

virtual bool preprocess(
rt::LLMGenerationRequest const &request,
std::vector<std::vector<int32_t>> &batchedInputIds,
tokenizer::Tokenizer const *tokenizer,
rt::OptionalOutputTensor mropeCosSinOut,
cudaStream_t stream,
bool imageOnly = false
) noexcept override#

Preprocess request with images and text.

Parameters:
  • request – Generation request with prompts and images

  • batchedInputIds – Output batched input token IDs

  • tokenizer – Tokenizer instance

  • mropeCosSinOut – Output MRope cos/sin cache. Required (has_value() == true) only for MRope-based multimodal runners (QwenViT, Qwen3OmniAudio), which write per-batch 3D position encodings into it. Pass std::nullopt when the base engine uses standard RoPE — runners with standard RoPE (InternViT, Phi4MMViT) do not read this parameter.

  • stream – CUDA stream

  • imageOnly – When true, only run image preprocessing (skip text tokenization and RoPE generation). Used for benchmarking where only the visual engine inputs need to be set up.

Returns:

True on success, false on failure

virtual bool infer(cudaStream_t stream) noexcept override#

Run multimodal inference.

Parameters:

stream – CUDA stream

Returns:

True on success, false on failure

virtual bool validateAndFillConfig(
std::string const &engineDir
) override#

Validate and fill configuration from file.

Parameters:

engineDir – Path to engine directory

Returns:

True on success, false on failure

virtual bool allocateBuffer(cudaStream_t stream) override#

Allocate device buffers.

Parameters:

stream – CUDA stream

Returns:

True on success, false on failure

struct Gemma4ViTConfig#

Configuration for Gemma4 vision encoder preprocessing and runtime bindings.

Public Members

int64_t maxPatches = {0}#

Maximum number of visual patches in a request.

int64_t minPatches = {0}#

Minimum number of visual patches in a request.

int64_t inputDim = {0}#

Patch input dimension.

int64_t outHiddenSize = {0}#

Output hidden dimension size.

int64_t patchSize = {16}#

Patch size in pixels.

int64_t poolingKernelSize = {3}#

Vision pooler spatial kernel size.

int64_t maxImageTokens = {0}#

Maximum soft tokens in a request.

int64_t minImageTokensPerImage = {0}#

Minimum soft tokens per image.

int64_t maxImageTokensPerImage = {0}#

Maximum soft tokens per image.

int64_t maxNumImages = {0}#

Maximum number of images per request.

int64_t maxPatchesPerImage = {0}#

Maximum patches per single image.

int64_t rotaryPosEmbDim = {0}#

Gemma4 visual RoPE angle dimension.

float ropeTheta = {100.0F}#

Gemma4 visual RoPE base frequency.

int32_t imageTokenId = {0}#

Token ID for image placeholder.

std::vector<float> imageMean = {}#

Image normalization mean values (RGB).

std::vector<float> imageStd = {}#

Image normalization standard deviation values (RGB).