Gemma4 ViT Runner#
-
class Gemma4ViTRunner : public trt_edgellm::rt::MultimodalRunner#
Runner for Gemma4 vision encoder.
Public Functions
-
Gemma4ViTRunner(std::string const &engineDir, cudaStream_t stream)#
Constructor for Gemma4ViTRunner.
-
~Gemma4ViTRunner() noexcept = default#
- virtual bool preprocess(
- rt::LLMGenerationRequest const &request,
- std::vector<std::vector<int32_t>> &batchedInputIds,
- tokenizer::Tokenizer const *tokenizer,
- rt::OptionalOutputTensor mropeCosSinOut,
- cudaStream_t stream,
- bool imageOnly = false
Preprocess request with images and text.
- Parameters:
request – Generation request with prompts and images
batchedInputIds – Output batched input token IDs
tokenizer – Tokenizer instance
mropeCosSinOut – Output MRope cos/sin cache. Required (has_value() == true) only for MRope-based multimodal runners (QwenViT, Qwen3OmniAudio), which write per-batch 3D position encodings into it. Pass std::nullopt when the base engine uses standard RoPE — runners with standard RoPE (InternViT, Phi4MMViT) do not read this parameter.
stream – CUDA stream
imageOnly – When true, only run image preprocessing (skip text tokenization and RoPE generation). Used for benchmarking where only the visual engine inputs need to be set up.
- Returns:
True on success, false on failure
-
virtual bool infer(cudaStream_t stream) noexcept override#
Run multimodal inference.
- Parameters:
stream – CUDA stream
- Returns:
True on success, false on failure
- virtual bool validateAndFillConfig(
- std::string const &engineDir
Validate and fill configuration from file.
- Parameters:
engineDir – Path to engine directory
- Returns:
True on success, false on failure
-
virtual bool allocateBuffer(cudaStream_t stream) override#
Allocate device buffers.
- Returns:
True on success, false on failure
-
Gemma4ViTRunner(std::string const &engineDir, cudaStream_t stream)#
-
struct Gemma4ViTConfig#
Configuration for Gemma4 vision encoder preprocessing and runtime bindings.
Public Members
-
int64_t maxPatches = {0}#
Maximum number of visual patches in a request.
-
int64_t minPatches = {0}#
Minimum number of visual patches in a request.
-
int64_t inputDim = {0}#
Patch input dimension.
-
int64_t outHiddenSize = {0}#
Output hidden dimension size.
-
int64_t patchSize = {16}#
Patch size in pixels.
-
int64_t poolingKernelSize = {3}#
Vision pooler spatial kernel size.
-
int64_t maxImageTokens = {0}#
Maximum soft tokens in a request.
-
int64_t minImageTokensPerImage = {0}#
Minimum soft tokens per image.
-
int64_t maxImageTokensPerImage = {0}#
Maximum soft tokens per image.
-
int64_t maxNumImages = {0}#
Maximum number of images per request.
-
int64_t maxPatchesPerImage = {0}#
Maximum patches per single image.
-
int64_t rotaryPosEmbDim = {0}#
Gemma4 visual RoPE angle dimension.
-
float ropeTheta = {100.0F}#
Gemma4 visual RoPE base frequency.
-
int32_t imageTokenId = {0}#
Token ID for image placeholder.
-
std::vector<float> imageMean = {}#
Image normalization mean values (RGB).
-
std::vector<float> imageStd = {}#
Image normalization standard deviation values (RGB).
-
int64_t maxPatches = {0}#