Phi4MM ViT Runner#
-
class Phi4MMViTRunner : public trt_edgellm::rt::MultimodalRunner#
Runner for Phi-4MM vision encoder.
This class handles:
- Image preprocessing (HWC uint8 → normalized FP16 HWC on GPU)
- Tiling to per-block CHW layout for the TRT visual engine
- Running the visual engine to produce raw 256-per-block visual tokens
- Batched HD postprocess to assemble sub/global grids with newline and GN tokens
- Text preprocessing to expand image placeholders into a contiguous id range
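The member functions below are typically driven in the order construct → allocateBuffer → preprocess → infer. The fragment that follows is a minimal usage sketch, not code taken from the source: the include paths, the namespace placement of Phi4MMViTRunner, and the request/tokenizer/Tensor setup are assumptions; only the member functions documented on this page come from it.

// Minimal usage sketch (assumed: include paths, namespaces, and error handling).
#include <cuda_runtime.h>
#include <cstdint>
#include <string>
#include <vector>

void runPhi4MMVision(trt_edgellm::rt::LLMGenerationRequest const &request,
                     trt_edgellm::tokenizer::Tokenizer *tokenizer,
                     std::string const &engineDir)
{
    cudaStream_t stream = nullptr;
    cudaStreamCreate(&stream);

    // 1. Construct the runner from the directory containing the TRT visual engine.
    trt_edgellm::rt::Phi4MMViTRunner runner(engineDir, stream);

    // 2. Allocate device buffers for inference and the HD postprocess.
    bool ok = runner.allocateBuffer(stream);

    // 3. Preprocess: normalize and tile the images, and expand the image placeholder
    //    token in the text into a contiguous range of image token ids.
    std::vector<std::vector<int32_t>> batchedInputIds;
    trt_edgellm::rt::Tensor ropeRotaryCosSin;  // unused for Phi-4MM but required by the interface
    ok = ok && runner.preprocess(request, batchedInputIds, tokenizer, ropeRotaryCosSin, stream);

    // 4. Run the visual engine and assemble the sub/global token grids.
    ok = ok && runner.infer(stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}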
Public Functions
-
Phi4MMViTRunner(std::string const &engineDir, cudaStream_t stream)#
Constructor for Phi4MMViTRunner.
- Parameters:
engineDir – [in] Directory containing the TensorRT engine files
stream – [in] CUDA stream for execution
-
~Phi4MMViTRunner() = default#
-
virtual bool preprocess(rt::LLMGenerationRequest const &request, std::vector<std::vector<int32_t>> &batchedInputIds, tokenizer::Tokenizer *tokenizer, rt::Tensor &ropeRotaryCosSinDevice, cudaStream_t stream) override#
Preprocess multimodal input including images and text.
- Parameters:
request – [in] LLM generation request containing images and text
batchedInputIds – [inout] Batched input token IDs after preprocessing
tokenizer – [in] Tokenizer for text processing
ropeRotaryCosSinDevice – [inout] RoPE rotary position encoding cache (unused for Phi-4MM)
stream – [in] CUDA stream for execution
- Returns:
True if preprocessing succeeded, false otherwise
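Because image placeholders are expanded into a contiguous id range starting at vocabSize (see Phi4MMViTConfig below), a caller can locate the visual-token span in the returned batchedInputIds after preprocess(). The helper below is purely illustrative and not part of the API.

// Illustrative helper (not part of the API): find the contiguous span of image
// token ids in one preprocessed sequence. Ids >= vocabSize are visual token ids.
#include <cstdint>
#include <cstdio>
#include <vector>

void printImageTokenSpan(std::vector<int32_t> const &inputIds, int32_t vocabSize)
{
    long long first = -1;
    long long last = -1;
    for (size_t i = 0; i < inputIds.size(); ++i)
    {
        if (inputIds[i] >= vocabSize)
        {
            if (first < 0)
            {
                first = static_cast<long long>(i);
            }
            last = static_cast<long long>(i);
        }
    }
    if (first >= 0)
    {
        std::printf("image tokens occupy positions [%lld, %lld]\n", first, last);
    }
}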
-
virtual bool infer(cudaStream_t stream) override#
Run inference on the vision encoder and perform HD postprocess.
- Parameters:
stream – [in] CUDA stream for execution
- Returns:
True if inference succeeded, false otherwise
-
virtual bool validateAndFillConfig(std::string const &configPath) override#
Validate and load the configuration from a JSON file.
- Parameters:
configPath – [in] Path to configuration file
- Returns:
True if configuration is valid and loaded successfully, false otherwise
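If the configuration needs to be loaded explicitly rather than as part of construction, the call might look like the fragment below, which continues the usage sketch above (runner and engineDir reused from it); the JSON file name is purely illustrative.

// Hypothetical call; the config file name and its location are assumptions.
std::string const configPath = engineDir + "/config.json";  // illustrative path
if (!runner.validateAndFillConfig(configPath))
{
    // Configuration file missing or inconsistent with the engine; abort setup.
}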
-
virtual bool allocateBuffer(cudaStream_t stream) override#
Allocate buffers for inference and postprocess.
- Parameters:
stream – [in] CUDA stream for execution
- Returns:
True if allocation succeeded, false otherwise
-
struct Phi4MMViTConfig#
Configuration for Phi4MMViT vision encoder.
This configuration aggregates vision-tower-derived dimensions (num blocks, channels, output hidden size), tokenizer-related settings for image token expansion, and image normalization parameters used by the CUDA preprocess kernels.
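As a rough illustration of how these fields relate, the sketch below fills in a config by hand. Only the values matching the documented defaults (image mean/std, image token id, downsample ratio) come from this page; every other number, including the 448-pixel block size (chosen so that 448 / 28 = 16 tokens per side, i.e. 256 tokens per block), is an assumption. In practice the struct is populated by validateAndFillConfig().

// Illustrative only: hand-filled Phi4MMViTConfig. Real values come from the JSON
// loaded by validateAndFillConfig(); numbers marked "assumed" are not from this page.
Phi4MMViTConfig makeExampleConfig()
{
    Phi4MMViTConfig cfg{};
    cfg.maxNumBlocks = 16;                 // assumed engine limit
    cfg.minNumBlocks = 1;                  // assumed engine limit
    cfg.numChannels = 3;                   // RGB
    cfg.outHiddenSize = 3072;              // assumed projection dim
    cfg.imageTokenId = 200010;             // documented default placeholder id
    cfg.vocabSize = 200064;                // assumed; image ids start at vocabSize
    cfg.imageMean = {0.5F, 0.5F, 0.5F};    // documented default
    cfg.imageStd = {0.5F, 0.5F, 0.5F};     // documented default
    cfg.blockImageSizeH = 448;             // assumed crop size
    cfg.blockImageSizeW = 448;             // assumed crop size
    cfg.blockDownsampleRatio = 28;         // documented default
    cfg.tokensPerSide = cfg.blockImageSizeH / cfg.blockDownsampleRatio;  // 448 / 28 = 16
    return cfg;
}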
Public Members
-
int32_t maxNumBlocks = {0}#
Maximum number of image blocks supported by engine.
-
int32_t minNumBlocks = {0}#
Minimum number of image blocks supported by engine.
-
int32_t numChannels = {3}#
Image channels (RGB=3)
-
int32_t outHiddenSize = {0}#
Visual output hidden size (projection dim)
-
int32_t imageTokenId = {200010}#
Placeholder token id in text to be expanded into image tokens.
-
int32_t vocabSize = {0}#
Base vocabulary size; image ids start from vocabSize.
-
std::array<float, 3> imageMean = {{0.5F, 0.5F, 0.5F}}#
Mean per channel used in normalize: (val/255 - mean)/std. A short sketch of this formula follows the member list below.
-
std::array<float, 3> imageStd = {{0.5F, 0.5F, 0.5F}}#
Std per channel used in normalize.
-
int32_t minImageTokensPerImage = {0}#
Min visual tokens per image (for resize/grid selection)
-
int32_t maxImageTokensPerImage = {0}#
Max visual tokens per image (for resize/grid selection)
-
int32_t blockImageSizeH = {0}#
Block image height (crop size)
-
int32_t blockImageSizeW = {0}#
Block image width (crop size)
-
int32_t blockDownsampleRatio = {28}#
Block downsample ratio.
-
int32_t tokensPerSide = {0}#
Number of visual tokens per side of each image block.
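The imageMean and imageStd members above define the per-channel normalization (val/255 - mean)/std applied by the CUDA preprocess kernels. The fragment below is a minimal scalar sketch of that formula for reference; it is not the actual GPU kernel.

// Scalar sketch of the documented normalization; the real implementation runs in
// the CUDA preprocess kernels and writes FP16. With the default mean = std = 0.5,
// pixel value 0 maps to -1.0 and 255 maps to +1.0.
#include <array>
#include <cstdint>

inline float normalizePixel(std::uint8_t val, float mean, float stdDev)
{
    return (static_cast<float>(val) / 255.0F - mean) / stdDev;
}

inline float normalizeChannel(std::uint8_t val, std::array<float, 3> const &mean,
                              std::array<float, 3> const &stdDev, int channel)
{
    return normalizePixel(val, mean[channel], stdDev[channel]);
}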