/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "multimodalRunner.h"
#include <array>
#include <common/tensor.h>
#include <cuda_fp16.h>
#include <vector>

namespace trt_edgellm
{
namespace rt
{

//! \brief Configuration for InternViT vision encoder
//! \brief Configuration for InternViT vision encoder
//!
//! Zero-initialized fields are expected to be filled from the engine's JSON
//! config (see InternViTRunner::validateAndFillConfig); fields with nonzero
//! defaults are fallbacks used when the config omits them.
struct InternViTConfig
{
    int64_t maxNumBlocks{0};                                  //!< Maximum number of image blocks
    int64_t minNumBlocks{0};                                  //!< Minimum number of image blocks
    int64_t numChannels{0};                                   //!< Number of image channels (typically 3 for RGB)
    int64_t outHiddenSize{0};                                 //!< Output hidden dimension size
    int64_t patchSizeH{0};                                    //!< Patch height in pixels
    int64_t patchSizeW{0};                                    //!< Patch width in pixels
    int64_t blockImageSizeH{0};                               //!< Block image height
    int64_t blockImageSizeW{0};                               //!< Block image width
    int32_t vocabSize{0};                                     //!< Vocabulary size
    int32_t imageTokenId{0};                                  //!< Token ID for image placeholder
    int32_t imgStartTokenId{151665};                          //!< Token ID for <img> tag; default presumably matches the InternVL tokenizer — confirm against model config
    int32_t imgEndTokenId{151666};                            //!< Token ID for </img> tag; default presumably matches the InternVL tokenizer — confirm against model config
    std::array<float, 3> imageMean{{0.485F, 0.456F, 0.406F}}; //!< Image normalization mean values (RGB); defaults are the standard ImageNet constants
    std::array<float, 3> imageStd{{0.229F, 0.224F, 0.225F}};  //!< Image normalization standard deviation values (RGB); defaults are the standard ImageNet constants
    int64_t minImageTokensPerImage{0}; //!< Minimum image tokens generated by each image. Used for resizing.
    int64_t maxImageTokensPerImage{0}; //!< Maximum image tokens generated by each image. Used for resizing.
};

//! \brief Runner for InternViT vision encoder
//!
//! This class handles the preprocessing and inference of InternViT vision encoder,
//! which is part of the InternVL multimodal model.
//! \brief Runner for InternViT vision encoder
//!
//! This class handles the preprocessing and inference of InternViT vision encoder,
//! which is part of the InternVL multimodal model. It owns the device-side
//! buffers used for image normalization and the vision-encoder input tensor.
class InternViTRunner : public MultimodalRunner
{
public:
    //! \brief Constructor for InternViTRunner
    //! \param[in] engineDir Directory containing the TensorRT engine files
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if engineDir does not contain valid engine files
    //! \throws std::runtime_error if buffer allocation fails
    //! \throws std::runtime_error if a CUDA error occurs
    InternViTRunner(std::string const& engineDir, cudaStream_t stream);

    ~InternViTRunner() noexcept = default;

    //! \brief Preprocess multimodal input including images and text
    //! \param[in] request LLM generation request containing images and text
    //! \param[in,out] batchedInputIds Batched input token IDs after preprocessing
    //! \param[in] tokenizer Tokenizer for text processing
    //! \param[in,out] ropeRotaryCosSinDevice RoPE rotary position encoding cache (unused by this model)
    //! \param[in] stream CUDA stream for execution
    //! \param[in] imageOnly If true, only the image portion of the request is preprocessed (text tokenization is skipped)
    //! \return True if preprocessing succeeded, false otherwise
    bool preprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchedInputIds,
        tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream,
        bool imageOnly = false) noexcept override;

    //! \brief Run inference on the vision encoder
    //! \param[in] stream CUDA stream for execution
    //! \return True if inference succeeded, false otherwise
    bool infer(cudaStream_t stream) noexcept override;

    //! \brief Validate and load configuration from JSON file
    //! \param[in] engineDir Path to engine directory
    //! \return True if configuration is valid and loaded successfully, false otherwise
    bool validateAndFillConfig(std::string const& engineDir) override;

    //! \brief Allocate buffers for inference
    //! \param[in] stream CUDA stream used for any asynchronous allocation/initialization
    //! \return True if allocation succeeded, false otherwise
    //! \throws std::runtime_error if a CUDA error occurs
    bool allocateBuffer(cudaStream_t stream) override;

private:
    //! \brief Preprocess text portion of the request
    //! \param[in] request LLM generation request
    //! \param[out] batchInputIds Batch of input token IDs
    //! \param[in] numImages Number of images per request
    //! \param[in] imageTokenLengths Token lengths for each image
    //! \param[in] tokenizer Tokenizer for text processing
    void textPreprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchInputIds,
        std::vector<int64_t> const& numImages, std::vector<int64_t> const& imageTokenLengths,
        trt_edgellm::tokenizer::Tokenizer const* tokenizer);

    //! \brief Format and process a single image patch
    //! \param[in] image Input image data
    //! \param[out] imageTokenLengths Token lengths for each image
    //! \param[out] numImages Number of images processed
    //! \param[out] totalNumBlocks Total number of image blocks
    //! \param[in] isThumbnail Whether the image is a thumbnail
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if image size is unexpected, or number of blocks is excessive
    void formatPatch(rt::imageUtils::ImageData const& image, std::vector<int64_t>& imageTokenLengths,
        int64_t& numImages, int64_t& totalNumBlocks, bool isThumbnail, cudaStream_t stream);

    //! \brief Preprocess all images in the request
    //! \param[in] request LLM generation request containing images
    //! \param[out] imageTokenLengths Token lengths for each image
    //! \param[out] numImages Number of images per request
    //! \param[in] doResize Whether to resize images
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if image size is unexpected, or number of blocks is excessive
    void imagePreprocess(rt::LLMGenerationRequest const& request, std::vector<int64_t>& imageTokenLengths,
        std::vector<int64_t>& numImages, bool doResize, cudaStream_t stream);

    InternViTConfig mConfig;                         //!< InternViT configuration loaded by validateAndFillConfig()
    rt::Tensor mVitInput{};                          //!< Vision encoder input tensor (fed to the TensorRT engine)
    rt::Tensor mImageMean{};                         //!< Per-channel image mean tensor used for normalization
    rt::Tensor mImageStd{};                          //!< Per-channel image standard deviation tensor used for normalization
    rt::Tensor mImageDevice{};                       //!< Temporary device-side image buffer for preprocessing
    rt::Tensor mNormalizedImageDevice{};             //!< Temporary device-side buffer holding the normalized image
    rt::imageUtils::ImageData mResizedImageHost{};   //!< Pre-allocated host buffer for image resizing
    rt::imageUtils::ImageData mThumbnailImageHost{}; //!< Pre-allocated host buffer for thumbnail generation
};

} // namespace rt
} // namespace trt_edgellm
