/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "multimodalRunner.h"
#include <array>
#include <common/tensor.h>
#include <cuda_fp16.h>
#include <vector>

namespace trt_edgellm
{
namespace rt
{

//! \brief Configuration for InternViT vision encoder
//! \brief Configuration for InternViT vision encoder
//!
//! Zero-initialized fields are expected to be filled from the engine's JSON
//! config (see InternViTRunner::validateAndFillConfig); fields with nonzero
//! defaults are fallbacks used when the config omits them.
struct InternViTConfig
{
    int64_t maxNumBlocks{0};                                  //!< Maximum number of image blocks
    int64_t minNumBlocks{0};                                  //!< Minimum number of image blocks
    int64_t numChannels{0};                                   //!< Number of image channels (typically 3 for RGB)
    int64_t outHiddenSize{0};                                 //!< Output hidden dimension size
    int64_t patchSizeH{0};                                    //!< Patch height in pixels
    int64_t patchSizeW{0};                                    //!< Patch width in pixels
    int64_t blockImageSizeH{0};                               //!< Block image height
    int64_t blockImageSizeW{0};                               //!< Block image width
    int32_t vocabSize{0};                                     //!< Vocabulary size
    int32_t imageTokenId{0};                                  //!< Token ID for image placeholder
    int32_t imgStartTokenId{151665};                          //!< Token ID for <img> tag; default presumably matches the InternVL tokenizer — confirm against model config
    int32_t imgEndTokenId{151666};                            //!< Token ID for </img> tag; default presumably matches the InternVL tokenizer — confirm against model config
    std::array<float, 3> imageMean{{0.485F, 0.456F, 0.406F}}; //!< Image normalization mean values (RGB); defaults are the standard ImageNet constants
    std::array<float, 3> imageStd{{0.229F, 0.224F, 0.225F}};  //!< Image normalization standard deviation values (RGB); defaults are the standard ImageNet constants
    int64_t minImageTokensPerImage{0}; //!< Minimum image tokens generated by each image. Used for resizing.
    int64_t maxImageTokensPerImage{0}; //!< Maximum image tokens generated by each image. Used for resizing.
};

//! \brief Runner for InternViT vision encoder
//!
//! This class handles the preprocessing and inference of InternViT vision encoder,
//! which is part of the InternVL multimodal model.
//! \brief Runner for InternViT vision encoder
//!
//! This class handles the preprocessing and inference of InternViT vision encoder,
//! which is part of the InternVL multimodal model. It owns the device-side
//! buffers used for image normalization and the vision-encoder input tensor.
class InternViTRunner : public MultimodalRunner
{
public:
    //! \brief Constructor for InternViTRunner
    //! \param[in] engineDir Directory containing the TensorRT engine files
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if engineDir does not contain valid engine files
    //! \throws std::runtime_error if buffer allocation fails
    //! \throws std::runtime_error if a CUDA error occurs
    InternViTRunner(std::string const& engineDir, cudaStream_t stream);

    ~InternViTRunner() noexcept = default;

    //! \brief Preprocess multimodal input including images and text
    //! \param[in] request LLM generation request containing images and text
    //! \param[in,out] batchedInputIds Batched input token IDs after preprocessing
    //! \param[in] tokenizer Tokenizer for text processing
    //! \param[in,out] ropeRotaryCosSinDevice RoPE rotary position encoding cache (unused by this model)
    //! \param[in] stream CUDA stream for execution
    //! \param[in] imageOnly If true, only the image portion of the request is preprocessed (text tokenization is skipped)
    //! \return True if preprocessing succeeded, false otherwise
    bool preprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchedInputIds,
        tokenizer::Tokenizer const* tokenizer, [[maybe_unused]] rt::Tensor& ropeRotaryCosSinDevice, cudaStream_t stream,
        bool imageOnly = false) noexcept override;

    //! \brief Run inference on the vision encoder
    //! \param[in] stream CUDA stream for execution
    //! \return True if inference succeeded, false otherwise
    bool infer(cudaStream_t stream) noexcept override;

    //! \brief Validate and load configuration from JSON file
    //! \param[in] engineDir Path to engine directory
    //! \return True if configuration is valid and loaded successfully, false otherwise
    bool validateAndFillConfig(std::string const& engineDir) override;

    //! \brief Allocate buffers for inference
    //! \param[in] stream CUDA stream used for any asynchronous allocation/initialization
    //! \return True if allocation succeeded, false otherwise
    //! \throws std::runtime_error if a CUDA error occurs
    bool allocateBuffer(cudaStream_t stream) override;

private:
    //! \brief Preprocess text portion of the request
    //! \param[in] request LLM generation request
    //! \param[out] batchInputIds Batch of input token IDs
    //! \param[in] numImages Number of images per request
    //! \param[in] imageTokenLengths Token lengths for each image
    //! \param[in] tokenizer Tokenizer for text processing
    void textPreprocess(rt::LLMGenerationRequest const& request, std::vector<std::vector<int32_t>>& batchInputIds,
        std::vector<int64_t> const& numImages, std::vector<int64_t> const& imageTokenLengths,
        trt_edgellm::tokenizer::Tokenizer const* tokenizer);

    //! \brief Format and process a single image patch
    //! \param[in] image Input image data
    //! \param[out] imageTokenLengths Token lengths for each image
    //! \param[out] numImages Number of images processed
    //! \param[out] totalNumBlocks Total number of image blocks
    //! \param[in] isThumbnail Whether the image is a thumbnail
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if image size is unexpected, or number of blocks is excessive
    void formatPatch(rt::imageUtils::ImageData const& image, std::vector<int64_t>& imageTokenLengths,
        int64_t& numImages, int64_t& totalNumBlocks, bool isThumbnail, cudaStream_t stream);

    //! \brief Preprocess all images in the request
    //! \param[in] request LLM generation request containing images
    //! \param[out] imageTokenLengths Token lengths for each image
    //! \param[out] numImages Number of images per request
    //! \param[in] doResize Whether to resize images
    //! \param[in] stream CUDA stream for execution
    //! \throws std::runtime_error if image size is unexpected, or number of blocks is excessive
    void imagePreprocess(rt::LLMGenerationRequest const& request, std::vector<int64_t>& imageTokenLengths,
        std::vector<int64_t>& numImages, bool doResize, cudaStream_t stream);

    InternViTConfig mConfig;                         //!< InternViT configuration loaded by validateAndFillConfig()
    rt::Tensor mVitInput{};                          //!< Vision encoder input tensor (fed to the TensorRT engine)
    rt::Tensor mImageMean{};                         //!< Per-channel image mean tensor used for normalization
    rt::Tensor mImageStd{};                          //!< Per-channel image standard deviation tensor used for normalization
    rt::Tensor mImageDevice{};                       //!< Temporary device-side image buffer for preprocessing
    rt::Tensor mNormalizedImageDevice{};             //!< Temporary device-side buffer holding the normalized image
    rt::imageUtils::ImageData mResizedImageHost{};   //!< Pre-allocated host buffer for image resizing
    rt::imageUtils::ImageData mThumbnailImageHost{}; //!< Pre-allocated host buffer for thumbnail generation
};

} // namespace rt
} // namespace trt_edgellm
