LLM Runtime Utils#
-
struct Message#
Message with role and contents.
Public Members
-
std::vector<MessageContent> contents#
Contents of the message.
-
struct MessageContent#
-
std::vector<MessageContent> contents#
-
struct MessageContent
Public Members
-
std::string type
Content type (text, image)
-
std::string content
Text content when the content type is text. Image data is stored in the corresponding imageBuffers.
-
std::string type
-
struct LLMGenerationRequest#
LLM Generation Request structure.
Public Members
-
std::vector<Request> requests#
Vector of requests for a batch.
-
mutable std::vector<FormattedRequest> formattedRequests#
Formatted requests (mutable to allow runtime modification)
-
float temperature#
Temperature parameter for sampling.
-
float topP#
Top-p (nucleus) sampling parameter.
-
int64_t topK#
Top-k sampling parameter.
-
int64_t maxGenerateLength#
Maximum length, in tokens, of the generated output.
-
std::string loraWeightsName = {""}#
Name of the LoRA weights. Defaults to an empty string, meaning no LoRA weights are used.
-
bool saveSystemPromptKVCache{false}#
Whether to save the system prompt KV cache of this request so that later requests can reuse it.
-
bool applyChatTemplate = {true}#
Whether to apply chat template formatting. If false, raw messages will be concatenated without special tokens.
-
bool addGenerationPrompt = {true}#
Whether to add the generation prompt (e.g., assistant header) at the end. Only effective when applyChatTemplate=true.
-
bool enableThinking = {false}#
Whether to enable thinking mode for models that support it. Default is disabled.
-
std::vector<Request> requests#
-
struct LLMGenerationResponse#
LLM Generation Response structure.
-
struct LongRopeParams#
Long-Rope specific parameters.
-
struct RopeConfig#
RoPE configuration structure with optional Long-Rope parameters.
Contains common RoPE fields and (optionally) Long-Rope specific parameters when type==kLongRope.
Public Members
-
RopeType type = {RopeType::kDefault}#
Type of RoPE to use.
-
float rotaryScale = {1.0F}#
Scaling factor for rotary embeddings.
-
float rotaryTheta = {100000.0F}#
Base frequency for rotary embeddings.
-
int32_t maxPositionEmbeddings = {32768}#
Maximum position embeddings supported.
-
std::optional<LongRopeParams> longRope = {}#
Long-Rope specific parameters.
-
RopeType type = {RopeType::kDefault}#
- std::string trt_edgellm::rt::formatRopeConfig(
- RopeConfig const &config
Format rope configuration into string.
- RopeConfig trt_edgellm::rt::collectRopeConfig(
- nlohmann::json const &config
Collect rope configuration from the model config.
Parses the common RoPE fields as well as LongRoPE-specific parameters when the model requests the longrope variant. Default values are used if certain fields are not specified in the model config.
- Parameters:
config – [JSON] The model config file supplied with the model
- Returns:
The parsed rope configuration
- bool trt_edgellm::rt::initializeRopeCosSinCache(
- rt::Tensor &cosSinCache,
- RopeConfig const &config,
- nlohmann::json const &modelConfig,
- cudaStream_t stream
Initialize the rope cos/sin cache tensor for persistent types of RoPE (default, longrope).
- Parameters:
cosSinCache – [GPU] The tensor to store the rope cos/sin cache
config – [RopeConfig] The basic rope configuration
modelConfig – [JSON] Model config json that can supply additional information for the rope initialization
stream – [CUDA stream] The stream to execute the initialization
- Returns:
True if the initialization is successful, false otherwise
- bool trt_edgellm::rt::initializeLongRopeCosSinCache(
- rt::Tensor &shortCosSinCache,
- rt::Tensor &longCosSinCache,
- RopeConfig const &config,
- nlohmann::json const &modelConfig,
- cudaStream_t stream
Initialize the short and long rope cos/sin cache tensors for the long rope type.
- Parameters:
shortCosSinCache – [GPU] The tensor to store the short rope cos/sin cache
longCosSinCache – [GPU] The tensor to store the long rope cos/sin cache
config – [RopeConfig] The rope configuration
modelConfig – [JSON] Model config json that can supply additional information for the rope initialization
stream – [CUDA stream] The stream to execute the initialization
- Returns:
True if the initialization is successful, false otherwise
-
template<typename T>
void trt_edgellm::rt::compactVector( - std::vector<int32_t> const &batchMapping,
- std::vector<T> &vec
Compact CPU vector by removing evicted batches.
This utility function compacts a std::vector by removing elements at evicted batch indices. Used for batch eviction to remove finished sequences from CPU context vectors.
- Template Parameters:
T – Element type
- Parameters:
batchMapping – [oldActiveBatch] CPU vector (const input), where batchMapping[i] = newBatchIdx, or -1 if the batch is evicted
vec – Vector to compact (output, modified in-place)
- std::vector<int32_t> trt_edgellm::rt::buildBatchMapping(
- std::vector<int8_t> const &finishedStates
Build batch mapping from finished states.
Creates a mapping vector that maps old batch indices to new batch indices. Finished batches are marked with -1 for eviction.
- Parameters:
finishedStates – [oldActiveBatch] CPU vector indicating which batches are finished (0=not finished, 1=finished)
- Returns:
Vector mapping old batch indices to new indices (-1 for evicted batches)