LLM Runtime Utils#

struct Message#

Message with role and contents.

Public Members

std::string role#

Message role (system, user, assistant)

std::vector<MessageContent> contents#

Contents of the message.

struct MessageContent#

Public Members

std::string type#

Content type (text, image)

std::string content#

Text content when content type is text. Image data will be stored in corresponding imageBuffers.

struct MessageContent#

Public Members

std::string type#

Content type (text, image)

std::string content#

Text content when content type is text. Image data will be stored in corresponding imageBuffers.

struct LLMGenerationRequest#

LLM Generation Request structure.

Public Members

std::vector<Request> requests#

Vector of requests for a batch.

mutable std::vector<FormattedRequest> formattedRequests#

Formatted requests (mutable to allow runtime modification)

float temperature#

Temperature parameter for sampling.

float topP#

Top-p (nucleus) sampling parameter.

int64_t topK#

Top-k sampling parameter.

int64_t maxGenerateLength#

Max length of the generated tokens.

std::string loraWeightsName = {""}#

Name of the LoRA weights. Defaults to an empty string for no LoRA weights.

bool saveSystemPromptKVCache{false}#

Whether to save system prompt KV cache of this request to be used by later requests.

bool applyChatTemplate = {true}#

Whether to apply chat template formatting. If false, raw messages will be concatenated without special tokens.

bool addGenerationPrompt = {true}#

Whether to add generation prompt (e.g., assistant header) at the end. Only effective when applyChatTemplate=true.

bool enableThinking = {false}#

Whether to enable thinking mode for models that support it. Default is disabled.

struct LLMGenerationResponse#

LLM Generation Response structure.

Public Members

std::vector<std::vector<int32_t>> outputIds#

Generated token IDs for each request in the batch.

std::vector<std::string> outputTexts#

Generated text strings for each request in the batch.

struct LongRopeParams#

Long-Rope specific parameters.

Public Members

int32_t originalMaxPositionEmbeddings = {-1}#

Original maximum position embeddings from training.

std::vector<float> longFactor#

Long factor array for each rotary dimension.

std::vector<float> shortFactor#

Short factor array for each rotary dimension.

struct RopeConfig#

RoPE configuration structure with optional Long-Rope parameters.

Contains common RoPE fields and (optionally) Long-Rope specific parameters when type==kLongRope.

Public Members

RopeType type = {RopeType::kDefault}#

Type of RoPE to use.

float rotaryScale = {1.0F}#

Scaling factor for rotary embeddings.

float rotaryTheta = {100000.0F}#

Base frequency for rotary embeddings.

int32_t maxPositionEmbeddings = {32768}#

Maximum position embeddings supported.

std::optional<LongRopeParams> longRope = {}#

Long-Rope specific parameters.

std::string trt_edgellm::rt::formatRopeConfig(
RopeConfig const &config
)#

Format rope configuration into string.

RopeConfig trt_edgellm::rt::collectRopeConfig(
nlohmann::json const &config
)#

Collect rope configuration from the model config.

Parses the common RoPE fields as well as LongRoPE-specific parameters when the model requests the longrope variant. Default values are used if certain fields are not specified in the model config.

Parameters:

config – [JSON] The model config file supplied with the model

Returns:

The parsed rope configuration

bool trt_edgellm::rt::initializeRopeCosSinCache(
rt::Tensor &cosSinCache,
RopeConfig const &config,
nlohmann::json const &modelConfig,
cudaStream_t stream
)#

Initialize the rope cos/sin cache tensor for persistent types of RoPE (default, longrope).

Parameters:
  • cosSinCache – [GPU] The tensor to store the rope cos/sin cache

  • config – [RopeConfig] The basic rope configuration

  • modelConfig – [JSON] Model config json that can supply additional information for the rope initialization

  • stream – [CUDA stream] The stream to execute the initialization

Returns:

True if the initialization is successful, false otherwise

bool trt_edgellm::rt::initializeLongRopeCosSinCache(
rt::Tensor &shortCosSinCache,
rt::Tensor &longCosSinCache,
RopeConfig const &config,
nlohmann::json const &modelConfig,
cudaStream_t stream
)#

Initialize the rope cos/sin cache tensor for long rope type.

Parameters:
  • shortCosSinCache – [GPU] The tensor to store the short rope cos/sin cache

  • longCosSinCache – [GPU] The tensor to store the long rope cos/sin cache

  • config – [RopeConfig] The rope configuration

  • modelConfig – [JSON] Model config json that can supply additional information for the rope initialization

  • stream – [CUDA stream] The stream to execute the initialization

Returns:

True if the initialization is successful, false otherwise

template<typename T>
void trt_edgellm::rt::compactVector(
std::vector<int32_t> const &batchMapping,
std::vector<T> &vec
)#

Compact CPU vector by removing evicted batches.

This utility function compacts a std::vector by removing elements at evicted batch indices. Used for batch eviction to remove finished sequences from CPU context vectors.

Template Parameters:

T – Element type

Parameters:
  • batchMapping – [oldActiveBatch] CPU vector (const input), mapping[i] = newBatchIdx or -1 (evict)

  • vec – Vector to compact (output, modified in-place)

std::vector<int32_t> trt_edgellm::rt::buildBatchMapping(
std::vector<int8_t> const &finishedStates
)#

Build batch mapping from finished states.

Creates a mapping vector that maps old batch indices to new batch indices. Finished batches are marked with -1 for eviction.

Parameters:

finishedStates – [oldActiveBatch] CPU vector indicating which batches are finished (0=not finished, 1=finished)

Returns:

Vector mapping old batch indices to new indices (-1 for evicted batches)