LLM Runtime Utils#

struct Message#

Message with role and contents.

Public Members

std::string role#

Message role (system, user, assistant)

std::vector<MessageContent> contents#

Contents of the message.

struct MessageContent#

Public Members

std::string type#

Content type (text, image)

std::string content#

Text content when the content type is text; for image content, the image data is stored in the corresponding imageBuffers instead.


struct LLMGenerationRequest#

LLM Generation Request structure.

Public Members

std::vector<Request> requests#

Vector of requests for a batch.

mutable std::vector<FormattedRequest> formattedRequests#

Formatted requests (mutable to allow runtime modification)

float temperature#

Temperature parameter for sampling.

float topP#

Top-p (nucleus) sampling parameter.

int64_t topK#

Top-k sampling parameter.

int64_t maxGenerateLength#

Max length of the generated tokens.

std::string loraWeightsName = {""}#

Name of the LoRA weights. Default to empty string for no LoRA weights.

bool saveSystemPromptKVCache = {false}#
bool applyChatTemplate = {true}#
bool addGenerationPrompt = {true}#
bool enableThinking = {false}#
bool disableSpecDecode = {false}#
std::vector<std::shared_ptr<StreamChannel>> streamChannels#

Per-slot streaming channels. Size 0 disables streaming globally. When non-empty the size must equal requests.size() and individual entries may be null to opt out on a per-slot basis. Channels must not already be finished or concurrently attached to another in-flight request.

struct LLMGenerationResponse#

LLM Generation Response structure.

Public Members

std::vector<std::vector<int32_t>> outputIds#

Generated token IDs for each request in the batch.

std::vector<std::string> outputTexts#

Generated text strings for each request in the batch.

struct LongRopeParams#

Long-Rope specific parameters.

Public Members

int32_t originalMaxPositionEmbeddings = {-1}#

Original maximum position embeddings from training.

std::vector<float> longFactor#

Long factor array for each rotary dimension.

std::vector<float> shortFactor#

Short factor array for each rotary dimension.

struct RopeConfig#

RoPE configuration structure with optional Long-Rope parameters.

Contains common RoPE fields and (optionally) Long-Rope specific parameters when type==kLongRope.

Public Members

RopeType type = {RopeType::kDefault}#

Type of RoPE to use.

float rotaryScale = {1.0F}#

Scaling factor for rotary embeddings.

float rotaryTheta = {100000.0F}#

Base frequency for rotary embeddings.

int32_t maxPositionEmbeddings = {32768}#

Maximum position embeddings supported.

std::optional<LongRopeParams> longRope = {}#

Long-Rope specific parameters.

struct EmbeddingData#

Embedding data - supports both FP16 and FP8 formats.

The embedding table datatype determines the format:

  • FP16: table is FP16, tableScalingFactor is empty

  • FP8: table is FP8 (E4M3), tableScalingFactor contains FP32 per-group scales

The kernel functions automatically dispatch based on table.getDataType().

Public Functions

inline rt::OptionalInputTensor scalesAsOptional() const#

Returns scales as OptionalInputTensor (std::nullopt when FP16, reference when FP8)

Public Members

rt::Tensor table#

Embedding table [vocabSize, hiddenSize] (FP16 or FP8)

rt::Tensor tableScalingFactor#

FP32 per-group scales [vocabSize, hiddenSize/128] (only if FP8)

std::string trt_edgellm::rt::formatRopeConfig(
RopeConfig const &config
)#

Format rope configuration into string.

RopeConfig trt_edgellm::rt::collectRopeConfig(
nlohmann::json const &config
)#

Collect rope configuration from the model config.

Parses the common RoPE fields as well as LongRoPE-specific parameters when the model requests the longrope variant. Default values are used if certain fields are not specified in the model config.

Parameters:

config – [JSON] The model config file supplied with the model

Throws:

nlohmann::json::type_error – if JSON value types don’t match expected types

Returns:

The parsed rope configuration

bool trt_edgellm::rt::initializeRopeCosSinCache(
rt::Tensor &cosSinCache,
RopeConfig const &config,
cudaStream_t stream
) noexcept#

Initialize the rope cos/sin cache tensor for persistent types of RoPE (default, longrope)

Parameters:
  • cosSinCache – [GPU] The tensor to store the rope cos/sin cache

  • config – [RopeConfig] The basic rope configuration

  • stream – [CUDA stream] The stream to execute the initialization

Returns:

True if the initialization is successful, false otherwise

bool trt_edgellm::rt::initializeNopeCosSinCache(
rt::Tensor &cosSinCache,
cudaStream_t stream
) noexcept#

Initialize an identity cos/sin cache for models without positional encoding (NoPE)

Fills the first half of each position’s rotaryDim with 1.0 (cos) and the second half with 0.0 (sin), making the RoPE kernel a pass-through.

Parameters:
  • cosSinCache – [GPU] The tensor to fill, shape [1, maxLength, rotaryDim]

  • stream – [CUDA stream] The stream to execute the copy

Returns:

True on success

bool trt_edgellm::rt::initializeLongRopeCosSinCache(
rt::Tensor &shortCosSinCache,
rt::Tensor &longCosSinCache,
RopeConfig const &config,
cudaStream_t stream
)#

Initialize the rope cos/sin cache tensor for long rope type.

Parameters:
  • shortCosSinCache – [GPU] The tensor to store the short rope cos/sin cache

  • longCosSinCache – [GPU] The tensor to store the long rope cos/sin cache

  • config – [RopeConfig] The rope configuration

  • stream – [CUDA stream] The stream to execute the initialization

Throws:

std::runtime_error – if CUDA operations fail

Returns:

True if the initialization is successful, false otherwise

template<typename T>
void trt_edgellm::rt::compactVector(
std::vector<int32_t> const &batchMapping,
std::vector<T> &vec
)#

Compact CPU vector by removing evicted batches.

This utility function compacts a std::vector by removing elements at evicted batch indices. Used for batch eviction to remove finished sequences from CPU context vectors.

Template Parameters:

T – Element type

Parameters:
  • batchMapping – [oldActiveBatch] CPU vector (const input), mapping[i] = newBatchIdx or -1 (evict)

  • vec – Vector to compact (output, modified in-place)

Throws:

std::invalid_argument – if sizes of input vectors don’t match

std::vector<int32_t> trt_edgellm::rt::buildBatchMapping(
std::vector<int8_t> const &finishedStates
)#

Build batch mapping from finished states.

Creates a mapping vector that maps old batch indices to new batch indices. Finished batches are marked with -1 for eviction.

Parameters:

finishedStates – [oldActiveBatch] CPU vector indicating which batches are finished (0=not finished, 1=finished)

Returns:

Vector mapping old batch indices to new indices (-1 for evicted batches)

EmbeddingData trt_edgellm::rt::loadEmbeddingTable(
std::filesystem::path const &embeddingPath,
cudaStream_t stream
)#

Load embedding table from safetensors file (auto-detects FP16 vs FP8 by dtype)

Loads embedding.safetensors and detects format by checking the “embedding” tensor dtype:

  • FP8: loads “embedding” (FP8) + “embedding_scale” (FP32)

  • FP16: loads “embedding” (FP16)

Parameters:
  • embeddingPath – Path to embedding.safetensors file

  • stream – CUDA stream for async operations

Throws:

std::runtime_error – if file not found, tensors missing, or invalid dtypes

Returns:

EmbeddingData with loaded tensors and format flag

int32_t trt_edgellm::rt::clampMaxGenerateLengthForKVCapacity(
std::vector<int32_t> const &effectivePrefillLengths,
int32_t requestedMaxGenerateLength,
int32_t kvCacheCapacity,
int32_t kvCacheReserveLength
)#

Clamp max generation length against KV-cache capacity across the full active batch.

Uses the smallest remaining KV budget across all active sequences so the shared generation limit cannot overrun any batch item.

Parameters:
  • effectivePrefillLengths – Effective prefill lengths for each active sequence

  • requestedMaxGenerateLength – User-requested max generation length

  • kvCacheCapacity – Total KV-cache capacity available to the runtime

  • kvCacheReserveLength – Extra KV reserve required by the runtime mode

Returns:

Clamped max generation length, never below 0

rt::Tensor trt_edgellm::rt::generateMultimodalIndices(
rt::Tensor const &inputIds,
std::optional<int32_t> audioTokenId,
std::optional<int32_t> imageTokenId,
int32_t vocabSize
)#

Generate multimodal indices for embeddingLookupMultimodal kernel.

Scans input IDs and generates sequential indices for audio/image embeddings. Audio and image indices are tracked by independent counters, each of which runs globally across all batches.

Parameters:
  • inputIds – Input token IDs on CPU [batchSize, seqLen]

  • audioTokenId – Special token ID for audio, or std::nullopt if no audio

  • imageTokenId – Special token ID for image, or std::nullopt if no image

  • vocabSize – Vocabulary size (tokens >= vocabSize are treated as image tokens)

Returns:

multimodalIndices tensor on CPU [batchSize, seqLen]