LLM Runtime Utils#
-
struct Message#
Message with role and contents.
Public Members
-
std::vector<MessageContent> contents#
Contents of the message.
-
struct MessageContent#
-
std::vector<MessageContent> contents#
-
struct MessageContent#
Public Members
-
std::string type
Content type (text, image)
-
std::string content
Text content when content type is text. Image data will be stored in corresponding imageBuffers.
-
std::string type
-
struct LLMGenerationRequest#
LLM Generation Request structure.
Public Members
-
std::vector<Request> requests#
Vector of requests for a batch.
-
mutable std::vector<FormattedRequest> formattedRequests#
Formatted requests (mutable to allow runtime modification)
-
float temperature#
Temperature parameter for sampling.
-
float topP#
Top-p (nucleus) sampling parameter.
-
int64_t topK#
Top-k sampling parameter.
-
int64_t maxGenerateLength#
Max length of the generated tokens.
-
std::string loraWeightsName = {""}#
Name of the LoRA weights. Default to empty string for no LoRA weights.
-
bool saveSystemPromptKVCache = {false}#
-
bool applyChatTemplate = {true}#
-
bool addGenerationPrompt = {true}#
-
bool enableThinking = {false}#
-
bool disableSpecDecode = {false}#
-
std::vector<std::shared_ptr<StreamChannel>> streamChannels#
Per-slot streaming channels. Size 0 disables streaming globally. When non-empty the size must equal
requests.size() and individual entries may be null to opt out on a per-slot basis. Channels must not already be finished or concurrently attached to another in-flight request.
-
std::vector<Request> requests#
-
struct LLMGenerationResponse#
LLM Generation Response structure.
-
struct LongRopeParams#
Long-Rope specific parameters.
-
struct RopeConfig#
RoPE configuration structure with optional Long-Rope parameters.
Contains common RoPE fields and (optionally) Long-Rope specific parameters when type==kLongRope.
Public Members
-
RopeType type = {RopeType::kDefault}#
Type of RoPE to use.
-
float rotaryScale = {1.0F}#
Scaling factor for rotary embeddings.
-
float rotaryTheta = {100000.0F}#
Base frequency for rotary embeddings.
-
int32_t maxPositionEmbeddings = {32768}#
Maximum position embeddings supported.
-
std::optional<LongRopeParams> longRope = {}#
Long-Rope specific parameters.
-
RopeType type = {RopeType::kDefault}#
-
struct EmbeddingData#
Embedding data - supports both FP16 and FP8 formats.
The embedding table datatype determines the format:
FP16: table is FP16, tableScalingFactor is empty
FP8: table is FP8 (E4M3), tableScalingFactor contains FP32 per-group scales
The kernel functions automatically dispatch based on table.getDataType().
Public Functions
- std::string trt_edgellm::rt::formatRopeConfig(
- RopeConfig const &config
Format rope configuration into string.
- RopeConfig trt_edgellm::rt::collectRopeConfig(
- nlohmann::json const &config
Collect rope configuration from the model config.
Parses the common RoPE fields as well as LongRoPE-specific parameters when the model requests the longrope variant. Default values are used if certain fields are not specified in the model config.
- Parameters:
config – [JSON] The model config file supplied with the model
- Throws:
nlohmann::json::type_error – if JSON value types don’t match expected types
- Returns:
The parsed rope configuration
- bool trt_edgellm::rt::initializeRopeCosSinCache(
- rt::Tensor &cosSinCache,
- RopeConfig const &config,
- cudaStream_t stream
Initialize the rope cos/sin cache tensor for persistent type of RoPE (default, longrope)
- Parameters:
cosSinCache – [GPU] The tensor to store the rope cos/sin cache
config – [RopeConfig] The basic rope configuration
stream – [CUDA stream] The stream to execute the initialization
- Returns:
True if the initialization is successful, false otherwise
- bool trt_edgellm::rt::initializeNopeCosSinCache(
- rt::Tensor &cosSinCache,
- cudaStream_t stream
- ) noexcept#
Initialize an identity cos/sin cache for models without positional encoding (NoPE)
Fills the first half of each position’s rotaryDim with 1.0 (cos) and the second half with 0.0 (sin), making the RoPE kernel a pass-through.
- Parameters:
cosSinCache – [GPU] The tensor to fill, shape [1, maxLength, rotaryDim]
stream – [CUDA stream] The stream to execute the copy
- Returns:
True on success
- bool trt_edgellm::rt::initializeLongRopeCosSinCache(
- rt::Tensor &shortCosSinCache,
- rt::Tensor &longCosSinCache,
- RopeConfig const &config,
- cudaStream_t stream
Initialize the rope cos/sin cache tensor for long rope type.
- Parameters:
shortCosSinCache – [GPU] The tensor to store the short rope cos/sin cache
longCosSinCache – [GPU] The tensor to store the long rope cos/sin cache
config – [RopeConfig] The rope configuration
stream – [CUDA stream] The stream to execute the initialization
- Throws:
std::runtime_error – if CUDA operations fail
- Returns:
True if the initialization is successful, false otherwise
-
template<typename T>
void trt_edgellm::rt::compactVector(
- std::vector<int32_t> const &batchMapping,
- std::vector<T> &vec
Compact CPU vector by removing evicted batches.
This utility function compacts a std::vector by removing elements at evicted batch indices. Used for batch eviction to remove finished sequences from CPU context vectors.
- Template Parameters:
T – Element type
- Parameters:
batchMapping – [oldActiveBatch] CPU vector (const input), mapping[i] = newBatchIdx or -1 (evict)
vec – Vector to compact (output, modified in-place)
- Throws:
std::invalid_argument – if sizes of input vectors don’t match
- std::vector<int32_t> trt_edgellm::rt::buildBatchMapping(
- std::vector<int8_t> const &finishedStates
Build batch mapping from finished states.
Creates a mapping vector that maps old batch indices to new batch indices. Finished batches are marked with -1 for eviction.
- Parameters:
finishedStates – [oldActiveBatch] CPU vector indicating which batches are finished (0=not finished, 1=finished)
- Returns:
Vector mapping old batch indices to new indices (-1 for evicted batches)
- EmbeddingData trt_edgellm::rt::loadEmbeddingTable(
- std::filesystem::path const &embeddingPath,
- cudaStream_t stream
Load embedding table from safetensors file (auto-detects FP16 vs FP8 by dtype)
Loads embedding.safetensors and detects format by checking the “embedding” tensor dtype:
FP8: loads “embedding” (FP8) + “embedding_scale” (FP32)
FP16: loads “embedding” (FP16)
- Parameters:
embeddingPath – Path to embedding.safetensors file
stream – CUDA stream for async operations
- Throws:
std::runtime_error – if file not found, tensors missing, or invalid dtypes
- Returns:
EmbeddingData with loaded tensors and format flag
- int32_t trt_edgellm::rt::clampMaxGenerateLengthForKVCapacity(
- std::vector<int32_t> const &effectivePrefillLengths,
- int32_t requestedMaxGenerateLength,
- int32_t kvCacheCapacity,
- int32_t kvCacheReserveLength
Clamp max generation length against KV-cache capacity across the full active batch.
Uses the smallest remaining KV budget across all active sequences so the shared generation limit cannot overrun any batch item.
- Parameters:
effectivePrefillLengths – Effective prefill lengths for each active sequence
requestedMaxGenerateLength – User-requested max generation length
kvCacheCapacity – Total KV-cache capacity available to the runtime
kvCacheReserveLength – Extra KV reserve required by the runtime mode
- Returns:
Clamped max generation length, never below 0
- rt::Tensor trt_edgellm::rt::generateMultimodalIndices(
- rt::Tensor const &inputIds,
- std::optional<int32_t> audioTokenId,
- std::optional<int32_t> imageTokenId,
- int32_t vocabSize
Generate multimodal indices for embeddingLookupMultimodal kernel.
Scans input IDs and generates sequential indices for audio/image embeddings. Audio and image indices are tracked independently, both globally across batches.
- Parameters:
inputIds – Input token IDs on CPU [batchSize, seqLen]
audioTokenId – Special token ID for audio, or std::nullopt if no audio
imageTokenId – Special token ID for image, or std::nullopt if no image
vocabSize – Vocabulary size (tokens >= vocabSize are treated as image tokens)
- Returns:
multimodalIndices tensor on CPU [batchSize, seqLen]