# LLM Inference Runtime

## Architecture
The LLM Inference Runtime provides standard autoregressive generation for both text-only and multimodal (VLM) inference using a single LLMEngineRunner.

```mermaid
%%{init: {'theme':'neutral', 'themeVariables': {'primaryColor':'#76B900','primaryTextColor':'#fff','primaryBorderColor':'#5a8f00','lineColor':'#666','edgeLabelBackground':'#ffffff','labelTextColor':'#000','clusterBkg':'#ffffff','clusterBorder':'#999'}}}%%
graph TB
CLIENT1[Client Application]
STANDARD_RT[LLM Inference Runtime]
LLM_RUNNER1[LLM Engine Runner]
TOKENIZER1[Tokenizer]
MULTIMODAL1[Multimodal Runner]
KV_CACHE1[Linear KV Cache]
SAMPLING1[Sampling Kernels]
STANDARD_ENGINE[TRT Engine]
%% Connections
CLIENT1 -->|handleRequest| STANDARD_RT
STANDARD_RT -->|owns & manages| LLM_RUNNER1
STANDARD_RT -->|owns optional| MULTIMODAL1
STANDARD_RT -->|owns| TOKENIZER1
STANDARD_RT -->|calls| SAMPLING1
LLM_RUNNER1 -->|owns & manages| KV_CACHE1
LLM_RUNNER1 -->|executes| STANDARD_ENGINE
MULTIMODAL1 -->|provides embeddings| LLM_RUNNER1
TOKENIZER1 -->|encode/decode| LLM_RUNNER1
%% Styling
classDef nvNode fill:#76B900,stroke:#5a8f00,stroke-width:1px,color:#fff
classDef greyNode fill:#f5f5f5,stroke:#999,stroke-width:1px,color:#333
classDef darkNode fill:#ffffff,stroke:#999,stroke-width:1px,color:#333
classDef inputNode fill:#f5f5f5,stroke:#999,stroke-width:1px,color:#333
class CLIENT1 inputNode
class STANDARD_RT,LLM_RUNNER1,TOKENIZER1,MULTIMODAL1,KV_CACHE1,SAMPLING1 nvNode
class STANDARD_ENGINE greyNode
```
### Key Components

| Component | Description |
|---|---|
| LLM Engine Runner | Executes TensorRT engines and manages dual-phase inference. Core engine execution component owned by the LLM Inference Runtime. |
| Tokenizer | HuggingFace-compatible text tokenization system. Converts between text and token IDs using Byte-Pair Encoding (BPE). The LLM Inference Runtime owns its own tokenizer instance. Supports various model vocabularies (GPT, Llama, Qwen) with configurable special tokens and preprocessing steps. |
| Multimodal Runner | Vision processing for multimodal models (VLMs). Processes image inputs through Vision Transformer models and generates vision embeddings. Supports Qwen-VL and InternVL architectures with dynamic image token generation. Integrates vision embeddings with text tokens for multimodal inference. |
| Linear KV Cache | Attention key-value cache management. The LLM Engine Runner maintains its own Linear KV Cache instance. Stores attention key-value pairs across inference steps for efficient autoregressive generation. Uses a linear memory layout optimized for GPU access, with support for batched processing and variable sequence lengths (a conceptual layout sketch follows this table). |
| Sampling Kernels | Token generation from model logits. Converts model output logits into probability distributions and samples the next token using configurable strategies (greedy, top-k, top-p, temperature). Called directly by the LLM Inference Runtime (not by the engine runner) after engine execution produces logits. Operates on the GPU for efficient batch processing. |
| TRT Engine | Optimized TensorRT inference engine compiled from ONNX models. The LLM Inference Runtime uses a single engine, loaded and executed by the LLM Engine Runner. Provides high-performance inference through TensorRT optimizations including kernel fusion, precision calibration, and memory optimization (an engine-loading sketch follows this table). |
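
The "linear" layout described for the KV cache can be pictured as one contiguous buffer per key/value tensor, pre-allocated for the maximum sequence length and indexed by layer, batch slot, head, and position, so each decode step only writes into the next slot. The struct below is a minimal host-side sketch of such an indexing scheme; the field names, dimension order, and `float` storage are illustrative assumptions, not the runtime's actual layout.

```cpp
#include <cstddef>
#include <vector>

// Conceptual linear KV cache: one contiguous buffer per key/value tensor,
// sized for the maximum sequence length up front. The dimension order
// [layer][batch][head][position][headDim] is illustrative only.
struct LinearKVCacheSketch
{
    size_t numLayers, batchSize, numHeads, maxSeqLen, headDim;
    std::vector<float> keys, values;   // the real cache lives in GPU memory

    LinearKVCacheSketch(size_t layers, size_t batch, size_t heads, size_t seqLen, size_t dim)
        : numLayers(layers), batchSize(batch), numHeads(heads), maxSeqLen(seqLen), headDim(dim),
          keys(layers * batch * heads * seqLen * dim),
          values(layers * batch * heads * seqLen * dim)
    {
    }

    // Flat offset of the headDim-sized vector stored at (layer, batch, head, position).
    size_t offset(size_t layer, size_t batch, size_t head, size_t pos) const
    {
        return (((layer * batchSize + batch) * numHeads + head) * maxSeqLen + pos) * headDim;
    }

    float* keyAt(size_t l, size_t b, size_t h, size_t p) { return keys.data() + offset(l, b, h, p); }
    float* valueAt(size_t l, size_t b, size_t h, size_t p) { return values.data() + offset(l, b, h, p); }
};
```

Because the buffer is contiguous and sized up front, appending a token is a write at the next position; variable sequence lengths are handled by tracking each sequence's current length rather than by reallocating.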
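
For the TRT Engine row, deserializing a prebuilt engine from disk follows the standard TensorRT C++ pattern shown below. The sketch uses only public TensorRT API calls (`createInferRuntime`, `deserializeCudaEngine`, `createExecutionContext`) and assumes TensorRT 8 or newer, where objects are destroyed with `delete`; the logger class and engine path are placeholders, and the runtime performs this step inside the LLM Engine Runner.

```cpp
#include <NvInfer.h>

#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>

// Minimal ILogger implementation required by the TensorRT runtime object.
class ConsoleLogger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        if (severity <= Severity::kWARNING)
            std::cerr << msg << std::endl;
    }
};

// Deserialize a serialized engine file and create an execution context.
// The IRuntime must outlive the engine, and the engine must outlive any
// execution contexts created from it.
bool loadEngine(const std::string& enginePath)
{
    static ConsoleLogger logger;

    std::ifstream file(enginePath, std::ios::binary);
    if (!file)
        return false;
    std::vector<char> blob((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());

    std::unique_ptr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger)};
    std::unique_ptr<nvinfer1::ICudaEngine> engine{
        runtime->deserializeCudaEngine(blob.data(), blob.size())};
    if (!engine)
        return false;

    std::unique_ptr<nvinfer1::IExecutionContext> context{engine->createExecutionContext()};
    return context != nullptr;
}
```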
## Inference Workflow
The LLM Inference Runtime implements a dual-phase processing architecture optimized for autoregressive language model generation.

```mermaid
%%{init: {'theme':'neutral', 'themeVariables': {'primaryColor':'#76B900','primaryTextColor':'#fff','primaryBorderColor':'#5a8f00','lineColor':'#666','edgeLabelBackground':'#ffffff','labelTextColor':'#000','clusterBkg':'#ffffff','clusterBorder':'#999'}}}%%
graph LR
INPUT_PROMPT(Input<br/>Prompt) --> TOKENIZER(Tokenize)
subgraph VIT_BOX ["Optional"]
VIT_PROCESS(ViT<br>Processing)
end
TOKENIZER --> VIT_PROCESS
VIT_PROCESS --> PREFILL_ENGINE(Prefill<br/>**TRT Engine**)
PREFILL_ENGINE --> GENERATE_KV[Generate<br/>KV-Cache]
GENERATE_KV --> SAMPLE_FIRST(Sample First<br/>Token)
SAMPLE_FIRST --> GENERATION_ENGINE[Generation<br/>**TRT Engine** or<br>**CUDA Graph**]
GENERATION_ENGINE --> UPDATE_KV(Update<br>KV Cache)
UPDATE_KV --> SAMPLE_TOKEN(Sample Next<br/>Token)
SAMPLE_TOKEN --> STOP_CHECK{Stop<br/>Condition?}
STOP_CHECK -->|N| GENERATION_ENGINE
STOP_CHECK -->|Y| OUTPUT_SEQUENCE(Generated<br/>Sequence)
subgraph PHASE1 ["Phase 1: Prefill"]
PREFILL_ENGINE
GENERATE_KV
SAMPLE_FIRST
end
subgraph PHASE2 ["Phase 2: Generation"]
GENERATION_ENGINE
UPDATE_KV
SAMPLE_TOKEN
STOP_CHECK
OUTPUT_SEQUENCE
end
classDef greyNode fill:#f5f5f5,stroke:#999,stroke-width:1px,color:#333
classDef darkNode fill:#ffffff,stroke:#999,stroke-width:1px,color:#333
classDef nvNode fill:#76B900,stroke:#5a8f00,stroke-width:1px,color:#fff
classDef nvLightNode fill:#b8d67e,stroke:#76B900,stroke-width:1px,color:#333
classDef inputNode fill:#f5f5f5,stroke:#999,stroke-width:1px,color:#333
classDef itemNode fill:#ffffff,stroke:#999,stroke-width:1px,color:#333
classDef lightSubGraph fill:none,stroke:#aaa,stroke-width:1.5px
classDef optionalBox fill:none,stroke:#aaa,stroke-width:1px,stroke-dasharray:5 5
class INPUT_PROMPT inputNode
class TOKENIZER,SAMPLE_FIRST,SAMPLE_TOKEN,VIT_PROCESS,STOP_CHECK,PREFILL_ENGINE,GENERATION_ENGINE greyNode
class GENERATE_KV,UPDATE_KV nvLightNode
class OUTPUT_SEQUENCE darkNode
class PHASE1,PHASE2 lightSubGraph
class VIT_BOX optionalBox
```
### Inference Phases
**Phase 1: Prefill Processing**

The prefill phase processes the entire input prompt in parallel to establish the initial inference state:

1. **Input Processing**: Text is tokenized and padded to batch requirements (see the padding sketch after this list).
2. **Multimodal Integration**: For VLMs, image inputs are processed through the ViT components and the resulting vision embeddings are integrated with the text embeddings.
3. **Parallel Execution**: All prompt tokens are processed simultaneously through the transformer layers.
4. **KV-Cache Generation**: The key-value cache is populated for all prompt tokens.
5. **First Token Sampling**: The first generated token is sampled from the output logits.
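
As a small illustration of the input-processing step, the helper below right-pads a batch of token ID sequences to a common length and keeps the original lengths. It is a host-side sketch only; the pad token ID and the exact padding policy are model-specific, and the runtime performs this internally after tokenization.

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Right-pad every sequence in a batch to the length of the longest one and
// keep the original lengths so later steps can ignore the pad positions.
// padId is a placeholder for whatever pad token the model's vocabulary uses.
std::pair<std::vector<std::vector<int32_t>>, std::vector<int32_t>>
padBatch(std::vector<std::vector<int32_t>> batch, int32_t padId)
{
    std::vector<int32_t> lengths;
    size_t maxLen = 0;
    for (const auto& seq : batch)
    {
        lengths.push_back(static_cast<int32_t>(seq.size()));
        maxLen = std::max(maxLen, seq.size());
    }
    for (auto& seq : batch)
        seq.resize(maxLen, padId);   // pad with padId up to the common length
    return {std::move(batch), std::move(lengths)};
}
```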

**Phase 2: Generation (Autoregressive Decode)**

The generation phase operates autoregressively, processing one token at a time:

1. **Sequential Processing**: Each iteration processes the previously generated token.
2. **KV-Cache Reuse**: Each step reuses the key-value cache accumulated in previous steps.
3. **CUDA Graph Optimization**: Optional CUDA graph capture reduces kernel launch overhead by 10-30% (see the capture/replay sketch after this list).
4. **Sampling Strategies**: Configurable token selection (greedy, top-k, top-p, temperature; see the CPU reference after this list).
5. **Stopping Criteria**: Generation continues until an EOS token, the maximum length, or a custom stop condition is reached.
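
The sampling strategies listed above can be illustrated with a CPU reference of the per-sequence math: a temperature-scaled softmax restricted to the top-k logits, truncated to the top-p nucleus, and a draw from the resulting distribution. The function below is a conceptual sketch (its name and signature are not part of the runtime API); the actual sampling kernels perform the equivalent computation on the GPU in batch.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// CPU reference for one sampling step. logits holds one score per vocabulary
// entry; temperature, topK, and topP follow the request fields of the same name.
int32_t sampleNextToken(const std::vector<float>& logits, float temperature,
                        int topK, float topP, std::mt19937& rng)
{
    // Order token IDs by logit, highest first.
    std::vector<int32_t> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(),
              [&](int32_t a, int32_t b) { return logits[a] > logits[b]; });

    // Greedy decoding: temperature 0 (or top-k of 1) simply takes the argmax.
    if (temperature <= 0.f || topK == 1)
        return idx[0];

    // Top-k: keep at most topK candidates, then softmax their scaled logits.
    size_t k = (topK > 0) ? std::min<size_t>(topK, idx.size()) : idx.size();
    std::vector<float> probs(k);
    float maxLogit = logits[idx[0]];
    float sum = 0.f;
    for (size_t i = 0; i < k; ++i)
    {
        probs[i] = std::exp((logits[idx[i]] - maxLogit) / temperature);
        sum += probs[i];
    }
    for (float& p : probs)
        p /= sum;

    // Top-p (nucleus): truncate to the smallest prefix whose cumulative
    // probability reaches topP.
    size_t cutoff = k;
    float cumulative = 0.f;
    for (size_t i = 0; i < k; ++i)
    {
        cumulative += probs[i];
        if (cumulative >= topP)
        {
            cutoff = i + 1;
            break;
        }
    }

    // Draw from the truncated distribution (discrete_distribution renormalizes).
    std::discrete_distribution<size_t> dist(probs.begin(), probs.begin() + cutoff);
    return idx[dist(rng)];
}
```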
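
The CUDA graph optimization follows the standard capture-and-replay pattern of the CUDA runtime API: the kernel launches of one decode step are recorded once and then replayed with a single launch per generated token. The sketch below substitutes a dummy kernel for the engine's real per-step work and assumes the CUDA 12 `cudaGraphInstantiate` signature; error checking is omitted for brevity.

```cpp
#include <cuda_runtime.h>

// Stand-in for the work one decode step enqueues on the stream (engine
// execution plus KV cache update in the real runtime).
__global__ void decodeStepKernel(float* logits)
{
    if (logits != nullptr)
        logits[0] += 1.f;
}

// Capture one decode step into a CUDA graph and replay it for every token.
// The stream must be a non-default stream (stream capture cannot use the
// legacy default stream).
void decodeWithCudaGraph(float* dLogits, cudaStream_t stream, int numSteps)
{
    // Record the step's kernel launches without executing them.
    cudaGraph_t graph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    decodeStepKernel<<<1, 1, 0, stream>>>(dLogits);
    cudaStreamEndCapture(stream, &graph);

    // Instantiate once (CUDA 12 flags-only signature; older toolkits differ).
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, 0);

    // Replay the whole step with a single launch per token; this is where the
    // quoted 10-30% reduction in kernel launch overhead comes from.
    for (int step = 0; step < numSteps; ++step)
        cudaGraphLaunch(graphExec, stream);

    cudaStreamSynchronize(stream);
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
}
```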
## Usage Examples

### Standard LLM Inference
```cpp
#include "runtime/llmInferenceRuntime.h"

#include <cuda_runtime.h>

#include <iostream>
#include <string>
#include <unordered_map>

// Initialize CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Directory containing the built TensorRT engine (placeholder path)
std::string engineDir = "/path/to/engine_dir";

// Initialize runtime (4 parameters: engineDir, multimodalEngineDir, loraWeightsMap, stream)
std::unordered_map<std::string, std::string> loraWeightsMap{}; // Empty for no LoRA
LLMInferenceRuntime runtime(engineDir, "", loraWeightsMap, stream);

// Prepare request
LLMGenerationRequest request;
request.requests.resize(1);
request.requests[0].messages.push_back({{"role", "user"}, {"content", "What is the capital of France?"}});
request.maxGenerateLength = 100;
request.temperature = 1.0;
request.topK = 50;
request.topP = 0.8;

// Prepare response
LLMGenerationResponse response;

// Execute inference (3 parameters: request, response, stream)
if (runtime.handleRequest(request, response, stream)) {
    std::cout << "Generated: " << response.outputTexts[0] << std::endl;
}

// Cleanup
CUDA_CHECK(cudaStreamDestroy(stream));
```
### LoRA Adapter Switching
```cpp
#include "runtime/llmInferenceRuntime.h"

#include <cuda_runtime.h>

#include <string>
#include <unordered_map>

// Initialize CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Directory containing the built TensorRT engine (placeholder path)
std::string engineDir = "/path/to/engine_dir";

// Initialize runtime with LoRA weights map
std::unordered_map<std::string, std::string> loraWeightsMap{
    {"medical", "lora_weights/medical_adapter.safetensors"},
    {"legal", "lora_weights/legal_adapter.safetensors"}
};
LLMInferenceRuntime runtime(engineDir, "", loraWeightsMap, stream);

// Prepare requests
LLMGenerationRequest medicalRequest;
medicalRequest.requests.resize(1);
medicalRequest.requests[0].messages.push_back({{"role", "user"}, {"content", "Medical question"}});
medicalRequest.loraWeightsName = "medical";
medicalRequest.maxGenerateLength = 100;

LLMGenerationRequest legalRequest;
legalRequest.requests.resize(1);
legalRequest.requests[0].messages.push_back({{"role", "user"}, {"content", "Legal question"}});
legalRequest.loraWeightsName = "legal";
legalRequest.maxGenerateLength = 100;

// Execute inference with different LoRA adapters
LLMGenerationResponse medicalResponse;
runtime.handleRequest(medicalRequest, medicalResponse, stream);

LLMGenerationResponse legalResponse;
runtime.handleRequest(legalRequest, legalResponse, stream);

// Disable LoRA (use an empty string)
LLMGenerationRequest baseRequest;
baseRequest.requests.resize(1);
baseRequest.requests[0].messages.push_back({{"role", "user"}, {"content", "Base question"}});
baseRequest.loraWeightsName = "";
baseRequest.maxGenerateLength = 100;

LLMGenerationResponse baseResponse;
runtime.handleRequest(baseRequest, baseResponse, stream);

// Cleanup
CUDA_CHECK(cudaStreamDestroy(stream));
```
### Multimodal VLM Inference
```cpp
#include "runtime/llmInferenceRuntime.h"

#include <cuda_runtime.h>

#include <iostream>
#include <string>
#include <unordered_map>

// Initialize CUDA stream
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Directories containing the built LLM and vision engines (placeholder paths)
std::string engineDir = "/path/to/engine_dir";
std::string visualEngineDir = "/path/to/visual_engine_dir";

// Initialize multimodal runtime (4 parameters: engineDir, multimodalEngineDir, loraWeightsMap, stream)
std::unordered_map<std::string, std::string> loraWeightsMap{}; // Empty for no LoRA
LLMInferenceRuntime runtime(engineDir, visualEngineDir, loraWeightsMap, stream);

// Prepare multimodal request
LLMGenerationRequest request;
request.requests.resize(1);
request.requests[0].messages.push_back({
    {"role", "user"},
    {"content", {
        {{"type", "image"}, {"image", "/path/to/image.jpg"}},
        {{"type", "text"}, {"text", "What's in this image?"}}
    }}
});
request.maxGenerateLength = 150;
request.temperature = 1.0;
request.topK = 50;
request.topP = 0.8;

// Prepare response
LLMGenerationResponse response;

// Execute inference (3 parameters: request, response, stream)
if (runtime.handleRequest(request, response, stream)) {
    std::cout << "Generated: " << response.outputTexts[0] << std::endl;
}

// Cleanup
CUDA_CHECK(cudaStreamDestroy(stream));
```