LLM Inference Runtime
- class LLMInferenceRuntime
LLM Inference Runtime for handling generation requests.
Public Functions
- LLMInferenceRuntime(
- std::string const &engineDir,
- std::string const &multimodalEngineDir,
- std::unordered_map<std::string, std::string> const &loraWeightsMap,
- cudaStream_t stream)
Construct an LLM Inference Runtime.
- Parameters:
engineDir – Directory containing the LLM engine
multimodalEngineDir – Directory containing the multimodal engine
loraWeightsMap – Map of LoRA weights names to their paths
stream – CUDA stream for initialization
- ~LLMInferenceRuntime() = default
Destructor.
- bool handleRequest(
- LLMGenerationRequest const &request,
- LLMGenerationResponse &response,
- cudaStream_t stream)
Handle an LLM generation request.
- Parameters:
request – The generation request containing prompt and generation parameters
response – The generation response to be filled with output
stream – CUDA stream for execution
- Returns:
True if request was handled successfully, false otherwise
- bool captureDecodingCUDAGraph(cudaStream_t stream)
Capture CUDA graph for the decoding step to optimize performance.
- Parameters:
stream – CUDA stream for graph capture
- Returns:
True if graph was captured successfully, false otherwise
- bool genAndSaveSystemPromptKVCache(
- std::string const &prompt,
- std::string const &loraWeightsName,
- cudaStream_t stream)
Run the prefill step to generate the KV cache for the given prompt and save it for later use.
- Parameters:
prompt – The system prompt to generate the KVCache
loraWeightsName – The name of the LoRA weights
stream – The CUDA stream used for the generation
- Returns:
True if the KVCache is generated and saved successfully, false otherwise
- inline metrics::LLMPrefillMetrics const &getPrefillMetrics() const
Get LLM prefill stage metrics.
- Returns:
Reference to prefill metrics
- inline metrics::LLMGenerationMetrics const &getGenerationMetrics() const
Get LLM generation stage metrics.
- Returns:
Reference to generation metrics
- inline metrics::MultimodalMetrics getMultimodalMetrics() const
Get multimodal metrics (returns empty metrics if no multimodal runner is available).
- Returns:
Multimodal metrics, or empty metrics if no multimodal runner is available
- struct SystemPromptKVCache
Structure to hold cached system prompt and its KV cache.